]> git.proxmox.com Git - ceph.git/blob - ceph/src/include/frag.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / include / frag.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_FRAG_H
16 #define CEPH_FRAG_H
17
18 #include <stdint.h>
19 #include <list>
20 #include <iostream>
21 #include <stdio.h>
22
23 #include "buffer.h"
24 #include "compact_map.h"
25
26 #include "ceph_frag.h"
27 #include "include/assert.h"
28
29 /*
30 *
31 * the goal here is to use a binary split strategy to partition a namespace.
32 * frag_t represents a particular fragment. bits() tells you the size of the
33 * fragment, and value() it's name. this is roughly analogous to an ip address
34 * and netmask.
35 *
36 * fragtree_t represents an entire namespace and it's partition. it essentially
37 * tells you where fragments are split into other fragments, and by how much
38 * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
39 *
40 * this vaguely resembles a btree, in that when a fragment becomes large or small
41 * we can split or merge, except that there is no guarantee of being balanced.
42 *
43 * presumably we are partitioning the output of a (perhaps specialized) hash
44 * function.
45 */
46
47 /**
48 * frag_t
49 *
50 * description of an individual fragment. that is, a particular piece
51 * of the overall namespace.
52 *
53 * this is conceptually analogous to an ip address and netmask.
54 *
55 * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
56 *
57 * we write it as v/b, where v is a value and b is the number of bits.
58 * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
59 * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
60 *
61 * this makes the right most bit of v the "most significant", which is the
62 * opposite of what we usually see.
63 */
64
65 /*
66 * TODO:
67 * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
68 * iteration efficient (see, e.g., try_assimilate_children()
69 * - rework frag_t so that we mask the left-most (most significant) bits instead of
70 * the right-most (least significant) bits. just because it's more intutive, and
71 * matches the network/netmask concept.
72 */
73
74 typedef uint32_t _frag_t;
75
76 class frag_t {
77 /*
78 * encoding is dictated by frag_* functions in ceph_fs.h. use those
79 * helpers _exclusively_.
80 */
81 public:
82 _frag_t _enc;
83
84 frag_t() : _enc(0) { }
85 frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
86 frag_t(_frag_t e) : _enc(e) { }
87
88 // constructors
89 void from_unsigned(unsigned e) { _enc = e; }
90
91 // accessors
92 unsigned value() const { return ceph_frag_value(_enc); }
93 unsigned bits() const { return ceph_frag_bits(_enc); }
94 unsigned mask() const { return ceph_frag_mask(_enc); }
95 unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
96
97 operator _frag_t() const { return _enc; }
98
99 // tests
100 bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
101 bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
102 bool is_root() const { return bits() == 0; }
103 frag_t parent() const {
104 assert(bits() > 0);
105 return frag_t(ceph_frag_parent(_enc));
106 }
107
108 // splitting
109 frag_t make_child(int i, int nb) const {
110 assert(i < (1<<nb));
111 return frag_t(ceph_frag_make_child(_enc, nb, i));
112 }
113 void split(int nb, std::list<frag_t>& fragments) const {
114 assert(nb > 0);
115 unsigned nway = 1 << nb;
116 for (unsigned i=0; i<nway; i++)
117 fragments.push_back(make_child(i, nb));
118 }
119
120 // binary splitting
121 frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
122 frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
123
124 bool is_left() const { return ceph_frag_is_left_child(_enc); }
125 bool is_right() const { return ceph_frag_is_right_child(_enc); }
126 frag_t get_sibling() const {
127 assert(!is_root());
128 return frag_t(ceph_frag_sibling(_enc));
129 }
130
131 // sequencing
132 bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
133 bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
134 frag_t next() const {
135 assert(!is_rightmost());
136 return frag_t(ceph_frag_next(_enc));
137 }
138
139 // parse
140 bool parse(const char *s) {
141 int pvalue, pbits;
142 int r = sscanf(s, "%x/%d", &pvalue, &pbits);
143 if (r == 2) {
144 *this = frag_t(pvalue, pbits);
145 return true;
146 }
147 return false;
148 }
149 };
150
151 inline std::ostream& operator<<(std::ostream& out, frag_t hb)
152 {
153 //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
154 unsigned num = hb.bits();
155 if (num) {
156 unsigned val = hb.value();
157 for (unsigned bit = 23; num; num--, bit--)
158 out << ((val & (1<<bit)) ? '1':'0');
159 }
160 return out << '*';
161 }
162
163 inline void encode(frag_t f, bufferlist& bl) { encode_raw(f._enc, bl); }
164 inline void decode(frag_t &f, bufferlist::iterator& p) {
165 __u32 v;
166 decode_raw(v, p);
167 f._enc = v;
168 }
169
170
171
172 /**
173 * fragtree_t -- partition an entire namespace into one or more frag_t's.
174 */
175 class fragtree_t {
176 // pairs <f, b>:
177 // frag_t f is split by b bits.
178 // if child frag_t does not appear, it is not split.
179 public:
180 compact_map<frag_t,int32_t> _splits;
181
182 public:
183 // -------------
184 // basics
185 void swap(fragtree_t& other) {
186 _splits.swap(other._splits);
187 }
188 void clear() {
189 _splits.clear();
190 }
191
192 // -------------
193 // accessors
194 bool empty() const {
195 return _splits.empty();
196 }
197 int get_split(const frag_t hb) const {
198 compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
199 if (p == _splits.end())
200 return 0;
201 else
202 return p->second;
203 }
204
205
206 bool is_leaf(frag_t x) const {
207 std::list<frag_t> ls;
208 get_leaves_under(x, ls);
209 //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
210 if (!ls.empty() &&
211 ls.front() == x &&
212 ls.size() == 1)
213 return true;
214 return false;
215 }
216
217 /**
218 * get_leaves -- list all leaves
219 */
220 void get_leaves(std::list<frag_t>& ls) const {
221 return get_leaves_under_split(frag_t(), ls);
222 }
223
224 /**
225 * get_leaves_under_split -- list all leaves under a known split point (or root)
226 */
227 void get_leaves_under_split(frag_t under, std::list<frag_t>& ls) const {
228 std::list<frag_t> q;
229 q.push_back(under);
230 while (!q.empty()) {
231 frag_t t = q.back();
232 q.pop_back();
233 int nb = get_split(t);
234 if (nb)
235 t.split(nb, q); // queue up children
236 else
237 ls.push_front(t); // not spit, it's a leaf.
238 }
239 }
240
241 /**
242 * get_branch -- get branch point at OR above frag @a x
243 * - may be @a x itself, if @a x is a split
244 * - may be root (frag_t())
245 */
246 frag_t get_branch(frag_t x) const {
247 while (1) {
248 if (x == frag_t()) return x; // root
249 if (get_split(x)) return x; // found it!
250 x = x.parent();
251 }
252 }
253
254 /**
255 * get_branch_above -- get a branch point above frag @a x
256 * - may be root (frag_t())
257 * - may NOT be @a x, even if @a x is a split.
258 */
259 frag_t get_branch_above(frag_t x) const {
260 while (1) {
261 if (x == frag_t()) return x; // root
262 x = x.parent();
263 if (get_split(x)) return x; // found it!
264 }
265 }
266
267
268 /**
269 * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
270 * - may be @a x itself, if @a x is a split or leaf
271 * - may be root (frag_t())
272 */
273 frag_t get_branch_or_leaf(frag_t x) const {
274 frag_t branch = get_branch(x);
275 int nb = get_split(branch);
276 if (nb > 0 && // if branch is a split, and
277 branch.bits() + nb <= x.bits()) // one of the children is or contains x
278 return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf)
279 else
280 return branch;
281 }
282
283 /**
284 * get_leaves_under(x, ls) -- search for any leaves fully contained by x
285 */
286 void get_leaves_under(frag_t x, std::list<frag_t>& ls) const {
287 std::list<frag_t> q;
288 q.push_back(get_branch_or_leaf(x));
289 while (!q.empty()) {
290 frag_t t = q.front();
291 q.pop_front();
292 if (t.bits() >= x.bits() && // if t is more specific than x, and
293 !x.contains(t)) // x does not contain t,
294 continue; // then skip
295 int nb = get_split(t);
296 if (nb)
297 t.split(nb, q); // queue up children
298 else if (x.contains(t))
299 ls.push_back(t); // not spit, it's a leaf.
300 }
301 }
302
303 /**
304 * contains(fg) -- does fragtree contain the specific frag @a x
305 */
306 bool contains(frag_t x) const {
307 std::list<frag_t> q;
308 q.push_back(get_branch(x));
309 while (!q.empty()) {
310 frag_t t = q.front();
311 q.pop_front();
312 if (t.bits() >= x.bits() && // if t is more specific than x, and
313 !x.contains(t)) // x does not contain t,
314 continue; // then skip
315 int nb = get_split(t);
316 if (nb) {
317 if (t == x) return false; // it's split.
318 t.split(nb, q); // queue up children
319 } else {
320 if (t == x) return true; // it's there.
321 }
322 }
323 return false;
324 }
325
326 /**
327 * operator[] -- map a (hash?) value to a frag
328 */
329 frag_t operator[](unsigned v) const {
330 frag_t t;
331 while (1) {
332 assert(t.contains(v));
333 int nb = get_split(t);
334
335 // is this a leaf?
336 if (nb == 0) return t; // done.
337
338 // pick appropriate child fragment.
339 unsigned nway = 1 << nb;
340 unsigned i;
341 for (i=0; i<nway; i++) {
342 frag_t n = t.make_child(i, nb);
343 if (n.contains(v)) {
344 t = n;
345 break;
346 }
347 }
348 assert(i < nway);
349 }
350 }
351
352
353 // ---------------
354 // modifiers
355 void split(frag_t x, int b, bool simplify=true) {
356 assert(is_leaf(x));
357 _splits[x] = b;
358
359 if (simplify)
360 try_assimilate_children(get_branch_above(x));
361 }
362 void merge(frag_t x, int b, bool simplify=true) {
363 assert(!is_leaf(x));
364 assert(_splits[x] == b);
365 _splits.erase(x);
366
367 if (simplify)
368 try_assimilate_children(get_branch_above(x));
369 }
370
371 /*
372 * if all of a given split's children are identically split,
373 * then the children can be assimilated.
374 */
375 void try_assimilate_children(frag_t x) {
376 int nb = get_split(x);
377 if (!nb) return;
378 std::list<frag_t> children;
379 x.split(nb, children);
380 int childbits = 0;
381 for (std::list<frag_t>::iterator p = children.begin();
382 p != children.end();
383 ++p) {
384 int cb = get_split(*p);
385 if (!cb) return; // nope.
386 if (childbits && cb != childbits) return; // not the same
387 childbits = cb;
388 }
389 // all children are split with childbits!
390 for (std::list<frag_t>::iterator p = children.begin();
391 p != children.end();
392 ++p)
393 _splits.erase(*p);
394 _splits[x] += childbits;
395 }
396
397 bool force_to_leaf(CephContext *cct, frag_t x) {
398 if (is_leaf(x))
399 return false;
400
401 lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
402
403 frag_t parent = get_branch_or_leaf(x);
404 assert(parent.bits() <= x.bits());
405 lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
406
407 // do we need to split from parent to x?
408 if (parent.bits() < x.bits()) {
409 int spread = x.bits() - parent.bits();
410 int nb = get_split(parent);
411 lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
412 if (nb == 0) {
413 // easy: split parent (a leaf) by the difference
414 lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
415 split(parent, spread);
416 assert(is_leaf(x));
417 return true;
418 }
419 assert(nb > spread);
420
421 // add an intermediary split
422 merge(parent, nb, false);
423 split(parent, spread, false);
424
425 std::list<frag_t> subs;
426 parent.split(spread, subs);
427 for (std::list<frag_t>::iterator p = subs.begin();
428 p != subs.end();
429 ++p) {
430 lgeneric_dout(cct, 10) << "splitting intermediate " << *p << " by " << (nb-spread) << dendl;
431 split(*p, nb - spread, false);
432 }
433 }
434
435 // x is now a leaf or split.
436 // hoover up any children.
437 std::list<frag_t> q;
438 q.push_back(x);
439 while (!q.empty()) {
440 frag_t t = q.front();
441 q.pop_front();
442 int nb = get_split(t);
443 if (nb) {
444 lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
445 merge(t, nb, false); // merge this point, and
446 t.split(nb, q); // queue up children
447 }
448 }
449
450 lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
451 assert(is_leaf(x));
452 return true;
453 }
454
455 // encoding
456 void encode(bufferlist& bl) const {
457 ::encode(_splits, bl);
458 }
459 void decode(bufferlist::iterator& p) {
460 ::decode(_splits, p);
461 }
462 void encode_nohead(bufferlist& bl) const {
463 for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
464 p != _splits.end();
465 ++p) {
466 ::encode(p->first, bl);
467 ::encode(p->second, bl);
468 }
469 }
470 void decode_nohead(int n, bufferlist::iterator& p) {
471 _splits.clear();
472 while (n-- > 0) {
473 frag_t f;
474 ::decode(f, p);
475 ::decode(_splits[f], p);
476 }
477 }
478
479 void print(std::ostream& out) {
480 out << "fragtree_t(";
481 std::list<frag_t> q;
482 q.push_back(frag_t());
483 while (!q.empty()) {
484 frag_t t = q.front();
485 q.pop_front();
486 // newline + indent?
487 if (t.bits()) {
488 out << std::endl;
489 for (unsigned i=0; i<t.bits(); i++) out << ' ';
490 }
491 int nb = get_split(t);
492 if (nb) {
493 out << t << " %" << nb;
494 t.split(nb, q); // queue up children
495 } else {
496 out << t;
497 }
498 }
499 out << ")";
500 }
501
502 void dump(Formatter *f) const {
503 f->open_array_section("splits");
504 for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
505 p != _splits.end();
506 ++p) {
507 f->open_object_section("split");
508 std::ostringstream frag_str;
509 frag_str << p->first;
510 f->dump_string("frag", frag_str.str());
511 f->dump_int("children", p->second);
512 f->close_section(); // split
513 }
514 f->close_section(); // splits
515 }
516 };
517 WRITE_CLASS_ENCODER(fragtree_t)
518
519 inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
520 return l._splits == r._splits;
521 }
522 inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
523 return l._splits != r._splits;
524 }
525
526 inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
527 {
528 out << "fragtree_t(";
529
530 for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
531 p != ft._splits.end();
532 ++p) {
533 if (p != ft._splits.begin())
534 out << " ";
535 out << p->first << "^" << p->second;
536 }
537 return out << ")";
538 }
539
540
541 /**
542 * fragset_t -- a set of fragments
543 */
544 class fragset_t {
545 std::set<frag_t> _set;
546
547 public:
548 const std::set<frag_t> &get() const { return _set; }
549 std::set<frag_t>::iterator begin() { return _set.begin(); }
550 std::set<frag_t>::iterator end() { return _set.end(); }
551
552 bool empty() const { return _set.empty(); }
553
554 bool contains(frag_t f) const {
555 while (1) {
556 if (_set.count(f)) return true;
557 if (f.bits() == 0) return false;
558 f = f.parent();
559 }
560 }
561
562 void insert(frag_t f) {
563 _set.insert(f);
564 simplify();
565 }
566
567 void simplify() {
568 while (1) {
569 bool clean = true;
570 std::set<frag_t>::iterator p = _set.begin();
571 while (p != _set.end()) {
572 if (!p->is_root() &&
573 _set.count(p->get_sibling())) {
574 _set.erase(p->get_sibling());
575 _set.insert(p->parent());
576 _set.erase(p++);
577 clean = false;
578 } else {
579 p++;
580 }
581 }
582 if (clean)
583 break;
584 }
585 }
586 };
587
588 inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
589 {
590 return out << "fragset_t(" << fs.get() << ")";
591 }
592
593 #endif