// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#ifndef CEPH_OSDMAP_H
#define CEPH_OSDMAP_H

/*
 * describe properties of the OSD cluster.
 *   disks, disk groups, total # osds,
 *
 */
#include "include/types.h"
#include "osd_types.h"

//#include "include/ceph_features.h"
#include "crush/CrushWrapper.h"
#include <vector>
#include <list>
#include <set>
#include <map>
#include "include/memory.h"
using namespace std;

// forward declaration
class CephContext;
class CrushWrapper;

// FIXME C++11 does not have std::equal for two differently-typed containers.
// use this until we move to c++14
template<typename A, typename B>
bool vectors_equal(A a, B b)
{
  return
    a.size() == b.size() &&
    (a.empty() ||
     memcmp((char*)&a[0], (char*)&b[0], sizeof(a[0]) * a.size()) == 0);
}
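// A minimal usage sketch for vectors_equal (illustrative only, not part of
// this header): it compares sizes and then memcmp()s the raw elements, so it
// is only meaningful for trivially-copyable element types.
//
//   std::vector<int32_t> a = {1, 2, 3};
//   mempool::osdmap::vector<int32_t> b = {1, 2, 3};
//   bool same = vectors_equal(a, b);   // true: same length, same bytes
//   b.push_back(4);
//   same = vectors_equal(a, b);        // false: sizes differ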


/*
 * we track up to two intervals during which the osd was alive and
 * healthy. the most recent is [up_from,up_thru), where up_thru is
 * the last epoch the osd is known to have _started_. i.e., a lower
 * bound on the actual osd death. down_at (if it is > up_from) is an
 * upper bound on the actual osd death.
 *
 * the second is the last_clean interval [first,last]. in that case,
 * the last interval is the last epoch known to have been either
 * _finished_, or during which the osd cleanly shut down. when
 * possible, we push this forward to the epoch the osd was eventually
 * marked down.
 *
 * the lost_at is used to allow build_prior to proceed without waiting
 * for an osd to recover. In certain cases, progress may be blocked
 * because an osd is down that may contain updates (i.e., a pg may have
 * gone rw during an interval). If the osd can't be brought online, we
 * can force things to proceed knowing that we _might_ be losing some
74 * but those writes will still be lost (the divergent objects will be
75 * thrown out).
76 */
77 struct osd_info_t {
78 epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown
79 epoch_t last_clean_end;
80 epoch_t up_from; // epoch osd marked up
81 epoch_t up_thru; // lower bound on actual osd death (if > up_from)
82 epoch_t down_at; // upper bound on actual osd death (if > up_from)
83 epoch_t lost_at; // last epoch we decided data was "lost"
84
85 osd_info_t() : last_clean_begin(0), last_clean_end(0),
86 up_from(0), up_thru(0), down_at(0), lost_at(0) {}
87
88 void dump(Formatter *f) const;
89 void encode(bufferlist& bl) const;
90 void decode(bufferlist::iterator& bl);
91 static void generate_test_instances(list<osd_info_t*>& o);
92 };
93 WRITE_CLASS_ENCODER(osd_info_t)
94
95 ostream& operator<<(ostream& out, const osd_info_t& info);
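// Illustrative round trip for osd_info_t (a sketch, not part of this header):
// WRITE_CLASS_ENCODER(osd_info_t) generates the usual bufferlist
// encode/decode glue, so the struct can be serialized like any Ceph type.
//
//   osd_info_t info;
//   info.up_from = 10;
//   info.up_thru = 12;
//   bufferlist bl;
//   ::encode(info, bl);
//   osd_info_t copy;
//   bufferlist::iterator p = bl.begin();
//   ::decode(copy, p);    // copy.up_from == 10, copy.up_thru == 12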

struct osd_xinfo_t {
  utime_t down_stamp;       ///< timestamp when we were last marked down
  float laggy_probability;  ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy
  __u32 laggy_interval;     ///< average interval between being marked laggy and recovering
  uint64_t features;        ///< features supported by this osd we should know about
  __u32 old_weight;         ///< weight prior to being auto marked out

  osd_xinfo_t() : laggy_probability(0), laggy_interval(0),
                  features(0), old_weight(0) {}

  void dump(Formatter *f) const;
  void encode(bufferlist& bl) const;
  void decode(bufferlist::iterator& bl);
  static void generate_test_instances(list<osd_xinfo_t*>& o);
};
WRITE_CLASS_ENCODER(osd_xinfo_t)

ostream& operator<<(ostream& out, const osd_xinfo_t& xi);
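// How laggy_probability is carried on the wire (a sketch; the actual encode
// logic lives in the .cc file): the float in [0,1] is scaled to the full
// __u32 range, so 0.0 encodes to 0x00000000 and 1.0 to 0xffffffff, roughly:
//
//   __u32 lp = laggy_probability * (float)0xffffffff;   // encode
//   float  p = (float)lp / (float)0xffffffff;           // decode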


/** OSDMap
 */
class OSDMap {
public:
  MEMPOOL_CLASS_HELPERS();

  class Incremental {
  public:
    MEMPOOL_CLASS_HELPERS();

    /// feature bits we were encoded with. the subsequent OSDMap
    /// encoding should match.
    uint64_t encode_features;
    uuid_d fsid;
    epoch_t epoch;   // new epoch; we are a diff from epoch-1 to epoch
    utime_t modified;
    int64_t new_pool_max; //incremented by the OSDMonitor on each pool create
    int32_t new_flags;

    // full (rare)
    bufferlist fullmap;  // in lieu of below.
    bufferlist crush;

    // incremental
    int32_t new_max_osd;
    mempool::osdmap::map<int64_t,pg_pool_t> new_pools;
    mempool::osdmap::map<int64_t,string> new_pool_names;
    mempool::osdmap::set<int64_t> old_pools;
    mempool::osdmap::map<string,map<string,string> > new_erasure_code_profiles;
    mempool::osdmap::vector<string> old_erasure_code_profiles;
    mempool::osdmap::map<int32_t,entity_addr_t> new_up_client;
    mempool::osdmap::map<int32_t,entity_addr_t> new_up_cluster;
    mempool::osdmap::map<int32_t,uint8_t> new_state;  // XORed onto previous state.
    mempool::osdmap::map<int32_t,uint32_t> new_weight;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > new_pg_temp;  // [] to remove
    mempool::osdmap::map<pg_t, int32_t> new_primary_temp;  // [-1] to remove
    mempool::osdmap::map<int32_t,uint32_t> new_primary_affinity;
    mempool::osdmap::map<int32_t,epoch_t> new_up_thru;
    mempool::osdmap::map<int32_t,pair<epoch_t,epoch_t> > new_last_clean_interval;
    mempool::osdmap::map<int32_t,epoch_t> new_lost;
    mempool::osdmap::map<int32_t,uuid_d> new_uuid;
    mempool::osdmap::map<int32_t,osd_xinfo_t> new_xinfo;

    mempool::osdmap::map<entity_addr_t,utime_t> new_blacklist;
    mempool::osdmap::vector<entity_addr_t> old_blacklist;
    mempool::osdmap::map<int32_t, entity_addr_t> new_hb_back_up;
    mempool::osdmap::map<int32_t, entity_addr_t> new_hb_front_up;

    mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> new_pg_upmap;
    mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> new_pg_upmap_items;
    mempool::osdmap::set<pg_t> old_pg_upmap, old_pg_upmap_items;

    string cluster_snapshot;

    float new_nearfull_ratio = -1;
    float new_backfillfull_ratio = -1;
    float new_full_ratio = -1;

    string new_require_min_compat_client;

    mutable bool have_crc;     ///< crc values are defined
    uint32_t full_crc;         ///< crc of the resulting OSDMap
    mutable uint32_t inc_crc;  ///< crc of this incremental

    int get_net_marked_out(const OSDMap *previous) const;
    int get_net_marked_down(const OSDMap *previous) const;
    int identify_osd(uuid_d u) const;

    void encode_client_old(bufferlist& bl) const;
    void encode_classic(bufferlist& bl, uint64_t features) const;
    void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
    void decode_classic(bufferlist::iterator &p);
    void decode(bufferlist::iterator &bl);
    void dump(Formatter *f) const;
    static void generate_test_instances(list<Incremental*>& o);

    explicit Incremental(epoch_t e=0) :
      encode_features(0),
      epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1),
      have_crc(false), full_crc(0), inc_crc(0) {
      memset(&fsid, 0, sizeof(fsid));
    }
    explicit Incremental(bufferlist &bl) {
      bufferlist::iterator p = bl.begin();
      decode(p);
    }
    explicit Incremental(bufferlist::iterator &p) {
      decode(p);
    }

    pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) {
      if (new_pools.count(pool) == 0)
        new_pools[pool] = *orig;
      return &new_pools[pool];
    }
    bool has_erasure_code_profile(const string &name) const {
      auto i = new_erasure_code_profiles.find(name);
      return i != new_erasure_code_profiles.end();
    }
    void set_erasure_code_profile(const string &name,
                                  const map<string,string>& profile) {
      new_erasure_code_profiles[name] = profile;
    }

    /// propagate updated pools' snap metadata to any of their tiers
    int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base);
  };
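  // Typical Incremental usage (illustrative sketch only): stage changes in an
  // Incremental for epoch N+1 and apply it with OSDMap::apply_incremental(),
  // declared further below.
  //
  //   OSDMap::Incremental inc(osdmap.get_epoch() + 1);
  //   inc.fsid = osdmap.get_fsid();
  //   inc.new_weight[3] = CEPH_OSD_OUT;   // mark osd.3 out
  //   inc.new_state[5] = CEPH_OSD_UP;     // XORed: flips osd.5's up bit
  //   osdmap.apply_incremental(inc);      // osdmap is now at epoch N+1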

private:
  uuid_d fsid;
  epoch_t epoch;         // what epoch of the osd cluster descriptor is this
  utime_t created, modified;  // epoch start time
  int32_t pool_max;      // the largest pool num, ever

  uint32_t flags;

  int num_osd;      // not saved; see calc_num_osds
  int num_up_osd;   // not saved; see calc_num_osds
  int num_in_osd;   // not saved; see calc_num_osds

  int32_t max_osd;
  vector<uint8_t> osd_state;

  struct addrs_s {
    mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > client_addr;
    mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > cluster_addr;
    mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > hb_back_addr;
    mempool::osdmap::vector<ceph::shared_ptr<entity_addr_t> > hb_front_addr;
    entity_addr_t blank;
  };
  ceph::shared_ptr<addrs_s> osd_addrs;

  mempool::osdmap::vector<__u32> osd_weight;  // 16.16 fixed point, 0x10000 = "in", 0 = "out"
  mempool::osdmap::vector<osd_info_t> osd_info;
  ceph::shared_ptr< mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> > > pg_temp;  // temp pg mapping (e.g. while we rebuild)
  ceph::shared_ptr< mempool::osdmap::map<pg_t,int32_t > > primary_temp;  // temp primary mapping (e.g. while we rebuild)
  ceph::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline

  // remap (post-CRUSH, pre-up)
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>> pg_upmap; ///< remap pg
  mempool::osdmap::map<pg_t,mempool::osdmap::vector<pair<int32_t,int32_t>>> pg_upmap_items; ///< remap osds in up set

  mempool::osdmap::map<int64_t,pg_pool_t> pools;
  mempool::osdmap::map<int64_t,string> pool_name;
  mempool::osdmap::map<string,map<string,string> > erasure_code_profiles;
  mempool::osdmap::map<string,int64_t> name_pool;

  ceph::shared_ptr< mempool::osdmap::vector<uuid_d> > osd_uuid;
  mempool::osdmap::vector<osd_xinfo_t> osd_xinfo;

  mempool::osdmap::unordered_map<entity_addr_t,utime_t> blacklist;

  epoch_t cluster_snapshot_epoch;
  string cluster_snapshot;
  bool new_blacklist_entries;

  float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0;

  /// min compat client we want to support
  string require_min_compat_client;

  mutable uint64_t cached_up_osd_features;

  mutable bool crc_defined;
  mutable uint32_t crc;

  void _calc_up_osd_features();

public:
  bool have_crc() const { return crc_defined; }
  uint32_t get_crc() const { return crc; }

  ceph::shared_ptr<CrushWrapper> crush;  // hierarchical map

  friend class OSDMonitor;

public:
  OSDMap() : epoch(0),
             pool_max(-1),
             flags(0),
             num_osd(0), num_up_osd(0), num_in_osd(0),
             max_osd(0),
             osd_addrs(std::make_shared<addrs_s>()),
             pg_temp(std::make_shared<mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t>>>()),
             primary_temp(std::make_shared<mempool::osdmap::map<pg_t,int32_t>>()),
             osd_uuid(std::make_shared<mempool::osdmap::vector<uuid_d>>()),
             cluster_snapshot_epoch(0),
             new_blacklist_entries(false),
             cached_up_osd_features(0),
             crc_defined(false), crc(0),
             crush(std::make_shared<CrushWrapper>()) {
    memset(&fsid, 0, sizeof(fsid));
  }

  // no copying
private:
  OSDMap(const OSDMap& other) = default;
  OSDMap& operator=(const OSDMap& other) = default;
public:

  void deepish_copy_from(const OSDMap& o) {
    *this = o;
    primary_temp.reset(new mempool::osdmap::map<pg_t,int32_t>(*o.primary_temp));
    pg_temp.reset(new mempool::osdmap::map<pg_t,mempool::osdmap::vector<int32_t> >(*o.pg_temp));
    osd_uuid.reset(new mempool::osdmap::vector<uuid_d>(*o.osd_uuid));

    if (o.osd_primary_affinity)
      osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity));

    // NOTE: this still references shared entity_addr_t's.
    osd_addrs.reset(new addrs_s(*o.osd_addrs));

    // NOTE: we do not copy crush. note that apply_incremental will
    // allocate a new CrushWrapper, though.
  }
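  // Copying is private, so callers that need an independent, mutable snapshot
  // use deepish_copy_from() instead of assignment (illustrative sketch):
  //
  //   OSDMap next;
  //   next.deepish_copy_from(current);   // own pg_temp/primary_temp/uuid copies
  //   next.apply_incremental(inc);       // mutate the copy; 'current' untouched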

  // map info
  const uuid_d& get_fsid() const { return fsid; }
  void set_fsid(uuid_d& f) { fsid = f; }

  epoch_t get_epoch() const { return epoch; }
  void inc_epoch() { epoch++; }

  void set_epoch(epoch_t e);

  /* stamps etc */
  const utime_t& get_created() const { return created; }
  const utime_t& get_modified() const { return modified; }

  bool is_blacklisted(const entity_addr_t& a) const;
  void get_blacklist(list<pair<entity_addr_t,utime_t > > *bl) const;

  string get_cluster_snapshot() const {
    if (cluster_snapshot_epoch == epoch)
      return cluster_snapshot;
    return string();
  }

  float get_full_ratio() const {
    return full_ratio;
  }
  float get_backfillfull_ratio() const {
    return backfillfull_ratio;
  }
  float get_nearfull_ratio() const {
    return nearfull_ratio;
  }
  void count_full_nearfull_osds(int *full, int *backfill, int *nearfull) const;
  void get_full_osd_util(
    const ceph::unordered_map<int32_t,osd_stat_t> &osd_stat,
    map<int, float> *full,
    map<int, float> *backfill,
    map<int, float> *nearfull) const;

  /***** cluster state *****/
  /* osds */
  int get_max_osd() const { return max_osd; }
  void set_max_osd(int m);

  unsigned get_num_osds() const {
    return num_osd;
  }
  unsigned get_num_up_osds() const {
    return num_up_osd;
  }
  unsigned get_num_in_osds() const {
    return num_in_osd;
  }
  /// recalculate cached values for get_num{,_up,_in}_osds
  int calc_num_osds();

  void get_all_osds(set<int32_t>& ls) const;
  void get_up_osds(set<int32_t>& ls) const;
  unsigned get_num_pg_temp() const {
    return pg_temp->size();
  }

  int get_flags() const { return flags; }
  bool test_flag(int f) const { return flags & f; }
  void set_flag(int f) { flags |= f; }
  void clear_flag(int f) { flags &= ~f; }

  static void calc_state_set(int state, set<string>& st);

  int get_state(int o) const {
    assert(o < max_osd);
    return osd_state[o];
  }
  int get_state(int o, set<string>& st) const {
    assert(o < max_osd);
    unsigned t = osd_state[o];
    calc_state_set(t, st);
    return osd_state[o];
  }
  void set_state(int o, unsigned s) {
    assert(o < max_osd);
    osd_state[o] = s;
  }
  void set_weight(int o, unsigned w) {
    assert(o < max_osd);
    osd_weight[o] = w;
    if (w)
      osd_state[o] |= CEPH_OSD_EXISTS;
  }
  unsigned get_weight(int o) const {
    assert(o < max_osd);
    return osd_weight[o];
  }
  float get_weightf(int o) const {
    return (float)get_weight(o) / (float)CEPH_OSD_IN;
  }
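  // Worked example of the 16.16 fixed-point weight encoding (illustrative):
  // CEPH_OSD_IN is 0x10000, so get_weightf() is just weight / 0x10000.
  //
  //   set_weight(0, 0x10000);    // osd.0 -> 1.0, fully "in"
  //   set_weight(1, 0x8000);     // osd.1 -> 0.5, gets roughly half the data
  //   set_weight(2, 0);          // osd.2 -> 0.0, "out"
  //   float w = get_weightf(1);  // 0x8000 / 0x10000 == 0.5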
  void adjust_osd_weights(const map<int,double>& weights, Incremental& inc) const;

  void set_primary_affinity(int o, int w) {
    assert(o < max_osd);
    if (!osd_primary_affinity)
      osd_primary_affinity.reset(
        new mempool::osdmap::vector<__u32>(
          max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY));
    (*osd_primary_affinity)[o] = w;
  }
  unsigned get_primary_affinity(int o) const {
    assert(o < max_osd);
    if (!osd_primary_affinity)
      return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
    return (*osd_primary_affinity)[o];
  }
  float get_primary_affinityf(int o) const {
    return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY;
  }

  bool has_erasure_code_profile(const string &name) const {
    auto i = erasure_code_profiles.find(name);
    return i != erasure_code_profiles.end();
  }
  int get_erasure_code_profile_default(CephContext *cct,
                                       map<string,string> &profile_map,
                                       ostream *ss);
  void set_erasure_code_profile(const string &name,
                                const map<string,string>& profile) {
    erasure_code_profiles[name] = profile;
  }
  const map<string,string> &get_erasure_code_profile(
    const string &name) const {
    static map<string,string> empty;
    auto i = erasure_code_profiles.find(name);
    if (i == erasure_code_profiles.end())
      return empty;
    else
      return i->second;
  }
  const mempool::osdmap::map<string,map<string,string> > &get_erasure_code_profiles() const {
    return erasure_code_profiles;
  }

  bool exists(int osd) const {
    //assert(osd >= 0);
    return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS);
  }

  bool is_up(int osd) const {
    return exists(osd) && (osd_state[osd] & CEPH_OSD_UP);
  }

  bool has_been_up_since(int osd, epoch_t epoch) const {
    return is_up(osd) && get_up_from(osd) <= epoch;
  }

  bool is_down(int osd) const {
    return !is_up(osd);
  }

  bool is_out(int osd) const {
    return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT;
  }

  bool is_in(int osd) const {
    return !is_out(osd);
  }

  /**
   * check if an entire crush subtree is down
   */
  bool subtree_is_down(int id, set<int> *down_cache) const;
  bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set<int> *down_cache) const;

  int identify_osd(const entity_addr_t& addr) const;
  int identify_osd(const uuid_d& u) const;
  int identify_osd_on_all_channels(const entity_addr_t& addr) const;

  bool have_addr(const entity_addr_t& addr) const {
    return identify_osd(addr) >= 0;
  }
  int find_osd_on_ip(const entity_addr_t& ip) const;
  const entity_addr_t &get_addr(int osd) const {
    assert(exists(osd));
    return osd_addrs->client_addr[osd] ? *osd_addrs->client_addr[osd] : osd_addrs->blank;
  }
  const entity_addr_t &get_cluster_addr(int osd) const {
    assert(exists(osd));
    if (!osd_addrs->cluster_addr[osd] || *osd_addrs->cluster_addr[osd] == entity_addr_t())
      return get_addr(osd);
    return *osd_addrs->cluster_addr[osd];
  }
  const entity_addr_t &get_hb_back_addr(int osd) const {
    assert(exists(osd));
    return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank;
  }
  const entity_addr_t &get_hb_front_addr(int osd) const {
    assert(exists(osd));
    return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank;
  }
  entity_inst_t get_most_recent_inst(int osd) const {
    assert(exists(osd));
    return entity_inst_t(entity_name_t::OSD(osd), get_addr(osd));
  }
  entity_inst_t get_inst(int osd) const {
    assert(is_up(osd));
    return get_most_recent_inst(osd);
  }
  entity_inst_t get_cluster_inst(int osd) const {
    assert(is_up(osd));
    return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd));
  }
  entity_inst_t get_hb_back_inst(int osd) const {
    assert(is_up(osd));
    return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd));
  }
  entity_inst_t get_hb_front_inst(int osd) const {
    assert(is_up(osd));
    return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd));
  }

  const uuid_d& get_uuid(int osd) const {
    assert(exists(osd));
    return (*osd_uuid)[osd];
  }

  const epoch_t& get_up_from(int osd) const {
    assert(exists(osd));
    return osd_info[osd].up_from;
  }
  const epoch_t& get_up_thru(int osd) const {
    assert(exists(osd));
    return osd_info[osd].up_thru;
  }
  const epoch_t& get_down_at(int osd) const {
    assert(exists(osd));
    return osd_info[osd].down_at;
  }
  const osd_info_t& get_info(int osd) const {
    assert(osd < max_osd);
    return osd_info[osd];
  }

  const osd_xinfo_t& get_xinfo(int osd) const {
    assert(osd < max_osd);
    return osd_xinfo[osd];
  }

  int get_next_up_osd_after(int n) const {
    if (get_max_osd() == 0)
      return -1;
    for (int i = n + 1; i != n; ++i) {
      if (i >= get_max_osd())
        i = 0;
      if (i == n)
        break;
      if (is_up(i))
        return i;
    }
    return -1;
  }

  int get_previous_up_osd_before(int n) const {
    if (get_max_osd() == 0)
      return -1;
    for (int i = n - 1; i != n; --i) {
      if (i < 0)
        i = get_max_osd() - 1;
      if (i == n)
        break;
      if (is_up(i))
        return i;
    }
    return -1;
  }

  /**
   * get feature bits required by the current structure
   *
   * @param entity_type [in] what entity type we are asking about
   * @param mask [out] set of all possible map-related features we could set
   * @return feature bits used by this map
   */
  uint64_t get_features(int entity_type, uint64_t *mask) const;

  /**
   * get oldest *client* version (firefly, hammer, etc.) that can connect given
   * the feature bits required (according to get_features()).
   */
  pair<string,string> get_min_compat_client() const;

  /**
   * get intersection of features supported by up osds
   */
  uint64_t get_up_osd_features() const;

  int apply_incremental(const Incremental &inc);

  /// try to re-use/reference addrs in oldmap from newmap
  static void dedup(const OSDMap *oldmap, OSDMap *newmap);

  static void clean_temps(CephContext *cct, const OSDMap& osdmap,
                          Incremental *pending_inc);

  // serialize, unserialize
private:
  void encode_client_old(bufferlist& bl) const;
  void encode_classic(bufferlist& bl, uint64_t features) const;
  void decode_classic(bufferlist::iterator& p);
  void post_decode();
public:
  void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const;
  void decode(bufferlist& bl);
  void decode(bufferlist::iterator& bl);


  /**** mapping facilities ****/
  int map_to_pg(
    int64_t pool,
    const string& name,
    const string& key,
    const string& nspace,
    pg_t *pg) const;
  int object_locator_to_pg(const object_t& oid, const object_locator_t& loc,
                           pg_t &pg) const;
  pg_t object_locator_to_pg(const object_t& oid,
                            const object_locator_t& loc) const {
    pg_t pg;
    int ret = object_locator_to_pg(oid, loc, pg);
    assert(ret == 0);
    return pg;
  }


  static object_locator_t file_to_object_locator(const file_layout_t& layout) {
    return object_locator_t(layout.pool_id, layout.pool_ns);
  }

  ceph_object_layout file_to_object_layout(object_t oid,
                                           file_layout_t& layout) const {
    return make_object_layout(oid, layout.pool_id, layout.pool_ns);
  }

  ceph_object_layout make_object_layout(object_t oid, int pg_pool,
                                        string nspace) const;

  int get_pg_num(int pg_pool) const
  {
    const pg_pool_t *pool = get_pg_pool(pg_pool);
    assert(NULL != pool);
    return pool->get_pg_num();
  }

  bool pg_exists(pg_t pgid) const {
    const pg_pool_t *p = get_pg_pool(pgid.pool());
    return p && pgid.ps() < p->get_pg_num();
  }

private:
  /// pg -> (raw osd list)
  int _pg_to_raw_osds(
    const pg_pool_t& pool, pg_t pg,
    vector<int> *osds,
    ps_t *ppps) const;
  int _pick_primary(const vector<int>& osds) const;
  void _remove_nonexistent_osds(const pg_pool_t& pool, vector<int>& osds) const;

  void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool,
                               vector<int> *osds, int *primary) const;

  /// apply pg_upmap[_items] mappings
  void _apply_remap(const pg_pool_t& pi, pg_t pg, vector<int> *raw) const;

  /// pg -> (up osd list)
  void _raw_to_up_osds(const pg_pool_t& pool, const vector<int>& raw,
                       vector<int> *up) const;


  /**
   * Get the pg and primary temp, if they are specified.
   * @param temp_pg [out] Will be empty or contain the temp PG mapping on return
   * @param temp_primary [out] Will be the value in primary_temp, or a value derived
   * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary.
   */
  void _get_temp_osds(const pg_pool_t& pool, pg_t pg,
                      vector<int> *temp_pg, int *temp_primary) const;

  /**
   * map to up and acting. Fills in whatever fields are non-NULL.
   */
  void _pg_to_up_acting_osds(const pg_t& pg, vector<int> *up, int *up_primary,
                             vector<int> *acting, int *acting_primary,
                             bool raw_pg_to_pg = true) const;

public:
  /***
   * This is suitable only for looking at raw CRUSH outputs. It skips
   * applying the temp and up checks and should not be used
   * by anybody for data mapping purposes.
   * raw and primary must be non-NULL
   */
  int pg_to_raw_osds(pg_t pg, vector<int> *raw, int *primary) const;
  /// map a pg to its acting set. @return acting set size
  int pg_to_acting_osds(const pg_t& pg, vector<int> *acting,
                        int *acting_primary) const {
    _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary);
    return acting->size();
  }
  int pg_to_acting_osds(pg_t pg, vector<int>& acting) const {
    return pg_to_acting_osds(pg, &acting, NULL);
  }
  /**
   * This does not apply temp overrides and should not be used
   * by anybody for data mapping purposes. Specify both pointers.
   */
  void pg_to_raw_up(pg_t pg, vector<int> *up, int *primary) const;
  /**
   * map a pg to its acting set as well as its up set. You must use
   * the acting set for data mapping purposes, but some users will
   * also find the up set useful for things like deciding what to
   * set as pg_temp.
   * Each of these pointers must be non-NULL.
   */
  void pg_to_up_acting_osds(pg_t pg, vector<int> *up, int *up_primary,
                            vector<int> *acting, int *acting_primary) const {
    _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary);
  }
  void pg_to_up_acting_osds(pg_t pg, vector<int>& up, vector<int>& acting) const {
    int up_primary, acting_primary;
    pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary);
  }
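  // Illustrative sketch of the up vs. acting distinction (not part of the API):
  //
  //   vector<int> up, acting;
  //   int up_primary, acting_primary;
  //   osdmap.pg_to_up_acting_osds(pgid, &up, &up_primary,
  //                               &acting, &acting_primary);
  //
  // 'up' is what CRUSH (plus pg_upmap) currently says; 'acting' additionally
  // honors pg_temp/primary_temp overrides and is what must be used for data
  // mapping decisions.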
  bool pg_is_ec(pg_t pg) const {
    auto i = pools.find(pg.pool());
    assert(i != pools.end());
    return i->second.ec_pool();
  }
  bool get_primary_shard(const pg_t& pgid, spg_t *out) const {
    auto i = get_pools().find(pgid.pool());
    if (i == get_pools().end()) {
      return false;
    }
    if (!i->second.ec_pool()) {
      *out = spg_t(pgid);
      return true;
    }
    int primary;
    vector<int> acting;
    pg_to_acting_osds(pgid, &acting, &primary);
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == primary) {
        *out = spg_t(pgid, shard_id_t(i));
        return true;
      }
    }
    return false;
  }

  int64_t lookup_pg_pool_name(const string& name) const {
    auto p = name_pool.find(name);
    if (p == name_pool.end())
      return -ENOENT;
    return p->second;
  }

  int64_t get_pool_max() const {
    return pool_max;
  }
  const mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() const {
    return pools;
  }
  mempool::osdmap::map<int64_t,pg_pool_t>& get_pools() {
    return pools;
  }
  const string& get_pool_name(int64_t p) const {
    auto i = pool_name.find(p);
    assert(i != pool_name.end());
    return i->second;
  }
  bool have_pg_pool(int64_t p) const {
    return pools.count(p);
  }
  const pg_pool_t* get_pg_pool(int64_t p) const {
    auto i = pools.find(p);
    if (i != pools.end())
      return &i->second;
    return NULL;
  }
  unsigned get_pg_size(pg_t pg) const {
    auto p = pools.find(pg.pool());
    assert(p != pools.end());
    return p->second.get_size();
  }
  int get_pg_type(pg_t pg) const {
    auto p = pools.find(pg.pool());
    assert(p != pools.end());
    return p->second.get_type();
  }


  pg_t raw_pg_to_pg(pg_t pg) const {
    auto p = pools.find(pg.pool());
    assert(p != pools.end());
    return p->second.raw_pg_to_pg(pg);
  }

  // pg -> acting primary osd
  int get_pg_acting_primary(pg_t pg) const {
    int primary = -1;
    _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary);
    return primary;
  }

  /*
   * check whether an spg_t maps to a particular osd
   */
  bool is_up_acting_osd_shard(spg_t pg, int osd) const {
    vector<int> up, acting;
    _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false);
    if (pg.shard == shard_id_t::NO_SHARD) {
      if (calc_pg_role(osd, acting, acting.size()) >= 0 ||
          calc_pg_role(osd, up, up.size()) >= 0)
        return true;
    } else {
      if (pg.shard < (int)acting.size() && acting[pg.shard] == osd)
        return true;
      if (pg.shard < (int)up.size() && up[pg.shard] == osd)
        return true;
    }
    return false;
  }


  /* what replica # is a given osd? 0 primary, -1 for none. */
  static int calc_pg_rank(int osd, const vector<int>& acting, int nrep=0);
  static int calc_pg_role(int osd, const vector<int>& acting, int nrep=0);
  static bool primary_changed(
    int oldprimary,
    const vector<int> &oldacting,
    int newprimary,
    const vector<int> &newacting);

  /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */
  int get_pg_acting_rank(pg_t pg, int osd) const {
    vector<int> group;
    int nrep = pg_to_acting_osds(pg, group);
    return calc_pg_rank(osd, group, nrep);
  }
  /* role is -1 (stray), 0 (primary), 1 (replica) */
  int get_pg_acting_role(const pg_t& pg, int osd) const {
    vector<int> group;
    int nrep = pg_to_acting_osds(pg, group);
    return calc_pg_role(osd, group, nrep);
  }
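  // Rank vs. role sketch (illustrative): with acting = [7, 3, 9] for some pg,
  //   calc_pg_rank(7, acting) == 0    // primary
  //   calc_pg_rank(9, acting) == 2    // second replica
  //   calc_pg_rank(4, acting) == -1   // stray: not in the acting set
  // get_pg_acting_rank()/get_pg_acting_role() simply compute the acting set
  // via pg_to_acting_osds() and then call these static helpers.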

  bool osd_is_valid_op_target(pg_t pg, int osd) const {
    int primary;
    vector<int> group;
    int nrep = pg_to_acting_osds(pg, &group, &primary);
    if (osd == primary)
      return true;
    if (pg_is_ec(pg))
      return false;

    return calc_pg_role(osd, group, nrep) >= 0;
  }

  int clean_pg_upmaps(
    CephContext *cct,
    Incremental *pending_inc);

  bool try_pg_upmap(
    CephContext *cct,
    pg_t pg,                       ///< pg to potentially remap
    const set<int>& overfull,      ///< osds we'd want to evacuate
    const vector<int>& underfull,  ///< osds to move to, in order of preference
    vector<int> *orig,
    vector<int> *out);             ///< resulting alternative mapping

  int calc_pg_upmaps(
    CephContext *cct,
    float max_deviation,           ///< max deviation from target (value < 1.0)
    int max_iterations,            ///< max iterations to run
    const set<int64_t>& pools,     ///< [optional] restrict to pool
    Incremental *pending_inc
    );

  /*
   * handy helpers to build simple maps...
   */
  /**
   * Build an OSD map suitable for basic usage. If **num_osd** is >= 0
   * it will be initialized with the specified number of OSDs in a
   * single host. If **num_osd** is < 0 the layout of the OSD map will
   * be built by reading the content of the configuration file.
   *
   * @param cct [in] in core ceph context
   * @param e [in] initial epoch
   * @param fsid [in] id of the cluster
   * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0
   * @return **0** on success, negative errno on error.
   */
  int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid,
                   int num_osd, int pg_bits, int pgp_bits);
  static int _build_crush_types(CrushWrapper& crush);
  static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush,
                                    int num_osd, ostream *ss);
  static int build_simple_crush_map_from_conf(CephContext *cct,
                                              CrushWrapper& crush,
                                              ostream *ss);
  static int build_simple_crush_rulesets(CephContext *cct, CrushWrapper& crush,
                                         const string& root,
                                         ostream *ss);

  bool crush_ruleset_in_use(int ruleset) const;

  void clear_temp() {
    pg_temp->clear();
    primary_temp->clear();
  }

private:
  void print_osd_line(int cur, ostream *out, Formatter *f) const;
public:
  void print(ostream& out) const;
  void print_pools(ostream& out) const;
  void print_summary(Formatter *f, ostream& out) const;
  void print_oneline_summary(ostream& out) const;
  void print_tree(Formatter *f, ostream *out) const;

  int summarize_mapping_stats(
    OSDMap *newmap,
    const set<int64_t> *pools,
    std::string *out,
    Formatter *f) const;

  string get_flag_string() const;
  static string get_flag_string(unsigned flags);
  static void dump_erasure_code_profiles(
    const mempool::osdmap::map<string,map<string,string> > &profiles,
    Formatter *f);
  void dump(Formatter *f) const;
  static void generate_test_instances(list<OSDMap*>& o);
  bool check_new_blacklist_entries() const { return new_blacklist_entries; }
};
WRITE_CLASS_ENCODER_FEATURES(OSDMap)
WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental)

typedef ceph::shared_ptr<const OSDMap> OSDMapRef;

inline ostream& operator<<(ostream& out, const OSDMap& m) {
  m.print_oneline_summary(out);
  return out;
}
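// Minimal sketch of building and inspecting a map (illustrative only; assumes
// the usual g_ceph_context global and common/Formatter.h are available):
//
//   OSDMap osdmap;
//   uuid_d fsid;
//   osdmap.build_simple(g_ceph_context, 1, fsid, 4, 6, 6);  // epoch 1, 4 osds
//   std::cout << osdmap << std::endl;   // operator<< -> print_oneline_summary()
//   JSONFormatter jf(true);
//   osdmap.dump(&jf);
//   jf.flush(std::cout);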


#endif