git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDBalancer.h
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / mds / MDBalancer.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17 #ifndef CEPH_MDBALANCER_H
18 #define CEPH_MDBALANCER_H
19
20 #include <list>
21 #include <map>
22
23 #include "include/types.h"
24 #include "common/Clock.h"
25 #include "common/Cond.h"
26
27 #include "msg/Message.h"
28 #include "messages/MHeartbeat.h"
29
30 #include "MDSMap.h"
31
// Forward declarations of classes referenced by MDBalancer's declarations.
class CDir;
class CInode;
class MDSRank;
class MHeartbeat;
class Messenger;
class MonClient;
38
39 class MDBalancer {
40 public:
41 using clock = ceph::coarse_mono_clock;
42 using time = ceph::coarse_mono_time;
43 friend class C_Bal_SendHeartbeat;
44
45 MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc);
46
47 void handle_conf_change(const ConfigProxy& conf,
48 const std::set <std::string> &changed,
49 const MDSMap &mds_map);
50
51 int proc_message(const Message::const_ref &m);
52
53 /**
54 * Regularly called upkeep function.
55 *
56 * Sends MHeartbeat messages to the mons.
57 */
58 void tick();
59
60 void subtract_export(CDir *ex);
61 void add_import(CDir *im);
62 void adjust_pop_for_rename(CDir *pdir, CDir *dir, bool inc);
63
64 void hit_inode(CInode *in, int type, int who=-1);
65 void hit_dir(CDir *dir, int type, int who=-1, double amount=1.0);
66
67 void queue_split(const CDir *dir, bool fast);
68 void queue_merge(CDir *dir);
69
70 /**
71 * Based on size and configuration, decide whether to issue a queue_split
72 * or queue_merge for this CDir.
73 *
74 * \param hot whether the directory's temperature is enough to split it
75 */
76 void maybe_fragment(CDir *dir, bool hot);
77
78 void handle_mds_failure(mds_rank_t who);
79
80 int dump_loads(Formatter *f);
81
82 private:
83 bool bal_fragment_dirs;
84 int64_t bal_fragment_interval;
85
86 typedef struct {
87 std::map<mds_rank_t, double> targets;
88 std::map<mds_rank_t, double> imported;
89 std::map<mds_rank_t, double> exported;
90 } balance_state_t;
91
92 //set up the rebalancing targets for export and do one if the
93 //MDSMap is up to date
94 void prep_rebalance(int beat);
95 int mantle_prep_rebalance();
96
97 void handle_export_pins(void);
98
99 mds_load_t get_load();
100 int localize_balancer();
101 void send_heartbeat();
102 void handle_heartbeat(const MHeartbeat::const_ref &m);
103 void find_exports(CDir *dir,
104 double amount,
105 std::list<CDir*>& exports,
106 double& have,
107 set<CDir*>& already_exporting);
108
109 double try_match(balance_state_t &state,
110 mds_rank_t ex, double& maxex,
111 mds_rank_t im, double& maxim);
112
113 double get_maxim(balance_state_t &state, mds_rank_t im) {
114 return target_load - mds_meta_load[im] - state.imported[im];
115 }
116 double get_maxex(balance_state_t &state, mds_rank_t ex) {
117 return mds_meta_load[ex] - target_load - state.exported[ex];
118 }
119
120 /**
121 * Try to rebalance.
122 *
123 * Check if the monitor has recorded the current export targets;
124 * if it has then do the actual export. Otherwise send off our
125 * export targets message again.
126 */
127 void try_rebalance(balance_state_t& state);
128
129 MDSRank *mds;
130 Messenger *messenger;
131 MonClient *mon_client;
132 int beat_epoch = 0;
133
134 string bal_code;
135 string bal_version;
136
137 time last_heartbeat = clock::zero();
138 time last_sample = clock::zero();
139 time rebalance_time = clock::zero(); //ensure a consistent view of load for rebalance
140
141 time last_get_load = clock::zero();
142 uint64_t last_num_requests = 0;
143 uint64_t last_cpu_time = 0;
144
145 // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
146 // just as soon as a delayed context comes back and triggers it.
147 // These sets just prevent us from spawning extra timer contexts for
148 // dirfrags that already have one in flight.
149 set<dirfrag_t> split_pending, merge_pending;
150
151 // per-epoch scatter/gathered info
152 std::map<mds_rank_t, mds_load_t> mds_load;
153 std::map<mds_rank_t, double> mds_meta_load;
154 std::map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
155 std::map<mds_rank_t, int> mds_last_epoch_under_map;
156
157 // per-epoch state
158 double my_load = 0;
159 double target_load = 0;
160 };
161
162 #endif