git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDBalancer.h
update sources to 12.2.10
[ceph.git] / ceph / src / mds / MDBalancer.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17 #ifndef CEPH_MDBALANCER_H
18 #define CEPH_MDBALANCER_H
19
20 #include <list>
21 #include <map>
22 using std::list;
23 using std::map;
24
25 #include "include/types.h"
26 #include "common/Clock.h"
27 #include "common/Cond.h"
28
// Forward declarations: this header only names these types through pointers
// and references, so their full definitions are not required here.
29 class MDSMap;
30 class MDSRank;
31 class Message;
32 class MHeartbeat;
33 class CInode;
34 class CDir;
35 class Messenger;
36 class MonClient;
37
// Metadata-server (MDS) balancer attached to a single MDSRank (see the `mds`
// member). This header only declares the interface; apart from get_maxim()
// and get_maxex(), every member function is defined outside this file.
//
// Public interface: configuration/message/tick entry points, popularity
// accounting hooks (hit_*, add_import, subtract_export), dirfrag split/merge
// queueing and a load dump. Private part: heartbeat exchange and the
// rebalancing helpers together with their per-epoch state.
//
// NOTE(review): `set`, `string` and `Formatter` are used below, but this
// header only includes <list>/<map>, only has using-declarations for
// std::list/std::map, and does not forward-declare Formatter. They (and
// mds_load_t, dirfrag_t, mds_rank_t, utime_t) must therefore be provided by
// the included project headers or by whoever includes this file.
38 class MDBalancer {
// Gives C_Bal_SendHeartbeat access to the private members of this class.
39 friend class C_Bal_SendHeartbeat;
40 public:
// Takes the owning rank, a messenger and a monitor client.
// NOTE(review): presumably stored in mds / messenger / mon_client below, and
// expected to initialise bal_fragment_dirs / bal_fragment_interval, which
// have no in-class initializer -- confirm in the implementation.
41 MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc);
42
// Configuration-change hook. Receives the configuration object, a set of
// strings (presumably the names of the options that changed -- confirm
// against the caller) and the MDSMap.
43 void handle_conf_change(const struct md_config_t *conf,
44 const std::set <std::string> &changed,
45 const MDSMap &mds_map);
46
// Entry point for an incoming Message.
// NOTE(review): the meaning of the int return value is not visible here.
47 int proc_message(Message *m);
48
49 /**
50 * Regularly called upkeep function.
51 *
52 * Sends MHeartbeat messages to the mons.
 *
 * NOTE(review): handle_heartbeat() and the per-rank mds_load map in this
 * class suggest heartbeats are exchanged between MDS ranks; confirm the
 * actual recipient in the implementation.
53 */
54 void tick();
55
// Accounting hooks taking a CDir and a timestamp. Judging by the names they
// adjust load/popularity figures when a directory is exported, imported or
// renamed (`inc` presumably selects add vs. subtract) -- confirm in the
// implementation.
56 void subtract_export(CDir *ex, utime_t now);
57 void add_import(CDir *im, utime_t now);
58 void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc);
59
// Record an access ("hit") of kind `type` on an inode / directory at time
// `now`. `who` defaults to -1 and `amount` defaults to 1.0.
// NOTE(review): the valid values of `type` and the meaning of `who` are
// defined elsewhere; -1 presumably means "no specific originator".
60 void hit_inode(const utime_t& now, CInode *in, int type, int who=-1);
61 void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0);
62
// Queue a split / merge of a directory fragment. See split_pending and
// merge_pending below for how duplicate requests are suppressed.
// NOTE(review): the exact effect of `fast` is not visible in this header.
63 void queue_split(const CDir *dir, bool fast);
64 void queue_merge(CDir *dir);
65
66 /**
67 * Based on size and configuration, decide whether to issue a queue_split
68 * or queue_merge for this CDir.
69 *
 * \param dir the directory fragment to examine
70 * \param hot whether the directory's temperature is enough to split it
71 */
72 void maybe_fragment(CDir *dir, bool hot);
73
// Notification that MDS rank `who` has failed.
// NOTE(review): which per-rank state is dropped is not visible here.
74 void handle_mds_failure(mds_rank_t who);
75
// Dump load information through the given Formatter.
// NOTE(review): return value semantics are not visible in this header.
76 int dump_loads(Formatter *f);
77
78 private:
// NOTE(review): neither of these two members has an in-class initializer
// (unlike beat_epoch, my_load, ... below); they rely on being set by code
// outside this header. By name they mirror fragmentation-related
// configuration values -- confirm.
79 bool bal_fragment_dirs;
80 int64_t bal_fragment_interval;
81
// Scratch state for one rebalancing pass: three per-rank amounts keyed by
// mds_rank_t. `imported` and `exported` are read by get_maxim()/get_maxex().
82 typedef struct {
83 std::map<mds_rank_t, double> targets;
84 std::map<mds_rank_t, double> imported;
85 std::map<mds_rank_t, double> exported;
86 } balance_state_t;
87
88 // Set up the rebalancing targets for export and do one if the
89 // MDSMap is up to date.
90 void prep_rebalance(int beat);
// NOTE(review): presumably the "mantle" (scripted balancer, cf. bal_code)
// variant of prep_rebalance(); return value semantics not visible here.
91 int mantle_prep_rebalance();
92
// Process directory export pins (definition not in this header).
93 void handle_export_pins(void);
94
// Returns an mds_load_t for time `now`; last_get_load, last_num_requests and
// last_cpu_time below are presumably its bookkeeping -- confirm.
95 mds_load_t get_load(utime_t now);
// NOTE(review): by name, fetches the balancer code locally (see bal_code /
// bal_version); return value semantics not visible here.
96 int localize_balancer();
// Heartbeat send / receive handlers for MHeartbeat messages.
97 void send_heartbeat();
98 void handle_heartbeat(MHeartbeat *m);
// Collect export candidates under `dir`.
// `exports`, `have` and `already_exporting` are non-const references, i.e.
// in/out parameters; `amount` is presumably the load to find and `have` the
// running total found so far -- confirm in the implementation.
99 void find_exports(CDir *dir,
100 double amount,
101 list<CDir*>& exports,
102 double& have,
103 set<CDir*>& already_exporting);
104
// Pair an exporting rank `ex` with an importing rank `im`. `maxex` and
// `maxim` are passed by non-const reference and may be updated.
// NOTE(review): the returned double is presumably the amount matched.
105 double try_match(balance_state_t &state,
106 mds_rank_t ex, double& maxex,
107 mds_rank_t im, double& maxim);
108
// Computes target_load - mds_meta_load[im] - state.imported[im]; by name,
// the maximum load rank `im` may still import.
// Both lookups use std::map::operator[], which default-inserts a 0.0 entry
// when the rank is absent -- hence the non-const function and `state`.
109 double get_maxim(balance_state_t &state, mds_rank_t im) {
110 return target_load - mds_meta_load[im] - state.imported[im];
111 }
// Computes mds_meta_load[ex] - target_load - state.exported[ex]; by name,
// the maximum load rank `ex` may still export.
// Same operator[] insertion side effect as get_maxim().
112 double get_maxex(balance_state_t &state, mds_rank_t ex) {
113 return mds_meta_load[ex] - target_load - state.exported[ex];
114 }
115
116 /**
117 * Try to rebalance.
118 *
119 * Check if the monitor has recorded the current export targets;
120 * if it has then do the actual export. Otherwise send off our
121 * export targets message again.
 *
 * \param state the balancing state to act on
122 */
123 void try_rebalance(balance_state_t& state);
124
// Non-owning pointers handed to the constructor (presumably; see the note
// there).
125 MDSRank *mds;
126 Messenger *messenger;
127 MonClient *mon_client;
// Heartbeat epoch counter, starts at 0.
128 int beat_epoch = 0;
129
// NOTE(review): presumably the balancer script source and its version
// identifier (cf. localize_balancer / mantle_prep_rebalance) -- confirm.
130 string bal_code;
131 string bal_version;
132
133 utime_t last_heartbeat;
134 utime_t last_sample;
135 utime_t rebalance_time; //ensure a consistent view of load for rebalance
136
// Bookkeeping from the previous load sample (presumably maintained by
// get_load()); the two counters start at 0.
137 utime_t last_get_load;
138 uint64_t last_num_requests = 0;
139 uint64_t last_cpu_time = 0;
140
141 // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
142 // just as soon as a delayed context comes back and triggers it.
143 // These sets just prevent us from spawning extra timer contexts for
144 // dirfrags that already have one in flight.
145 set<dirfrag_t> split_pending, merge_pending;
146
147 // per-epoch scatter/gathered info
// All keyed by MDS rank. get_maxim()/get_maxex() index mds_meta_load with
// operator[], so it may gain 0.0 entries for ranks not seen before.
// NOTE(review): mds_import_map stores float while the other load maps store
// double.
148 map<mds_rank_t, mds_load_t> mds_load;
149 map<mds_rank_t, double> mds_meta_load;
150 map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
151 map<mds_rank_t, int> mds_last_epoch_under_map;
152
153 // per-epoch state
// target_load is the reference value used by get_maxim()/get_maxex();
// both values start at 0.
154 double my_load = 0;
155 double target_load = 0;
156 };
157
158 #endif