]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDBalancer.h
update sources to 12.2.10
[ceph.git] / ceph / src / mds / MDBalancer.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17#ifndef CEPH_MDBALANCER_H
18#define CEPH_MDBALANCER_H
19
20#include <list>
21#include <map>
22using std::list;
23using std::map;
24
25#include "include/types.h"
26#include "common/Clock.h"
27#include "common/Cond.h"
28
// Forward declarations: these types are only named (pointer/reference
// parameters and members) in this header, so their full definitions are
// not needed here.
class MDSMap;
class MDSRank;
class Message;
class MHeartbeat;
class CInode;
class CDir;
class Messenger;
class MonClient;
38class MDBalancer {
39 friend class C_Bal_SendHeartbeat;
40public:
91327a77
AA
41 MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc);
42
43 void handle_conf_change(const struct md_config_t *conf,
44 const std::set <std::string> &changed,
45 const MDSMap &mds_map);
7c673cae
FG
46
47 int proc_message(Message *m);
48
49 /**
50 * Regularly called upkeep function.
51 *
52 * Sends MHeartbeat messages to the mons.
53 */
54 void tick();
55
56 void subtract_export(CDir *ex, utime_t now);
57 void add_import(CDir *im, utime_t now);
28e407b8 58 void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc);
7c673cae 59
28e407b8
AA
60 void hit_inode(const utime_t& now, CInode *in, int type, int who=-1);
61 void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0);
7c673cae
FG
62
63 void queue_split(const CDir *dir, bool fast);
64 void queue_merge(CDir *dir);
65
66 /**
67 * Based on size and configuration, decide whether to issue a queue_split
68 * or queue_merge for this CDir.
69 *
70 * \param hot whether the directory's temperature is enough to split it
71 */
72 void maybe_fragment(CDir *dir, bool hot);
73
224ce89b
WB
74 void handle_mds_failure(mds_rank_t who);
75
28e407b8
AA
76 int dump_loads(Formatter *f);
77
7c673cae 78private:
91327a77
AA
79 bool bal_fragment_dirs;
80 int64_t bal_fragment_interval;
81
7c673cae
FG
82 typedef struct {
83 std::map<mds_rank_t, double> targets;
84 std::map<mds_rank_t, double> imported;
85 std::map<mds_rank_t, double> exported;
86 } balance_state_t;
87
88 //set up the rebalancing targets for export and do one if the
89 //MDSMap is up to date
90 void prep_rebalance(int beat);
91 int mantle_prep_rebalance();
92
93 void handle_export_pins(void);
94
28e407b8 95 mds_load_t get_load(utime_t now);
7c673cae 96 int localize_balancer();
7c673cae
FG
97 void send_heartbeat();
98 void handle_heartbeat(MHeartbeat *m);
99 void find_exports(CDir *dir,
100 double amount,
101 list<CDir*>& exports,
102 double& have,
103 set<CDir*>& already_exporting);
104
105 double try_match(balance_state_t &state,
106 mds_rank_t ex, double& maxex,
107 mds_rank_t im, double& maxim);
108
109 double get_maxim(balance_state_t &state, mds_rank_t im) {
110 return target_load - mds_meta_load[im] - state.imported[im];
111 }
112 double get_maxex(balance_state_t &state, mds_rank_t ex) {
113 return mds_meta_load[ex] - target_load - state.exported[ex];
114 }
115
116 /**
117 * Try to rebalance.
118 *
119 * Check if the monitor has recorded the current export targets;
120 * if it has then do the actual export. Otherwise send off our
121 * export targets message again.
122 */
123 void try_rebalance(balance_state_t& state);
124
125 MDSRank *mds;
126 Messenger *messenger;
127 MonClient *mon_client;
28e407b8 128 int beat_epoch = 0;
7c673cae 129
7c673cae
FG
130 string bal_code;
131 string bal_version;
132
133 utime_t last_heartbeat;
134 utime_t last_sample;
135 utime_t rebalance_time; //ensure a consistent view of load for rebalance
136
28e407b8
AA
137 utime_t last_get_load;
138 uint64_t last_num_requests = 0;
91327a77 139 uint64_t last_cpu_time = 0;
28e407b8 140
7c673cae
FG
141 // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
142 // just as soon as a delayed context comes back and triggers it.
143 // These sets just prevent us from spawning extra timer contexts for
144 // dirfrags that already have one in flight.
145 set<dirfrag_t> split_pending, merge_pending;
146
147 // per-epoch scatter/gathered info
148 map<mds_rank_t, mds_load_t> mds_load;
149 map<mds_rank_t, double> mds_meta_load;
150 map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
28e407b8 151 map<mds_rank_t, int> mds_last_epoch_under_map;
7c673cae
FG
152
153 // per-epoch state
28e407b8
AA
154 double my_load = 0;
155 double target_load = 0;
7c673cae
FG
156};
157
158#endif