]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDBalancer.h
1f8d0dd1e607ecadd2e2f384985021ae94e9457c
[ceph.git] / ceph / src / mds / MDBalancer.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16
17 #ifndef CEPH_MDBALANCER_H
18 #define CEPH_MDBALANCER_H
19
#include <list>
#include <map>
#include <set>
#include <string>
using std::list;
using std::map;

#include "include/types.h"
#include "common/Clock.h"
#include "common/Cond.h"
28
29 class MDSRank;
30 class Message;
31 class MHeartbeat;
32 class CInode;
33 class CDir;
34 class Messenger;
35 class MonClient;
36
37 class MDBalancer {
38 friend class C_Bal_SendHeartbeat;
39 public:
40 MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
41 mds(m),
42 messenger(msgr),
43 mon_client(monc),
44 beat_epoch(0),
45 last_epoch_under(0), last_epoch_over(0), my_load(0.0), target_load(0.0)
46 { }
47
48 mds_load_t get_load(utime_t);
49
50 int proc_message(Message *m);
51
52 /**
53 * Regularly called upkeep function.
54 *
55 * Sends MHeartbeat messages to the mons.
56 */
57 void tick();
58
59 void subtract_export(CDir *ex, utime_t now);
60 void add_import(CDir *im, utime_t now);
61
62 void hit_inode(utime_t now, CInode *in, int type, int who=-1);
63 void hit_dir(utime_t now, CDir *dir, int type, int who=-1, double amount=1.0);
64
65 void queue_split(const CDir *dir, bool fast);
66 void queue_merge(CDir *dir);
67
68 /**
69 * Based on size and configuration, decide whether to issue a queue_split
70 * or queue_merge for this CDir.
71 *
72 * \param hot whether the directory's temperature is enough to split it
73 */
74 void maybe_fragment(CDir *dir, bool hot);
75
76 private:
77 typedef struct {
78 std::map<mds_rank_t, double> targets;
79 std::map<mds_rank_t, double> imported;
80 std::map<mds_rank_t, double> exported;
81 } balance_state_t;
82
83 //set up the rebalancing targets for export and do one if the
84 //MDSMap is up to date
85 void prep_rebalance(int beat);
86 int mantle_prep_rebalance();
87
88 void handle_export_pins(void);
89
90 void export_empties();
91 int localize_balancer();
92 void send_heartbeat();
93 void handle_heartbeat(MHeartbeat *m);
94 void find_exports(CDir *dir,
95 double amount,
96 list<CDir*>& exports,
97 double& have,
98 set<CDir*>& already_exporting);
99
100 double try_match(balance_state_t &state,
101 mds_rank_t ex, double& maxex,
102 mds_rank_t im, double& maxim);
103
104 double get_maxim(balance_state_t &state, mds_rank_t im) {
105 return target_load - mds_meta_load[im] - state.imported[im];
106 }
107 double get_maxex(balance_state_t &state, mds_rank_t ex) {
108 return mds_meta_load[ex] - target_load - state.exported[ex];
109 }
110
111 /**
112 * Try to rebalance.
113 *
114 * Check if the monitor has recorded the current export targets;
115 * if it has then do the actual export. Otherwise send off our
116 * export targets message again.
117 */
118 void try_rebalance(balance_state_t& state);
119
120 MDSRank *mds;
121 Messenger *messenger;
122 MonClient *mon_client;
123 int beat_epoch;
124
125 int last_epoch_under;
126 int last_epoch_over;
127 string bal_code;
128 string bal_version;
129
130 utime_t last_heartbeat;
131 utime_t last_sample;
132 utime_t rebalance_time; //ensure a consistent view of load for rebalance
133
134 // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
135 // just as soon as a delayed context comes back and triggers it.
136 // These sets just prevent us from spawning extra timer contexts for
137 // dirfrags that already have one in flight.
138 set<dirfrag_t> split_pending, merge_pending;
139
140 // per-epoch scatter/gathered info
141 map<mds_rank_t, mds_load_t> mds_load;
142 map<mds_rank_t, double> mds_meta_load;
143 map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
144
145 // per-epoch state
146 double my_load, target_load;
147 };
148
149 #endif