]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | ||
17 | #ifndef CEPH_MDBALANCER_H | |
18 | #define CEPH_MDBALANCER_H | |
19 | ||
20 | #include <list> | |
21 | #include <map> | |
22 | using std::list; | |
23 | using std::map; | |
24 | ||
25 | #include "include/types.h" | |
26 | #include "common/Clock.h" | |
27 | #include "common/Cond.h" | |
28 | ||
91327a77 | 29 | class MDSMap; |
7c673cae FG |
30 | class MDSRank; |
31 | class Message; | |
32 | class MHeartbeat; | |
33 | class CInode; | |
34 | class CDir; | |
35 | class Messenger; | |
36 | class MonClient; | |
37 | ||
38 | class MDBalancer { | |
39 | friend class C_Bal_SendHeartbeat; | |
40 | public: | |
91327a77 AA |
41 | MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc); |
42 | ||
43 | void handle_conf_change(const struct md_config_t *conf, | |
44 | const std::set <std::string> &changed, | |
45 | const MDSMap &mds_map); | |
7c673cae FG |
46 | |
47 | int proc_message(Message *m); | |
48 | ||
49 | /** | |
50 | * Regularly called upkeep function. | |
51 | * | |
52 | * Sends MHeartbeat messages to the mons (NOTE(review): the recipients are not visible in this header -- confirm against send_heartbeat()). | |
53 | */ | |
54 | void tick(); | |
55 | ||
56 | void subtract_export(CDir *ex, utime_t now); | |
57 | void add_import(CDir *im, utime_t now); | |
28e407b8 | 58 | void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc); |
7c673cae | 59 | |
28e407b8 AA |
60 | void hit_inode(const utime_t& now, CInode *in, int type, int who=-1); |
61 | void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0); | |
7c673cae FG |
62 | |
63 | void queue_split(const CDir *dir, bool fast); | |
64 | void queue_merge(CDir *dir); | |
65 | ||
66 | /** | |
67 | * Based on size and configuration, decide whether to call queue_split | |
68 | * or queue_merge for this CDir. | |
69 | * | |
70 | * \param hot whether the directory's temperature is high enough to split it | |
71 | */ | |
72 | void maybe_fragment(CDir *dir, bool hot); | |
73 | ||
224ce89b WB |
74 | void handle_mds_failure(mds_rank_t who); |
75 | ||
28e407b8 AA |
76 | int dump_loads(Formatter *f); |
77 | ||
7c673cae | 78 | private: |
91327a77 AA |
79 | bool bal_fragment_dirs; |
80 | int64_t bal_fragment_interval; | |
81 | ||
7c673cae FG |
82 | typedef struct { |
83 | std::map<mds_rank_t, double> targets; | |
84 | std::map<mds_rank_t, double> imported; | |
85 | std::map<mds_rank_t, double> exported; | |
86 | } balance_state_t; | |
87 | ||
88 | // Set up the rebalancing targets for export, and perform an export if the | |
89 | // MDSMap is up to date. | |
90 | void prep_rebalance(int beat); | |
91 | int mantle_prep_rebalance(); | |
92 | ||
93 | void handle_export_pins(void); | |
94 | ||
28e407b8 | 95 | mds_load_t get_load(utime_t now); |
7c673cae | 96 | int localize_balancer(); |
7c673cae FG |
97 | void send_heartbeat(); |
98 | void handle_heartbeat(MHeartbeat *m); | |
99 | void find_exports(CDir *dir, | |
100 | double amount, | |
101 | list<CDir*>& exports, | |
102 | double& have, | |
103 | set<CDir*>& already_exporting); | |
104 | ||
105 | double try_match(balance_state_t &state, | |
106 | mds_rank_t ex, double& maxex, | |
107 | mds_rank_t im, double& maxim); | |
108 | ||
109 | double get_maxim(balance_state_t &state, mds_rank_t im) { | |
110 | return target_load - mds_meta_load[im] - state.imported[im]; | |
111 | } | |
112 | double get_maxex(balance_state_t &state, mds_rank_t ex) { | |
113 | return mds_meta_load[ex] - target_load - state.exported[ex]; | |
114 | } | |
115 | ||
116 | /** | |
117 | * Try to rebalance. | |
118 | * | |
119 | * Check whether the monitor has recorded the current export targets; | |
120 | * if it has, do the actual export. Otherwise, send our | |
121 | * export targets message again. | |
122 | */ | |
123 | void try_rebalance(balance_state_t& state); | |
124 | ||
125 | MDSRank *mds; | |
126 | Messenger *messenger; | |
127 | MonClient *mon_client; | |
28e407b8 | 128 | int beat_epoch = 0; |
7c673cae | 129 | |
7c673cae FG |
130 | string bal_code; |
131 | string bal_version; | |
132 | ||
133 | utime_t last_heartbeat; | |
134 | utime_t last_sample; | |
135 | utime_t rebalance_time; // ensures a consistent view of load during a rebalance | |
136 | ||
28e407b8 AA |
137 | utime_t last_get_load; |
138 | uint64_t last_num_requests = 0; | |
91327a77 | 139 | uint64_t last_cpu_time = 0; |
28e407b8 | 140 | |
7c673cae FG |
141 | // Dirfrags that are marked to be passed on to MDCache::[split|merge]_dir |
142 | // as soon as a delayed context comes back and triggers it. | |
143 | // These sets only prevent us from spawning extra timer contexts for | |
144 | // dirfrags that already have one in flight. | |
145 | set<dirfrag_t> split_pending, merge_pending; | |
146 | ||
147 | // per-epoch scatter/gather info | |
148 | map<mds_rank_t, mds_load_t> mds_load; | |
149 | map<mds_rank_t, double> mds_meta_load; | |
150 | map<mds_rank_t, map<mds_rank_t, float> > mds_import_map; | |
28e407b8 | 151 | map<mds_rank_t, int> mds_last_epoch_under_map; |
7c673cae FG |
152 | |
153 | // per-epoch state | |
28e407b8 AA |
154 | double my_load = 0; |
155 | double target_load = 0; | |
7c673cae FG |
156 | }; |
157 | ||
158 | #endif |