]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | ||
16 | ||
17 | #ifndef CEPH_MDBALANCER_H | |
18 | #define CEPH_MDBALANCER_H | |
19 | ||
20 | #include <list> | |
21 | #include <map> | |
22 | using std::list; | |
23 | using std::map; | |
24 | ||
25 | #include "include/types.h" | |
26 | #include "common/Clock.h" | |
27 | #include "common/Cond.h" | |
28 | ||
29 | class MDSRank; | |
30 | class Message; | |
31 | class MHeartbeat; | |
32 | class CInode; | |
33 | class CDir; | |
34 | class Messenger; | |
35 | class MonClient; | |
36 | ||
37 | class MDBalancer { | |
38 | friend class C_Bal_SendHeartbeat; | |
39 | public: | |
40 | MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) : | |
41 | mds(m), | |
42 | messenger(msgr), | |
43 | mon_client(monc), | |
44 | beat_epoch(0), | |
224ce89b | 45 | last_epoch_under(0), my_load(0.0), target_load(0.0) |
7c673cae FG |
46 | { } |
47 | ||
48 | mds_load_t get_load(utime_t); | |
49 | ||
50 | int proc_message(Message *m); | |
51 | ||
52 | /** | |
53 | * Regularly called upkeep function. | |
54 | * | |
55 | * Sends MHeartbeat messages to the mons. | |
56 | */ | |
57 | void tick(); | |
58 | ||
59 | void subtract_export(CDir *ex, utime_t now); | |
60 | void add_import(CDir *im, utime_t now); | |
61 | ||
62 | void hit_inode(utime_t now, CInode *in, int type, int who=-1); | |
63 | void hit_dir(utime_t now, CDir *dir, int type, int who=-1, double amount=1.0); | |
64 | ||
65 | void queue_split(const CDir *dir, bool fast); | |
66 | void queue_merge(CDir *dir); | |
67 | ||
68 | /** | |
69 | * Based on size and configuration, decide whether to issue a queue_split | |
70 | * or queue_merge for this CDir. | |
71 | * | |
72 | * \param hot whether the directory's temperature is enough to split it | |
73 | */ | |
74 | void maybe_fragment(CDir *dir, bool hot); | |
75 | ||
224ce89b WB |
76 | void handle_mds_failure(mds_rank_t who); |
77 | ||
7c673cae FG |
78 | private: |
79 | typedef struct { | |
80 | std::map<mds_rank_t, double> targets; | |
81 | std::map<mds_rank_t, double> imported; | |
82 | std::map<mds_rank_t, double> exported; | |
83 | } balance_state_t; | |
84 | ||
85 | //set up the rebalancing targets for export and do one if the | |
86 | //MDSMap is up to date | |
87 | void prep_rebalance(int beat); | |
88 | int mantle_prep_rebalance(); | |
89 | ||
90 | void handle_export_pins(void); | |
91 | ||
92 | void export_empties(); | |
93 | int localize_balancer(); | |
7c673cae FG |
94 | void send_heartbeat(); |
95 | void handle_heartbeat(MHeartbeat *m); | |
96 | void find_exports(CDir *dir, | |
97 | double amount, | |
98 | list<CDir*>& exports, | |
99 | double& have, | |
100 | set<CDir*>& already_exporting); | |
101 | ||
102 | double try_match(balance_state_t &state, | |
103 | mds_rank_t ex, double& maxex, | |
104 | mds_rank_t im, double& maxim); | |
105 | ||
106 | double get_maxim(balance_state_t &state, mds_rank_t im) { | |
107 | return target_load - mds_meta_load[im] - state.imported[im]; | |
108 | } | |
109 | double get_maxex(balance_state_t &state, mds_rank_t ex) { | |
110 | return mds_meta_load[ex] - target_load - state.exported[ex]; | |
111 | } | |
112 | ||
113 | /** | |
114 | * Try to rebalance. | |
115 | * | |
116 | * Check if the monitor has recorded the current export targets; | |
117 | * if it has then do the actual export. Otherwise send off our | |
118 | * export targets message again. | |
119 | */ | |
120 | void try_rebalance(balance_state_t& state); | |
121 | ||
122 | MDSRank *mds; | |
123 | Messenger *messenger; | |
124 | MonClient *mon_client; | |
125 | int beat_epoch; | |
126 | ||
127 | int last_epoch_under; | |
7c673cae FG |
128 | string bal_code; |
129 | string bal_version; | |
130 | ||
131 | utime_t last_heartbeat; | |
132 | utime_t last_sample; | |
133 | utime_t rebalance_time; //ensure a consistent view of load for rebalance | |
134 | ||
135 | // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir | |
136 | // just as soon as a delayed context comes back and triggers it. | |
137 | // These sets just prevent us from spawning extra timer contexts for | |
138 | // dirfrags that already have one in flight. | |
139 | set<dirfrag_t> split_pending, merge_pending; | |
140 | ||
141 | // per-epoch scatter/gathered info | |
142 | map<mds_rank_t, mds_load_t> mds_load; | |
143 | map<mds_rank_t, double> mds_meta_load; | |
144 | map<mds_rank_t, map<mds_rank_t, float> > mds_import_map; | |
145 | ||
146 | // per-epoch state | |
147 | double my_load, target_load; | |
148 | }; | |
149 | ||
150 | #endif |