]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDBalancer.h
update sources to 12.2.7
[ceph.git] / ceph / src / mds / MDBalancer.h
CommitLineData
7c673cae
FG
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
14
15
16
17#ifndef CEPH_MDBALANCER_H
18#define CEPH_MDBALANCER_H
19
#include <list>
#include <map>
#include <set>
#include <string>
using std::list;
using std::map;

#include "include/types.h"
#include "common/Clock.h"
#include "common/Cond.h"
28
// Forward declarations. MDBalancer only refers to these types through
// pointers (or containers of pointers), so incomplete types are sufficient
// in this header.
class MDSRank;
class Message;
class MHeartbeat;
class CInode;
class CDir;
class Messenger;
class MonClient;
36
37class MDBalancer {
38 friend class C_Bal_SendHeartbeat;
39public:
40 MDBalancer(MDSRank *m, Messenger *msgr, MonClient *monc) :
28e407b8 41 mds(m), messenger(msgr), mon_client(monc) { }
7c673cae
FG
42
43 int proc_message(Message *m);
44
45 /**
46 * Regularly called upkeep function.
47 *
48 * Sends MHeartbeat messages to the mons.
49 */
50 void tick();
51
52 void subtract_export(CDir *ex, utime_t now);
53 void add_import(CDir *im, utime_t now);
28e407b8 54 void adjust_pop_for_rename(CDir *pdir, CDir *dir, utime_t now, bool inc);
7c673cae 55
28e407b8
AA
56 void hit_inode(const utime_t& now, CInode *in, int type, int who=-1);
57 void hit_dir(const utime_t& now, CDir *dir, int type, int who=-1, double amount=1.0);
7c673cae
FG
58
59 void queue_split(const CDir *dir, bool fast);
60 void queue_merge(CDir *dir);
61
62 /**
63 * Based on size and configuration, decide whether to issue a queue_split
64 * or queue_merge for this CDir.
65 *
66 * \param hot whether the directory's temperature is enough to split it
67 */
68 void maybe_fragment(CDir *dir, bool hot);
69
224ce89b
WB
70 void handle_mds_failure(mds_rank_t who);
71
28e407b8
AA
72 int dump_loads(Formatter *f);
73
7c673cae
FG
74private:
75 typedef struct {
76 std::map<mds_rank_t, double> targets;
77 std::map<mds_rank_t, double> imported;
78 std::map<mds_rank_t, double> exported;
79 } balance_state_t;
80
81 //set up the rebalancing targets for export and do one if the
82 //MDSMap is up to date
83 void prep_rebalance(int beat);
84 int mantle_prep_rebalance();
85
86 void handle_export_pins(void);
87
28e407b8 88 mds_load_t get_load(utime_t now);
7c673cae 89 int localize_balancer();
7c673cae
FG
90 void send_heartbeat();
91 void handle_heartbeat(MHeartbeat *m);
92 void find_exports(CDir *dir,
93 double amount,
94 list<CDir*>& exports,
95 double& have,
96 set<CDir*>& already_exporting);
97
98 double try_match(balance_state_t &state,
99 mds_rank_t ex, double& maxex,
100 mds_rank_t im, double& maxim);
101
102 double get_maxim(balance_state_t &state, mds_rank_t im) {
103 return target_load - mds_meta_load[im] - state.imported[im];
104 }
105 double get_maxex(balance_state_t &state, mds_rank_t ex) {
106 return mds_meta_load[ex] - target_load - state.exported[ex];
107 }
108
109 /**
110 * Try to rebalance.
111 *
112 * Check if the monitor has recorded the current export targets;
113 * if it has then do the actual export. Otherwise send off our
114 * export targets message again.
115 */
116 void try_rebalance(balance_state_t& state);
117
118 MDSRank *mds;
119 Messenger *messenger;
120 MonClient *mon_client;
28e407b8 121 int beat_epoch = 0;
7c673cae 122
7c673cae
FG
123 string bal_code;
124 string bal_version;
125
126 utime_t last_heartbeat;
127 utime_t last_sample;
128 utime_t rebalance_time; //ensure a consistent view of load for rebalance
129
28e407b8
AA
130 utime_t last_get_load;
131 uint64_t last_num_requests = 0;
132
7c673cae
FG
133 // Dirfrags which are marked to be passed on to MDCache::[split|merge]_dir
134 // just as soon as a delayed context comes back and triggers it.
135 // These sets just prevent us from spawning extra timer contexts for
136 // dirfrags that already have one in flight.
137 set<dirfrag_t> split_pending, merge_pending;
138
139 // per-epoch scatter/gathered info
140 map<mds_rank_t, mds_load_t> mds_load;
141 map<mds_rank_t, double> mds_meta_load;
142 map<mds_rank_t, map<mds_rank_t, float> > mds_import_map;
28e407b8 143 map<mds_rank_t, int> mds_last_epoch_under_map;
7c673cae
FG
144
145 // per-epoch state
28e407b8
AA
146 double my_load = 0;
147 double target_load = 0;
7c673cae
FG
148};
149
150#endif