]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/RecoveryQueue.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mds / RecoveryQueue.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "CInode.h"
16 #include "MDCache.h"
17 #include "MDSRank.h"
18 #include "Locker.h"
19 #include "osdc/Filer.h"
20
21 #include "RecoveryQueue.h"
22
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_mds
25 #undef dout_prefix
26 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
27
28 using namespace std;
29
30 class C_MDC_Recover : public MDSIOContextBase {
31 public:
32 C_MDC_Recover(RecoveryQueue *rq_, CInode *i) :
33 MDSIOContextBase(false), rq(rq_), in(i) {
34 ceph_assert(rq != NULL);
35 }
36 void print(ostream& out) const override {
37 out << "file_recover(" << in->ino() << ")";
38 }
39
40 uint64_t size = 0;
41 utime_t mtime;
42 protected:
43 void finish(int r) override {
44 rq->_recovered(in, r, size, mtime);
45 }
46
47 MDSRank *get_mds() override {
48 return rq->mds;
49 }
50
51 RecoveryQueue *rq;
52 CInode *in;
53 };
54
55 RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
56 file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
57 file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
58 mds(mds_), filer(mds_->objecter, mds_->finisher)
59 { }
60
61 /**
62 * Progress the queue. Call this after enqueuing something or on
63 * completion of something.
64 */
65 void RecoveryQueue::advance()
66 {
67 dout(10) << file_recover_queue_size << " queued, "
68 << file_recover_queue_front_size << " prioritized, "
69 << file_recovering.size() << " recovering" << dendl;
70
71 while (file_recovering.size() < g_conf()->mds_max_file_recover) {
72 if (!file_recover_queue_front.empty()) {
73 CInode *in = file_recover_queue_front.front();
74 in->item_recover_queue_front.remove_myself();
75 file_recover_queue_front_size--;
76 _start(in);
77 } else if (!file_recover_queue.empty()) {
78 CInode *in = file_recover_queue.front();
79 in->item_recover_queue.remove_myself();
80 file_recover_queue_size--;
81 _start(in);
82 } else {
83 break;
84 }
85 }
86
87 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
88 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
89 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
90 }
91
92 void RecoveryQueue::_start(CInode *in)
93 {
94 const auto& pi = in->get_projected_inode();
95
96 // blech
97 if (pi->client_ranges.size() && !pi->get_max_size()) {
98 mds->clog->warn() << "bad client_range " << pi->client_ranges
99 << " on ino " << pi->ino;
100 }
101
102 auto p = file_recovering.find(in);
103 if (pi->client_ranges.size() && pi->get_max_size()) {
104 dout(10) << "starting " << pi->size << " " << pi->client_ranges
105 << " " << *in << dendl;
106 if (p == file_recovering.end()) {
107 file_recovering.insert(make_pair(in, false));
108
109 C_MDC_Recover *fin = new C_MDC_Recover(this, in);
110 auto layout = pi->layout;
111 filer.probe(in->ino(), &layout, in->last,
112 pi->get_max_size(), &fin->size, &fin->mtime, false,
113 0, fin);
114 } else {
115 p->second = true;
116 dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
117 }
118 } else {
119 dout(10) << "skipping " << pi->size << " " << *in << dendl;
120 if (p == file_recovering.end()) {
121 in->state_clear(CInode::STATE_RECOVERING);
122 mds->locker->eval(in, CEPH_LOCK_IFILE);
123 in->auth_unpin(this);
124 }
125 }
126 }
127
128 void RecoveryQueue::prioritize(CInode *in)
129 {
130 if (file_recovering.count(in)) {
131 dout(10) << "already working on " << *in << dendl;
132 return;
133 }
134
135 if (!in->item_recover_queue_front.is_on_list()) {
136 dout(20) << *in << dendl;
137
138 ceph_assert(in->item_recover_queue.is_on_list());
139 in->item_recover_queue.remove_myself();
140 file_recover_queue_size--;
141
142 file_recover_queue_front.push_back(&in->item_recover_queue_front);
143
144 file_recover_queue_front_size++;
145 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
146 return;
147 }
148
149 dout(10) << "not queued " << *in << dendl;
150 }
151
152 static bool _is_in_any_recover_queue(CInode *in)
153 {
154 return in->item_recover_queue.is_on_list() ||
155 in->item_recover_queue_front.is_on_list();
156 }
157
158 /**
159 * Given an authoritative inode which is in the cache,
160 * enqueue it for recovery.
161 */
162 void RecoveryQueue::enqueue(CInode *in)
163 {
164 dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
165 ceph_assert(logger); // Caller should have done set_logger before using me
166 ceph_assert(in->is_auth());
167
168 in->state_clear(CInode::STATE_NEEDSRECOVER);
169 if (!in->state_test(CInode::STATE_RECOVERING)) {
170 in->state_set(CInode::STATE_RECOVERING);
171 in->auth_pin(this);
172 logger->inc(l_mdc_recovery_started);
173 }
174
175 if (!_is_in_any_recover_queue(in)) {
176 file_recover_queue.push_back(&in->item_recover_queue);
177 file_recover_queue_size++;
178 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
179 }
180 }
181
182
183 /**
184 * Call back on completion of Filer probe on an inode.
185 */
186 void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
187 {
188 dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
189 << " for " << *in << dendl;
190
191 if (r != 0) {
192 dout(0) << "recovery error! " << r << dendl;
193 if (r == -CEPHFS_EBLOCKLISTED) {
194 mds->respawn();
195 return;
196 } else {
197 // Something wrong on the OSD side trying to recover the size
198 // of this inode. In principle we could record this as a piece
199 // of per-inode damage, but it's actually more likely that
200 // this indicates something wrong with the MDS (like maybe
201 // it has the wrong auth caps?)
202 mds->clog->error() << " OSD read error while recovering size"
203 " for inode " << in->ino();
204 mds->damaged();
205 }
206 }
207
208 auto p = file_recovering.find(in);
209 ceph_assert(p != file_recovering.end());
210 bool restart = p->second;
211 file_recovering.erase(p);
212
213 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
214 logger->inc(l_mdc_recovery_completed);
215 in->state_clear(CInode::STATE_RECOVERING);
216
217 if (restart) {
218 if (in->item_recover_queue.is_on_list()) {
219 in->item_recover_queue.remove_myself();
220 file_recover_queue_size--;
221 }
222 if (in->item_recover_queue_front.is_on_list()) {
223 in->item_recover_queue_front.remove_myself();
224 file_recover_queue_front_size--;
225 }
226 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
227 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
228 _start(in);
229 } else if (!_is_in_any_recover_queue(in)) {
230 // journal
231 mds->locker->check_inode_max_size(in, true, 0, size, mtime);
232 mds->locker->eval(in, CEPH_LOCK_IFILE);
233 in->auth_unpin(this);
234 }
235
236 advance();
237 }
238