git.proxmox.com Git - ceph.git/blob - ceph/src/mds/RecoveryQueue.cc
update sources to 12.2.10
[ceph.git] / ceph / src / mds / RecoveryQueue.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "CInode.h"
16 #include "MDCache.h"
17 #include "MDSRank.h"
18 #include "Locker.h"
19 #include "osdc/Filer.h"
20
21 #include "RecoveryQueue.h"
22
// Logging configuration for the dout() calls in this file.  dout_context and
// dout_subsys select g_ceph_context and ceph_subsys_mds (presumably the
// context and subsystem consumed by the dout machinery, which is defined
// outside this file -- confirm there).  dout_prefix makes every message start
// with "mds.<nodeid> RecoveryQueue::<function> "; it expands `mds` at the
// point of use, so an `mds` must be in scope wherever dout() is used.
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_mds
25 #undef dout_prefix
26 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
27
28 class C_MDC_Recover : public MDSIOContextBase {
29 protected:
30 RecoveryQueue *rq;
31 CInode *in;
32 void finish(int r) override {
33 rq->_recovered(in, r, size, mtime);
34 }
35
36 MDSRank *get_mds() override {
37 return rq->mds;
38 }
39
40 public:
41 uint64_t size;
42 utime_t mtime;
43
44 C_MDC_Recover(RecoveryQueue *rq_, CInode *i) :
45 MDSIOContextBase(false), rq(rq_), in(i), size(0) {
46 assert(rq != NULL);
47 }
48 void print(ostream& out) const override {
49 out << "file_recover(" << in->ino() << ")";
50 }
51 };
52
53
54 RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
55 file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
56 file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
57 mds(mds_), logger(NULL), filer(mds_->objecter, mds_->finisher)
58 { }
59
60
61 /**
62 * Progress the queue. Call this after enqueuing something or on
63 * completion of something.
64 */
65 void RecoveryQueue::advance()
66 {
67 dout(10) << file_recover_queue_size << " queued, "
68 << file_recover_queue_front_size << " prioritized, "
69 << file_recovering.size() << " recovering" << dendl;
70
71 while (file_recovering.size() < g_conf->mds_max_file_recover) {
72 if (!file_recover_queue_front.empty()) {
73 CInode *in = file_recover_queue_front.front();
74 in->item_recover_queue_front.remove_myself();
75 file_recover_queue_front_size--;
76 _start(in);
77 } else if (!file_recover_queue.empty()) {
78 CInode *in = file_recover_queue.front();
79 in->item_recover_queue.remove_myself();
80 file_recover_queue_size--;
81 _start(in);
82 } else {
83 break;
84 }
85 }
86
87 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
88 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
89 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
90 }
91
92 void RecoveryQueue::_start(CInode *in)
93 {
94 auto pi = in->get_projected_inode();
95
96 // blech
97 if (pi->client_ranges.size() && !pi->get_max_size()) {
98 mds->clog->warn() << "bad client_range " << pi->client_ranges
99 << " on ino " << pi->ino;
100 }
101
102 auto p = file_recovering.find(in);
103 if (pi->client_ranges.size() && pi->get_max_size()) {
104 dout(10) << "starting " << in->inode.size << " " << pi->client_ranges
105 << " " << *in << dendl;
106 if (p == file_recovering.end()) {
107 file_recovering.insert(make_pair(in, false));
108
109 C_MDC_Recover *fin = new C_MDC_Recover(this, in);
110 filer.probe(in->inode.ino, &in->inode.layout, in->last,
111 pi->get_max_size(), &fin->size, &fin->mtime, false,
112 0, fin);
113 } else {
114 p->second = true;
115 dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
116 }
117 } else {
118 dout(10) << "skipping " << in->inode.size << " " << *in << dendl;
119 if (p == file_recovering.end()) {
120 in->state_clear(CInode::STATE_RECOVERING);
121 mds->locker->eval(in, CEPH_LOCK_IFILE);
122 in->auth_unpin(this);
123 }
124 }
125 }
126
127 void RecoveryQueue::prioritize(CInode *in)
128 {
129 if (file_recovering.count(in)) {
130 dout(10) << "already working on " << *in << dendl;
131 return;
132 }
133
134 if (!in->item_recover_queue_front.is_on_list()) {
135 dout(20) << *in << dendl;
136
137 assert(in->item_recover_queue.is_on_list());
138 in->item_recover_queue.remove_myself();
139 file_recover_queue_size--;
140
141 file_recover_queue_front.push_back(&in->item_recover_queue_front);
142
143 file_recover_queue_front_size++;
144 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
145 return;
146 }
147
148 dout(10) << "not queued " << *in << dendl;
149 }
150
151 static bool _is_in_any_recover_queue(CInode *in)
152 {
153 return in->item_recover_queue.is_on_list() ||
154 in->item_recover_queue_front.is_on_list();
155 }
156
157 /**
158 * Given an authoritative inode which is in the cache,
159 * enqueue it for recovery.
160 */
161 void RecoveryQueue::enqueue(CInode *in)
162 {
163 dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
164 assert(logger); // Caller should have done set_logger before using me
165 assert(in->is_auth());
166
167 in->state_clear(CInode::STATE_NEEDSRECOVER);
168 if (!in->state_test(CInode::STATE_RECOVERING)) {
169 in->state_set(CInode::STATE_RECOVERING);
170 in->auth_pin(this);
171 logger->inc(l_mdc_recovery_started);
172 }
173
174 if (!_is_in_any_recover_queue(in)) {
175 file_recover_queue.push_back(&in->item_recover_queue);
176 file_recover_queue_size++;
177 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
178 }
179 }
180
181
182 /**
183 * Call back on completion of Filer probe on an inode.
184 */
185 void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
186 {
187 dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
188 << " for " << *in << dendl;
189
190 if (r != 0) {
191 dout(0) << "recovery error! " << r << dendl;
192 if (r == -EBLACKLISTED) {
193 mds->respawn();
194 return;
195 } else {
196 // Something wrong on the OSD side trying to recover the size
197 // of this inode. In principle we could record this as a piece
198 // of per-inode damage, but it's actually more likely that
199 // this indicates something wrong with the MDS (like maybe
200 // it has the wrong auth caps?)
201 mds->clog->error() << " OSD read error while recovering size for inode 0x"
202 << std::hex << in->ino() << std::dec;
203 mds->damaged();
204 }
205 }
206
207 auto p = file_recovering.find(in);
208 assert(p != file_recovering.end());
209 bool restart = p->second;
210 file_recovering.erase(p);
211
212 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
213 logger->inc(l_mdc_recovery_completed);
214 in->state_clear(CInode::STATE_RECOVERING);
215
216 if (restart) {
217 if (in->item_recover_queue.is_on_list()) {
218 in->item_recover_queue.remove_myself();
219 file_recover_queue_size--;
220 }
221 if (in->item_recover_queue_front.is_on_list()) {
222 in->item_recover_queue_front.remove_myself();
223 file_recover_queue_front_size--;
224 }
225 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
226 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
227 _start(in);
228 } else if (!_is_in_any_recover_queue(in)) {
229 // journal
230 mds->locker->check_inode_max_size(in, true, 0, size, mtime);
231 mds->locker->eval(in, CEPH_LOCK_IFILE);
232 in->auth_unpin(this);
233 }
234
235 advance();
236 }
237