]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/RecoveryQueue.cc
import ceph 16.2.6
[ceph.git] / ceph / src / mds / RecoveryQueue.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "CInode.h"
16#include "MDCache.h"
17#include "MDSRank.h"
18#include "Locker.h"
19#include "osdc/Filer.h"
20
21#include "RecoveryQueue.h"
22
7c673cae
FG
// Debug-log configuration for this file.  The prefix macro is expanded at
// each logging site, so a variable named `mds` must be visible there; every
// message is stamped with "mds.<nodeid> RecoveryQueue::<function> ".
23#define dout_context g_ceph_context
24#define dout_subsys ceph_subsys_mds
25#undef dout_prefix
26#define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
27
28class C_MDC_Recover : public MDSIOContextBase {
9f95a23c
TL
29public:
30 C_MDC_Recover(RecoveryQueue *rq_, CInode *i) :
31 MDSIOContextBase(false), rq(rq_), in(i) {
32 ceph_assert(rq != NULL);
33 }
34 void print(ostream& out) const override {
35 out << "file_recover(" << in->ino() << ")";
36 }
37
38 uint64_t size = 0;
39 utime_t mtime;
7c673cae 40protected:
7c673cae
FG
41 void finish(int r) override {
42 rq->_recovered(in, r, size, mtime);
43 }
44
45 MDSRank *get_mds() override {
46 return rq->mds;
47 }
48
9f95a23c
TL
49 RecoveryQueue *rq;
50 CInode *in;
7c673cae
FG
51};
52
b32b8144
FG
53RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
54 file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
55 file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
9f95a23c 56 mds(mds_), filer(mds_->objecter, mds_->finisher)
b32b8144 57{ }
7c673cae 58
7c673cae
FG
59/**
60 * Progress the queue. Call this after enqueuing something or on
61 * completion of something.
62 */
63void RecoveryQueue::advance()
64{
b32b8144
FG
65 dout(10) << file_recover_queue_size << " queued, "
66 << file_recover_queue_front_size << " prioritized, "
7c673cae
FG
67 << file_recovering.size() << " recovering" << dendl;
68
11fdf7f2 69 while (file_recovering.size() < g_conf()->mds_max_file_recover) {
7c673cae 70 if (!file_recover_queue_front.empty()) {
b32b8144
FG
71 CInode *in = file_recover_queue_front.front();
72 in->item_recover_queue_front.remove_myself();
73 file_recover_queue_front_size--;
7c673cae
FG
74 _start(in);
75 } else if (!file_recover_queue.empty()) {
b32b8144
FG
76 CInode *in = file_recover_queue.front();
77 in->item_recover_queue.remove_myself();
78 file_recover_queue_size--;
7c673cae
FG
79 _start(in);
80 } else {
81 break;
82 }
83 }
84
85 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
b32b8144
FG
86 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
87 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
88}
89
90void RecoveryQueue::_start(CInode *in)
91{
f67539c2 92 const auto& pi = in->get_projected_inode();
7c673cae
FG
93
94 // blech
95 if (pi->client_ranges.size() && !pi->get_max_size()) {
96 mds->clog->warn() << "bad client_range " << pi->client_ranges
97 << " on ino " << pi->ino;
98 }
99
b32b8144 100 auto p = file_recovering.find(in);
7c673cae 101 if (pi->client_ranges.size() && pi->get_max_size()) {
f67539c2 102 dout(10) << "starting " << pi->size << " " << pi->client_ranges
7c673cae 103 << " " << *in << dendl;
b32b8144
FG
104 if (p == file_recovering.end()) {
105 file_recovering.insert(make_pair(in, false));
7c673cae 106
b32b8144 107 C_MDC_Recover *fin = new C_MDC_Recover(this, in);
f67539c2
TL
108 auto layout = pi->layout;
109 filer.probe(in->ino(), &layout, in->last,
b32b8144
FG
110 pi->get_max_size(), &fin->size, &fin->mtime, false,
111 0, fin);
112 } else {
113 p->second = true;
114 dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
115 }
7c673cae 116 } else {
f67539c2 117 dout(10) << "skipping " << pi->size << " " << *in << dendl;
b32b8144
FG
118 if (p == file_recovering.end()) {
119 in->state_clear(CInode::STATE_RECOVERING);
120 mds->locker->eval(in, CEPH_LOCK_IFILE);
121 in->auth_unpin(this);
122 }
7c673cae
FG
123 }
124}
125
126void RecoveryQueue::prioritize(CInode *in)
127{
128 if (file_recovering.count(in)) {
129 dout(10) << "already working on " << *in << dendl;
130 return;
131 }
132
b32b8144 133 if (!in->item_recover_queue_front.is_on_list()) {
7c673cae 134 dout(20) << *in << dendl;
b32b8144 135
11fdf7f2 136 ceph_assert(in->item_recover_queue.is_on_list());
b32b8144
FG
137 in->item_recover_queue.remove_myself();
138 file_recover_queue_size--;
139
140 file_recover_queue_front.push_back(&in->item_recover_queue_front);
141
142 file_recover_queue_front_size++;
143 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
144 return;
145 }
146
147 dout(10) << "not queued " << *in << dendl;
148}
149
b32b8144
FG
150static bool _is_in_any_recover_queue(CInode *in)
151{
152 return in->item_recover_queue.is_on_list() ||
153 in->item_recover_queue_front.is_on_list();
154}
7c673cae
FG
155
156/**
157 * Given an authoritative inode which is in the cache,
158 * enqueue it for recovery.
159 */
160void RecoveryQueue::enqueue(CInode *in)
161{
162 dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
11fdf7f2
TL
163 ceph_assert(logger); // Caller should have done set_logger before using me
164 ceph_assert(in->is_auth());
7c673cae
FG
165
166 in->state_clear(CInode::STATE_NEEDSRECOVER);
167 if (!in->state_test(CInode::STATE_RECOVERING)) {
168 in->state_set(CInode::STATE_RECOVERING);
169 in->auth_pin(this);
170 logger->inc(l_mdc_recovery_started);
171 }
b32b8144
FG
172
173 if (!_is_in_any_recover_queue(in)) {
174 file_recover_queue.push_back(&in->item_recover_queue);
175 file_recover_queue_size++;
176 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
177 }
7c673cae
FG
178}
179
180
181/**
182 * Call back on completion of Filer probe on an inode.
183 */
184void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
185{
186 dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
187 << " for " << *in << dendl;
188
189 if (r != 0) {
190 dout(0) << "recovery error! " << r << dendl;
f67539c2 191 if (r == -CEPHFS_EBLOCKLISTED) {
7c673cae
FG
192 mds->respawn();
193 return;
194 } else {
195 // Something wrong on the OSD side trying to recover the size
196 // of this inode. In principle we could record this as a piece
197 // of per-inode damage, but it's actually more likely that
198 // this indicates something wrong with the MDS (like maybe
199 // it has the wrong auth caps?)
11fdf7f2
TL
200 mds->clog->error() << " OSD read error while recovering size"
201 " for inode " << in->ino();
7c673cae
FG
202 mds->damaged();
203 }
204 }
205
b32b8144 206 auto p = file_recovering.find(in);
11fdf7f2 207 ceph_assert(p != file_recovering.end());
b32b8144
FG
208 bool restart = p->second;
209 file_recovering.erase(p);
210
7c673cae
FG
211 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
212 logger->inc(l_mdc_recovery_completed);
213 in->state_clear(CInode::STATE_RECOVERING);
214
b32b8144
FG
215 if (restart) {
216 if (in->item_recover_queue.is_on_list()) {
217 in->item_recover_queue.remove_myself();
218 file_recover_queue_size--;
219 }
220 if (in->item_recover_queue_front.is_on_list()) {
221 in->item_recover_queue_front.remove_myself();
222 file_recover_queue_front_size--;
223 }
224 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
225 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
226 _start(in);
227 } else if (!_is_in_any_recover_queue(in)) {
7c673cae
FG
228 // journal
229 mds->locker->check_inode_max_size(in, true, 0, size, mtime);
230 mds->locker->eval(in, CEPH_LOCK_IFILE);
231 in->auth_unpin(this);
232 }
233
234 advance();
235}
236