]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/RecoveryQueue.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mds / RecoveryQueue.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "CInode.h"
16#include "MDCache.h"
17#include "MDSRank.h"
18#include "Locker.h"
19#include "osdc/Filer.h"
20
21#include "RecoveryQueue.h"
22
7c673cae
FG
23#define dout_context g_ceph_context
24#define dout_subsys ceph_subsys_mds
25#undef dout_prefix
26#define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
27
20effc67
TL
28using namespace std;
29
7c673cae 30class C_MDC_Recover : public MDSIOContextBase {
9f95a23c
TL
31public:
32 C_MDC_Recover(RecoveryQueue *rq_, CInode *i) :
33 MDSIOContextBase(false), rq(rq_), in(i) {
34 ceph_assert(rq != NULL);
35 }
36 void print(ostream& out) const override {
37 out << "file_recover(" << in->ino() << ")";
38 }
39
40 uint64_t size = 0;
41 utime_t mtime;
7c673cae 42protected:
7c673cae
FG
43 void finish(int r) override {
44 rq->_recovered(in, r, size, mtime);
45 }
46
47 MDSRank *get_mds() override {
48 return rq->mds;
49 }
50
9f95a23c
TL
51 RecoveryQueue *rq;
52 CInode *in;
7c673cae
FG
53};
54
b32b8144
FG
55RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
56 file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
57 file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
9f95a23c 58 mds(mds_), filer(mds_->objecter, mds_->finisher)
b32b8144 59{ }
7c673cae 60
7c673cae
FG
61/**
62 * Progress the queue. Call this after enqueuing something or on
63 * completion of something.
64 */
65void RecoveryQueue::advance()
66{
b32b8144
FG
67 dout(10) << file_recover_queue_size << " queued, "
68 << file_recover_queue_front_size << " prioritized, "
7c673cae
FG
69 << file_recovering.size() << " recovering" << dendl;
70
11fdf7f2 71 while (file_recovering.size() < g_conf()->mds_max_file_recover) {
7c673cae 72 if (!file_recover_queue_front.empty()) {
b32b8144
FG
73 CInode *in = file_recover_queue_front.front();
74 in->item_recover_queue_front.remove_myself();
75 file_recover_queue_front_size--;
7c673cae
FG
76 _start(in);
77 } else if (!file_recover_queue.empty()) {
b32b8144
FG
78 CInode *in = file_recover_queue.front();
79 in->item_recover_queue.remove_myself();
80 file_recover_queue_size--;
7c673cae
FG
81 _start(in);
82 } else {
83 break;
84 }
85 }
86
87 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
b32b8144
FG
88 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
89 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
90}
91
92void RecoveryQueue::_start(CInode *in)
93{
f67539c2 94 const auto& pi = in->get_projected_inode();
7c673cae
FG
95
96 // blech
97 if (pi->client_ranges.size() && !pi->get_max_size()) {
98 mds->clog->warn() << "bad client_range " << pi->client_ranges
99 << " on ino " << pi->ino;
100 }
101
b32b8144 102 auto p = file_recovering.find(in);
7c673cae 103 if (pi->client_ranges.size() && pi->get_max_size()) {
f67539c2 104 dout(10) << "starting " << pi->size << " " << pi->client_ranges
7c673cae 105 << " " << *in << dendl;
b32b8144
FG
106 if (p == file_recovering.end()) {
107 file_recovering.insert(make_pair(in, false));
7c673cae 108
b32b8144 109 C_MDC_Recover *fin = new C_MDC_Recover(this, in);
f67539c2
TL
110 auto layout = pi->layout;
111 filer.probe(in->ino(), &layout, in->last,
b32b8144
FG
112 pi->get_max_size(), &fin->size, &fin->mtime, false,
113 0, fin);
114 } else {
115 p->second = true;
116 dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
117 }
7c673cae 118 } else {
f67539c2 119 dout(10) << "skipping " << pi->size << " " << *in << dendl;
b32b8144
FG
120 if (p == file_recovering.end()) {
121 in->state_clear(CInode::STATE_RECOVERING);
122 mds->locker->eval(in, CEPH_LOCK_IFILE);
123 in->auth_unpin(this);
124 }
7c673cae
FG
125 }
126}
127
128void RecoveryQueue::prioritize(CInode *in)
129{
130 if (file_recovering.count(in)) {
131 dout(10) << "already working on " << *in << dendl;
132 return;
133 }
134
b32b8144 135 if (!in->item_recover_queue_front.is_on_list()) {
7c673cae 136 dout(20) << *in << dendl;
b32b8144 137
11fdf7f2 138 ceph_assert(in->item_recover_queue.is_on_list());
b32b8144
FG
139 in->item_recover_queue.remove_myself();
140 file_recover_queue_size--;
141
142 file_recover_queue_front.push_back(&in->item_recover_queue_front);
143
144 file_recover_queue_front_size++;
145 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
146 return;
147 }
148
149 dout(10) << "not queued " << *in << dendl;
150}
151
b32b8144
FG
152static bool _is_in_any_recover_queue(CInode *in)
153{
154 return in->item_recover_queue.is_on_list() ||
155 in->item_recover_queue_front.is_on_list();
156}
7c673cae
FG
157
158/**
159 * Given an authoritative inode which is in the cache,
160 * enqueue it for recovery.
161 */
162void RecoveryQueue::enqueue(CInode *in)
163{
164 dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
11fdf7f2
TL
165 ceph_assert(logger); // Caller should have done set_logger before using me
166 ceph_assert(in->is_auth());
7c673cae
FG
167
168 in->state_clear(CInode::STATE_NEEDSRECOVER);
169 if (!in->state_test(CInode::STATE_RECOVERING)) {
170 in->state_set(CInode::STATE_RECOVERING);
171 in->auth_pin(this);
172 logger->inc(l_mdc_recovery_started);
173 }
b32b8144
FG
174
175 if (!_is_in_any_recover_queue(in)) {
176 file_recover_queue.push_back(&in->item_recover_queue);
177 file_recover_queue_size++;
178 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
179 }
7c673cae
FG
180}
181
182
183/**
184 * Call back on completion of Filer probe on an inode.
185 */
186void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
187{
188 dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
189 << " for " << *in << dendl;
190
191 if (r != 0) {
192 dout(0) << "recovery error! " << r << dendl;
f67539c2 193 if (r == -CEPHFS_EBLOCKLISTED) {
7c673cae
FG
194 mds->respawn();
195 return;
196 } else {
197 // Something wrong on the OSD side trying to recover the size
198 // of this inode. In principle we could record this as a piece
199 // of per-inode damage, but it's actually more likely that
200 // this indicates something wrong with the MDS (like maybe
201 // it has the wrong auth caps?)
11fdf7f2
TL
202 mds->clog->error() << " OSD read error while recovering size"
203 " for inode " << in->ino();
7c673cae
FG
204 mds->damaged();
205 }
206 }
207
b32b8144 208 auto p = file_recovering.find(in);
11fdf7f2 209 ceph_assert(p != file_recovering.end());
b32b8144
FG
210 bool restart = p->second;
211 file_recovering.erase(p);
212
7c673cae
FG
213 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
214 logger->inc(l_mdc_recovery_completed);
215 in->state_clear(CInode::STATE_RECOVERING);
216
b32b8144
FG
217 if (restart) {
218 if (in->item_recover_queue.is_on_list()) {
219 in->item_recover_queue.remove_myself();
220 file_recover_queue_size--;
221 }
222 if (in->item_recover_queue_front.is_on_list()) {
223 in->item_recover_queue_front.remove_myself();
224 file_recover_queue_front_size--;
225 }
226 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
227 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
228 _start(in);
229 } else if (!_is_in_any_recover_queue(in)) {
7c673cae
FG
230 // journal
231 mds->locker->check_inode_max_size(in, true, 0, size, mtime);
232 mds->locker->eval(in, CEPH_LOCK_IFILE);
233 in->auth_unpin(this);
234 }
235
236 advance();
237}
238