]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/RecoveryQueue.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / mds / RecoveryQueue.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "CInode.h"
16#include "MDCache.h"
17#include "MDSRank.h"
18#include "Locker.h"
19#include "osdc/Filer.h"
20
21#include "RecoveryQueue.h"
22
7c673cae
FG
23#define dout_context g_ceph_context
24#define dout_subsys ceph_subsys_mds
25#undef dout_prefix
26#define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " "
27
28class C_MDC_Recover : public MDSIOContextBase {
9f95a23c
TL
29public:
30 C_MDC_Recover(RecoveryQueue *rq_, CInode *i) :
31 MDSIOContextBase(false), rq(rq_), in(i) {
32 ceph_assert(rq != NULL);
33 }
34 void print(ostream& out) const override {
35 out << "file_recover(" << in->ino() << ")";
36 }
37
38 uint64_t size = 0;
39 utime_t mtime;
7c673cae 40protected:
7c673cae
FG
41 void finish(int r) override {
42 rq->_recovered(in, r, size, mtime);
43 }
44
45 MDSRank *get_mds() override {
46 return rq->mds;
47 }
48
9f95a23c
TL
49 RecoveryQueue *rq;
50 CInode *in;
7c673cae
FG
51};
52
b32b8144
FG
53RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
54 file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
55 file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
9f95a23c 56 mds(mds_), filer(mds_->objecter, mds_->finisher)
b32b8144 57{ }
7c673cae 58
7c673cae
FG
59/**
60 * Progress the queue. Call this after enqueuing something or on
61 * completion of something.
62 */
63void RecoveryQueue::advance()
64{
b32b8144
FG
65 dout(10) << file_recover_queue_size << " queued, "
66 << file_recover_queue_front_size << " prioritized, "
7c673cae
FG
67 << file_recovering.size() << " recovering" << dendl;
68
11fdf7f2 69 while (file_recovering.size() < g_conf()->mds_max_file_recover) {
7c673cae 70 if (!file_recover_queue_front.empty()) {
b32b8144
FG
71 CInode *in = file_recover_queue_front.front();
72 in->item_recover_queue_front.remove_myself();
73 file_recover_queue_front_size--;
7c673cae
FG
74 _start(in);
75 } else if (!file_recover_queue.empty()) {
b32b8144
FG
76 CInode *in = file_recover_queue.front();
77 in->item_recover_queue.remove_myself();
78 file_recover_queue_size--;
7c673cae
FG
79 _start(in);
80 } else {
81 break;
82 }
83 }
84
85 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
b32b8144
FG
86 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
87 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
88}
89
90void RecoveryQueue::_start(CInode *in)
91{
94b18763 92 auto pi = in->get_projected_inode();
7c673cae
FG
93
94 // blech
95 if (pi->client_ranges.size() && !pi->get_max_size()) {
96 mds->clog->warn() << "bad client_range " << pi->client_ranges
97 << " on ino " << pi->ino;
98 }
99
b32b8144 100 auto p = file_recovering.find(in);
7c673cae
FG
101 if (pi->client_ranges.size() && pi->get_max_size()) {
102 dout(10) << "starting " << in->inode.size << " " << pi->client_ranges
103 << " " << *in << dendl;
b32b8144
FG
104 if (p == file_recovering.end()) {
105 file_recovering.insert(make_pair(in, false));
7c673cae 106
b32b8144
FG
107 C_MDC_Recover *fin = new C_MDC_Recover(this, in);
108 filer.probe(in->inode.ino, &in->inode.layout, in->last,
109 pi->get_max_size(), &fin->size, &fin->mtime, false,
110 0, fin);
111 } else {
112 p->second = true;
113 dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
114 }
7c673cae
FG
115 } else {
116 dout(10) << "skipping " << in->inode.size << " " << *in << dendl;
b32b8144
FG
117 if (p == file_recovering.end()) {
118 in->state_clear(CInode::STATE_RECOVERING);
119 mds->locker->eval(in, CEPH_LOCK_IFILE);
120 in->auth_unpin(this);
121 }
7c673cae
FG
122 }
123}
124
125void RecoveryQueue::prioritize(CInode *in)
126{
127 if (file_recovering.count(in)) {
128 dout(10) << "already working on " << *in << dendl;
129 return;
130 }
131
b32b8144 132 if (!in->item_recover_queue_front.is_on_list()) {
7c673cae 133 dout(20) << *in << dendl;
b32b8144 134
11fdf7f2 135 ceph_assert(in->item_recover_queue.is_on_list());
b32b8144
FG
136 in->item_recover_queue.remove_myself();
137 file_recover_queue_size--;
138
139 file_recover_queue_front.push_back(&in->item_recover_queue_front);
140
141 file_recover_queue_front_size++;
142 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
143 return;
144 }
145
146 dout(10) << "not queued " << *in << dendl;
147}
148
b32b8144
FG
149static bool _is_in_any_recover_queue(CInode *in)
150{
151 return in->item_recover_queue.is_on_list() ||
152 in->item_recover_queue_front.is_on_list();
153}
7c673cae
FG
154
155/**
156 * Given an authoritative inode which is in the cache,
157 * enqueue it for recovery.
158 */
159void RecoveryQueue::enqueue(CInode *in)
160{
161 dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
11fdf7f2
TL
162 ceph_assert(logger); // Caller should have done set_logger before using me
163 ceph_assert(in->is_auth());
7c673cae
FG
164
165 in->state_clear(CInode::STATE_NEEDSRECOVER);
166 if (!in->state_test(CInode::STATE_RECOVERING)) {
167 in->state_set(CInode::STATE_RECOVERING);
168 in->auth_pin(this);
169 logger->inc(l_mdc_recovery_started);
170 }
b32b8144
FG
171
172 if (!_is_in_any_recover_queue(in)) {
173 file_recover_queue.push_back(&in->item_recover_queue);
174 file_recover_queue_size++;
175 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
176 }
7c673cae
FG
177}
178
179
180/**
181 * Call back on completion of Filer probe on an inode.
182 */
183void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
184{
185 dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
186 << " for " << *in << dendl;
187
188 if (r != 0) {
189 dout(0) << "recovery error! " << r << dendl;
190 if (r == -EBLACKLISTED) {
191 mds->respawn();
192 return;
193 } else {
194 // Something wrong on the OSD side trying to recover the size
195 // of this inode. In principle we could record this as a piece
196 // of per-inode damage, but it's actually more likely that
197 // this indicates something wrong with the MDS (like maybe
198 // it has the wrong auth caps?)
11fdf7f2
TL
199 mds->clog->error() << " OSD read error while recovering size"
200 " for inode " << in->ino();
7c673cae
FG
201 mds->damaged();
202 }
203 }
204
b32b8144 205 auto p = file_recovering.find(in);
11fdf7f2 206 ceph_assert(p != file_recovering.end());
b32b8144
FG
207 bool restart = p->second;
208 file_recovering.erase(p);
209
7c673cae
FG
210 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
211 logger->inc(l_mdc_recovery_completed);
212 in->state_clear(CInode::STATE_RECOVERING);
213
b32b8144
FG
214 if (restart) {
215 if (in->item_recover_queue.is_on_list()) {
216 in->item_recover_queue.remove_myself();
217 file_recover_queue_size--;
218 }
219 if (in->item_recover_queue_front.is_on_list()) {
220 in->item_recover_queue_front.remove_myself();
221 file_recover_queue_front_size--;
222 }
223 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
224 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
225 _start(in);
226 } else if (!_is_in_any_recover_queue(in)) {
7c673cae
FG
227 // journal
228 mds->locker->check_inode_max_size(in, true, 0, size, mtime);
229 mds->locker->eval(in, CEPH_LOCK_IFILE);
230 in->auth_unpin(this);
231 }
232
233 advance();
234}
235