]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/RecoveryQueue.cc
update sources to v12.2.3
[ceph.git] / ceph / src / mds / RecoveryQueue.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "CInode.h"
16#include "MDCache.h"
17#include "MDSRank.h"
18#include "Locker.h"
19#include "osdc/Filer.h"
20
21#include "RecoveryQueue.h"
22
7c673cae
FG
// Debug-log configuration for this translation unit.
//
// The prefix expands to "mds.<nodeid> RecoveryQueue::<function> ", so it can
// only be used where a variable named `mds` is in scope.
#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix \
  *_dout << "mds." << mds->get_nodeid() \
         << " RecoveryQueue::" << __func__ << " "
27
28class C_MDC_Recover : public MDSIOContextBase {
29protected:
30 RecoveryQueue *rq;
31 CInode *in;
32 void finish(int r) override {
33 rq->_recovered(in, r, size, mtime);
34 }
35
36 MDSRank *get_mds() override {
37 return rq->mds;
38 }
39
40public:
41 uint64_t size;
42 utime_t mtime;
43
44 C_MDC_Recover(RecoveryQueue *rq_, CInode *i) : rq(rq_), in(i), size(0) {
45 assert(rq != NULL);
46 }
47};
48
49
b32b8144
FG
50RecoveryQueue::RecoveryQueue(MDSRank *mds_) :
51 file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)),
52 file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)),
53 mds(mds_), logger(NULL), filer(mds_->objecter, mds_->finisher)
54{ }
7c673cae
FG
55
56
57/**
58 * Progress the queue. Call this after enqueuing something or on
59 * completion of something.
60 */
61void RecoveryQueue::advance()
62{
b32b8144
FG
63 dout(10) << file_recover_queue_size << " queued, "
64 << file_recover_queue_front_size << " prioritized, "
7c673cae
FG
65 << file_recovering.size() << " recovering" << dendl;
66
67 while (file_recovering.size() < g_conf->mds_max_file_recover) {
68 if (!file_recover_queue_front.empty()) {
b32b8144
FG
69 CInode *in = file_recover_queue_front.front();
70 in->item_recover_queue_front.remove_myself();
71 file_recover_queue_front_size--;
7c673cae
FG
72 _start(in);
73 } else if (!file_recover_queue.empty()) {
b32b8144
FG
74 CInode *in = file_recover_queue.front();
75 in->item_recover_queue.remove_myself();
76 file_recover_queue_size--;
7c673cae
FG
77 _start(in);
78 } else {
79 break;
80 }
81 }
82
83 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
b32b8144
FG
84 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
85 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
86}
87
88void RecoveryQueue::_start(CInode *in)
89{
90 inode_t *pi = in->get_projected_inode();
91
92 // blech
93 if (pi->client_ranges.size() && !pi->get_max_size()) {
94 mds->clog->warn() << "bad client_range " << pi->client_ranges
95 << " on ino " << pi->ino;
96 }
97
b32b8144 98 auto p = file_recovering.find(in);
7c673cae
FG
99 if (pi->client_ranges.size() && pi->get_max_size()) {
100 dout(10) << "starting " << in->inode.size << " " << pi->client_ranges
101 << " " << *in << dendl;
b32b8144
FG
102 if (p == file_recovering.end()) {
103 file_recovering.insert(make_pair(in, false));
7c673cae 104
b32b8144
FG
105 C_MDC_Recover *fin = new C_MDC_Recover(this, in);
106 filer.probe(in->inode.ino, &in->inode.layout, in->last,
107 pi->get_max_size(), &fin->size, &fin->mtime, false,
108 0, fin);
109 } else {
110 p->second = true;
111 dout(10) << "already working on " << *in << ", set need_restart flag" << dendl;
112 }
7c673cae
FG
113 } else {
114 dout(10) << "skipping " << in->inode.size << " " << *in << dendl;
b32b8144
FG
115 if (p == file_recovering.end()) {
116 in->state_clear(CInode::STATE_RECOVERING);
117 mds->locker->eval(in, CEPH_LOCK_IFILE);
118 in->auth_unpin(this);
119 }
7c673cae
FG
120 }
121}
122
123void RecoveryQueue::prioritize(CInode *in)
124{
125 if (file_recovering.count(in)) {
126 dout(10) << "already working on " << *in << dendl;
127 return;
128 }
129
b32b8144 130 if (!in->item_recover_queue_front.is_on_list()) {
7c673cae 131 dout(20) << *in << dendl;
b32b8144
FG
132
133 assert(in->item_recover_queue.is_on_list());
134 in->item_recover_queue.remove_myself();
135 file_recover_queue_size--;
136
137 file_recover_queue_front.push_back(&in->item_recover_queue_front);
138
139 file_recover_queue_front_size++;
140 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
7c673cae
FG
141 return;
142 }
143
144 dout(10) << "not queued " << *in << dendl;
145}
146
b32b8144
FG
147static bool _is_in_any_recover_queue(CInode *in)
148{
149 return in->item_recover_queue.is_on_list() ||
150 in->item_recover_queue_front.is_on_list();
151}
7c673cae
FG
152
153/**
154 * Given an authoritative inode which is in the cache,
155 * enqueue it for recovery.
156 */
157void RecoveryQueue::enqueue(CInode *in)
158{
159 dout(15) << "RecoveryQueue::enqueue " << *in << dendl;
160 assert(logger); // Caller should have done set_logger before using me
161 assert(in->is_auth());
162
163 in->state_clear(CInode::STATE_NEEDSRECOVER);
164 if (!in->state_test(CInode::STATE_RECOVERING)) {
165 in->state_set(CInode::STATE_RECOVERING);
166 in->auth_pin(this);
167 logger->inc(l_mdc_recovery_started);
168 }
b32b8144
FG
169
170 if (!_is_in_any_recover_queue(in)) {
171 file_recover_queue.push_back(&in->item_recover_queue);
172 file_recover_queue_size++;
173 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
174 }
7c673cae
FG
175}
176
177
178/**
179 * Call back on completion of Filer probe on an inode.
180 */
181void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime)
182{
183 dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime
184 << " for " << *in << dendl;
185
186 if (r != 0) {
187 dout(0) << "recovery error! " << r << dendl;
188 if (r == -EBLACKLISTED) {
189 mds->respawn();
190 return;
191 } else {
192 // Something wrong on the OSD side trying to recover the size
193 // of this inode. In principle we could record this as a piece
194 // of per-inode damage, but it's actually more likely that
195 // this indicates something wrong with the MDS (like maybe
196 // it has the wrong auth caps?)
197 mds->clog->error() << " OSD read error while recovering size for inode 0x"
198 << std::hex << in->ino() << std::dec;
199 mds->damaged();
200 }
201 }
202
b32b8144
FG
203 auto p = file_recovering.find(in);
204 assert(p != file_recovering.end());
205 bool restart = p->second;
206 file_recovering.erase(p);
207
7c673cae
FG
208 logger->set(l_mdc_num_recovering_processing, file_recovering.size());
209 logger->inc(l_mdc_recovery_completed);
210 in->state_clear(CInode::STATE_RECOVERING);
211
b32b8144
FG
212 if (restart) {
213 if (in->item_recover_queue.is_on_list()) {
214 in->item_recover_queue.remove_myself();
215 file_recover_queue_size--;
216 }
217 if (in->item_recover_queue_front.is_on_list()) {
218 in->item_recover_queue_front.remove_myself();
219 file_recover_queue_front_size--;
220 }
221 logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size);
222 logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size);
223 _start(in);
224 } else if (!_is_in_any_recover_queue(in)) {
7c673cae
FG
225 // journal
226 mds->locker->check_inode_max_size(in, true, 0, size, mtime);
227 mds->locker->eval(in, CEPH_LOCK_IFILE);
228 in->auth_unpin(this);
229 }
230
231 advance();
232}
233