]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "CInode.h" | |
16 | #include "MDCache.h" | |
17 | #include "MDSRank.h" | |
18 | #include "Locker.h" | |
19 | #include "osdc/Filer.h" | |
20 | ||
21 | #include "RecoveryQueue.h" | |
22 | ||
7c673cae FG |
23 | #define dout_context g_ceph_context |
24 | #define dout_subsys ceph_subsys_mds | |
25 | #undef dout_prefix | |
26 | #define dout_prefix *_dout << "mds." << mds->get_nodeid() << " RecoveryQueue::" << __func__ << " " | |
27 | ||
28 | class C_MDC_Recover : public MDSIOContextBase { | |
9f95a23c TL |
29 | public: |
30 | C_MDC_Recover(RecoveryQueue *rq_, CInode *i) : | |
31 | MDSIOContextBase(false), rq(rq_), in(i) { | |
32 | ceph_assert(rq != NULL); | |
33 | } | |
34 | void print(ostream& out) const override { | |
35 | out << "file_recover(" << in->ino() << ")"; | |
36 | } | |
37 | ||
38 | uint64_t size = 0; | |
39 | utime_t mtime; | |
7c673cae | 40 | protected: |
7c673cae FG |
41 | void finish(int r) override { |
42 | rq->_recovered(in, r, size, mtime); | |
43 | } | |
44 | ||
45 | MDSRank *get_mds() override { | |
46 | return rq->mds; | |
47 | } | |
48 | ||
9f95a23c TL |
49 | RecoveryQueue *rq; |
50 | CInode *in; | |
7c673cae FG |
51 | }; |
52 | ||
b32b8144 FG |
53 | RecoveryQueue::RecoveryQueue(MDSRank *mds_) : |
54 | file_recover_queue(member_offset(CInode, item_dirty_dirfrag_dir)), | |
55 | file_recover_queue_front(member_offset(CInode, item_dirty_dirfrag_nest)), | |
9f95a23c | 56 | mds(mds_), filer(mds_->objecter, mds_->finisher) |
b32b8144 | 57 | { } |
7c673cae | 58 | |
7c673cae FG |
59 | /** |
60 | * Progress the queue. Call this after enqueuing something or on | |
61 | * completion of something. | |
62 | */ | |
63 | void RecoveryQueue::advance() | |
64 | { | |
b32b8144 FG |
65 | dout(10) << file_recover_queue_size << " queued, " |
66 | << file_recover_queue_front_size << " prioritized, " | |
7c673cae FG |
67 | << file_recovering.size() << " recovering" << dendl; |
68 | ||
11fdf7f2 | 69 | while (file_recovering.size() < g_conf()->mds_max_file_recover) { |
7c673cae | 70 | if (!file_recover_queue_front.empty()) { |
b32b8144 FG |
71 | CInode *in = file_recover_queue_front.front(); |
72 | in->item_recover_queue_front.remove_myself(); | |
73 | file_recover_queue_front_size--; | |
7c673cae FG |
74 | _start(in); |
75 | } else if (!file_recover_queue.empty()) { | |
b32b8144 FG |
76 | CInode *in = file_recover_queue.front(); |
77 | in->item_recover_queue.remove_myself(); | |
78 | file_recover_queue_size--; | |
7c673cae FG |
79 | _start(in); |
80 | } else { | |
81 | break; | |
82 | } | |
83 | } | |
84 | ||
85 | logger->set(l_mdc_num_recovering_processing, file_recovering.size()); | |
b32b8144 FG |
86 | logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size); |
87 | logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size); | |
7c673cae FG |
88 | } |
89 | ||
90 | void RecoveryQueue::_start(CInode *in) | |
91 | { | |
94b18763 | 92 | auto pi = in->get_projected_inode(); |
7c673cae FG |
93 | |
94 | // blech | |
95 | if (pi->client_ranges.size() && !pi->get_max_size()) { | |
96 | mds->clog->warn() << "bad client_range " << pi->client_ranges | |
97 | << " on ino " << pi->ino; | |
98 | } | |
99 | ||
b32b8144 | 100 | auto p = file_recovering.find(in); |
7c673cae FG |
101 | if (pi->client_ranges.size() && pi->get_max_size()) { |
102 | dout(10) << "starting " << in->inode.size << " " << pi->client_ranges | |
103 | << " " << *in << dendl; | |
b32b8144 FG |
104 | if (p == file_recovering.end()) { |
105 | file_recovering.insert(make_pair(in, false)); | |
7c673cae | 106 | |
b32b8144 FG |
107 | C_MDC_Recover *fin = new C_MDC_Recover(this, in); |
108 | filer.probe(in->inode.ino, &in->inode.layout, in->last, | |
109 | pi->get_max_size(), &fin->size, &fin->mtime, false, | |
110 | 0, fin); | |
111 | } else { | |
112 | p->second = true; | |
113 | dout(10) << "already working on " << *in << ", set need_restart flag" << dendl; | |
114 | } | |
7c673cae FG |
115 | } else { |
116 | dout(10) << "skipping " << in->inode.size << " " << *in << dendl; | |
b32b8144 FG |
117 | if (p == file_recovering.end()) { |
118 | in->state_clear(CInode::STATE_RECOVERING); | |
119 | mds->locker->eval(in, CEPH_LOCK_IFILE); | |
120 | in->auth_unpin(this); | |
121 | } | |
7c673cae FG |
122 | } |
123 | } | |
124 | ||
125 | void RecoveryQueue::prioritize(CInode *in) | |
126 | { | |
127 | if (file_recovering.count(in)) { | |
128 | dout(10) << "already working on " << *in << dendl; | |
129 | return; | |
130 | } | |
131 | ||
b32b8144 | 132 | if (!in->item_recover_queue_front.is_on_list()) { |
7c673cae | 133 | dout(20) << *in << dendl; |
b32b8144 | 134 | |
11fdf7f2 | 135 | ceph_assert(in->item_recover_queue.is_on_list()); |
b32b8144 FG |
136 | in->item_recover_queue.remove_myself(); |
137 | file_recover_queue_size--; | |
138 | ||
139 | file_recover_queue_front.push_back(&in->item_recover_queue_front); | |
140 | ||
141 | file_recover_queue_front_size++; | |
142 | logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size); | |
7c673cae FG |
143 | return; |
144 | } | |
145 | ||
146 | dout(10) << "not queued " << *in << dendl; | |
147 | } | |
148 | ||
b32b8144 FG |
149 | static bool _is_in_any_recover_queue(CInode *in) |
150 | { | |
151 | return in->item_recover_queue.is_on_list() || | |
152 | in->item_recover_queue_front.is_on_list(); | |
153 | } | |
7c673cae FG |
154 | |
155 | /** | |
156 | * Given an authoritative inode which is in the cache, | |
157 | * enqueue it for recovery. | |
158 | */ | |
159 | void RecoveryQueue::enqueue(CInode *in) | |
160 | { | |
161 | dout(15) << "RecoveryQueue::enqueue " << *in << dendl; | |
11fdf7f2 TL |
162 | ceph_assert(logger); // Caller should have done set_logger before using me |
163 | ceph_assert(in->is_auth()); | |
7c673cae FG |
164 | |
165 | in->state_clear(CInode::STATE_NEEDSRECOVER); | |
166 | if (!in->state_test(CInode::STATE_RECOVERING)) { | |
167 | in->state_set(CInode::STATE_RECOVERING); | |
168 | in->auth_pin(this); | |
169 | logger->inc(l_mdc_recovery_started); | |
170 | } | |
b32b8144 FG |
171 | |
172 | if (!_is_in_any_recover_queue(in)) { | |
173 | file_recover_queue.push_back(&in->item_recover_queue); | |
174 | file_recover_queue_size++; | |
175 | logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size); | |
176 | } | |
7c673cae FG |
177 | } |
178 | ||
179 | ||
180 | /** | |
181 | * Call back on completion of Filer probe on an inode. | |
182 | */ | |
183 | void RecoveryQueue::_recovered(CInode *in, int r, uint64_t size, utime_t mtime) | |
184 | { | |
185 | dout(10) << "_recovered r=" << r << " size=" << size << " mtime=" << mtime | |
186 | << " for " << *in << dendl; | |
187 | ||
188 | if (r != 0) { | |
189 | dout(0) << "recovery error! " << r << dendl; | |
190 | if (r == -EBLACKLISTED) { | |
191 | mds->respawn(); | |
192 | return; | |
193 | } else { | |
194 | // Something wrong on the OSD side trying to recover the size | |
195 | // of this inode. In principle we could record this as a piece | |
196 | // of per-inode damage, but it's actually more likely that | |
197 | // this indicates something wrong with the MDS (like maybe | |
198 | // it has the wrong auth caps?) | |
11fdf7f2 TL |
199 | mds->clog->error() << " OSD read error while recovering size" |
200 | " for inode " << in->ino(); | |
7c673cae FG |
201 | mds->damaged(); |
202 | } | |
203 | } | |
204 | ||
b32b8144 | 205 | auto p = file_recovering.find(in); |
11fdf7f2 | 206 | ceph_assert(p != file_recovering.end()); |
b32b8144 FG |
207 | bool restart = p->second; |
208 | file_recovering.erase(p); | |
209 | ||
7c673cae FG |
210 | logger->set(l_mdc_num_recovering_processing, file_recovering.size()); |
211 | logger->inc(l_mdc_recovery_completed); | |
212 | in->state_clear(CInode::STATE_RECOVERING); | |
213 | ||
b32b8144 FG |
214 | if (restart) { |
215 | if (in->item_recover_queue.is_on_list()) { | |
216 | in->item_recover_queue.remove_myself(); | |
217 | file_recover_queue_size--; | |
218 | } | |
219 | if (in->item_recover_queue_front.is_on_list()) { | |
220 | in->item_recover_queue_front.remove_myself(); | |
221 | file_recover_queue_front_size--; | |
222 | } | |
223 | logger->set(l_mdc_num_recovering_enqueued, file_recover_queue_size + file_recover_queue_front_size); | |
224 | logger->set(l_mdc_num_recovering_prioritized, file_recover_queue_front_size); | |
225 | _start(in); | |
226 | } else if (!_is_in_any_recover_queue(in)) { | |
7c673cae FG |
227 | // journal |
228 | mds->locker->check_inode_max_size(in, true, 0, size, mtime); | |
229 | mds->locker->eval(in, CEPH_LOCK_IFILE); | |
230 | in->auth_unpin(this); | |
231 | } | |
232 | ||
233 | advance(); | |
234 | } | |
235 |