]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2014 Red Hat | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include <iostream> | |
16 | ||
17 | #include "ScrubStack.h" | |
18 | #include "common/Finisher.h" | |
19 | #include "mds/MDSRank.h" | |
20 | #include "mds/MDCache.h" | |
21 | #include "mds/MDSContinuation.h" | |
22 | ||
23 | #define dout_context g_ceph_context | |
24 | #define dout_subsys ceph_subsys_mds | |
25 | #undef dout_prefix | |
26 | #define dout_prefix _prefix(_dout, scrubstack->mdcache->mds) | |
27 | static ostream& _prefix(std::ostream *_dout, MDSRank *mds) { | |
28 | return *_dout << "mds." << mds->get_nodeid() << ".scrubstack "; | |
29 | } | |
30 | ||
31 | void ScrubStack::push_inode(CInode *in) | |
32 | { | |
33 | dout(20) << "pushing " << *in << " on top of ScrubStack" << dendl; | |
34 | if (!in->item_scrub.is_on_list()) { | |
35 | in->get(CInode::PIN_SCRUBQUEUE); | |
36 | stack_size++; | |
37 | } | |
38 | inode_stack.push_front(&in->item_scrub); | |
39 | } | |
40 | ||
41 | void ScrubStack::push_inode_bottom(CInode *in) | |
42 | { | |
43 | dout(20) << "pushing " << *in << " on bottom of ScrubStack" << dendl; | |
44 | if (!in->item_scrub.is_on_list()) { | |
45 | in->get(CInode::PIN_SCRUBQUEUE); | |
46 | stack_size++; | |
47 | } | |
48 | inode_stack.push_back(&in->item_scrub); | |
49 | } | |
50 | ||
51 | void ScrubStack::pop_inode(CInode *in) | |
52 | { | |
53 | dout(20) << "popping " << *in | |
54 | << " off of ScrubStack" << dendl; | |
55 | assert(in->item_scrub.is_on_list()); | |
56 | in->put(CInode::PIN_SCRUBQUEUE); | |
57 | in->item_scrub.remove_myself(); | |
58 | stack_size--; | |
59 | } | |
60 | ||
61 | void ScrubStack::_enqueue_inode(CInode *in, CDentry *parent, | |
b32b8144 | 62 | ScrubHeaderRef& header, |
7c673cae FG |
63 | MDSInternalContextBase *on_finish, bool top) |
64 | { | |
65 | dout(10) << __func__ << " with {" << *in << "}" | |
66 | << ", on_finish=" << on_finish << ", top=" << top << dendl; | |
67 | assert(mdcache->mds->mds_lock.is_locked_by_me()); | |
68 | in->scrub_initialize(parent, header, on_finish); | |
69 | if (top) | |
70 | push_inode(in); | |
71 | else | |
72 | push_inode_bottom(in); | |
73 | } | |
74 | ||
b32b8144 | 75 | void ScrubStack::enqueue_inode(CInode *in, ScrubHeaderRef& header, |
7c673cae FG |
76 | MDSInternalContextBase *on_finish, bool top) |
77 | { | |
78 | _enqueue_inode(in, NULL, header, on_finish, top); | |
79 | kick_off_scrubs(); | |
80 | } | |
81 | ||
82 | void ScrubStack::kick_off_scrubs() | |
83 | { | |
84 | dout(20) << __func__ << " entering with " << scrubs_in_progress << " in " | |
85 | "progress and " << stack_size << " in the stack" << dendl; | |
86 | bool can_continue = true; | |
87 | elist<CInode*>::iterator i = inode_stack.begin(); | |
88 | while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress && | |
89 | can_continue && !i.end()) { | |
90 | CInode *curi = *i; | |
91 | ++i; // we have our reference, push iterator forward | |
92 | ||
93 | dout(20) << __func__ << " examining " << *curi << dendl; | |
94 | ||
95 | if (!curi->is_dir()) { | |
96 | // it's a regular file, symlink, or hard link | |
97 | pop_inode(curi); // we only touch it this once, so remove from stack | |
98 | ||
99 | if (!curi->scrub_info()->on_finish) { | |
100 | scrubs_in_progress++; | |
101 | curi->scrub_set_finisher(&scrub_kick); | |
102 | } | |
103 | scrub_file_inode(curi); | |
104 | can_continue = true; | |
105 | } else { | |
106 | bool completed; // it's done, so pop it off the stack | |
107 | bool terminal; // not done, but we can start ops on other directories | |
108 | bool progress; // it added new dentries to the top of the stack | |
109 | scrub_dir_inode(curi, &progress, &terminal, &completed); | |
110 | if (completed) { | |
111 | dout(20) << __func__ << " dir completed" << dendl; | |
112 | pop_inode(curi); | |
113 | } else if (progress) { | |
114 | dout(20) << __func__ << " dir progressed" << dendl; | |
115 | // we added new stuff to top of stack, so reset ourselves there | |
116 | i = inode_stack.begin(); | |
117 | } else { | |
118 | dout(20) << __func__ << " dir no-op" << dendl; | |
119 | } | |
120 | ||
121 | can_continue = progress || terminal || completed; | |
122 | } | |
123 | } | |
124 | } | |
125 | ||
126 | void ScrubStack::scrub_dir_inode(CInode *in, | |
127 | bool *added_children, | |
128 | bool *terminal, | |
129 | bool *done) | |
130 | { | |
131 | dout(10) << __func__ << *in << dendl; | |
132 | ||
133 | *added_children = false; | |
134 | bool all_frags_terminal = true; | |
135 | bool all_frags_done = true; | |
136 | ||
b32b8144 FG |
137 | ScrubHeaderRef header = in->get_scrub_header(); |
138 | assert(header != nullptr); | |
7c673cae FG |
139 | |
140 | if (header->get_recursive()) { | |
141 | list<frag_t> scrubbing_frags; | |
142 | list<CDir*> scrubbing_cdirs; | |
143 | in->scrub_dirfrags_scrubbing(&scrubbing_frags); | |
144 | dout(20) << __func__ << " iterating over " << scrubbing_frags.size() | |
145 | << " scrubbing frags" << dendl; | |
146 | for (list<frag_t>::iterator i = scrubbing_frags.begin(); | |
147 | i != scrubbing_frags.end(); | |
148 | ++i) { | |
149 | // turn frags into CDir * | |
150 | CDir *dir = in->get_dirfrag(*i); | |
151 | if (dir) { | |
152 | scrubbing_cdirs.push_back(dir); | |
153 | dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl; | |
154 | } else { | |
155 | in->scrub_dirfrag_finished(*i); | |
156 | dout(25) << __func__ << " missing dirfrag " << *i << " skip scrubbing" << dendl; | |
157 | } | |
158 | } | |
159 | ||
160 | dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size() | |
161 | << " scrubbing cdirs" << dendl; | |
162 | ||
163 | list<CDir*>::iterator i = scrubbing_cdirs.begin(); | |
164 | while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress) { | |
165 | // select next CDir | |
166 | CDir *cur_dir = NULL; | |
167 | if (i != scrubbing_cdirs.end()) { | |
168 | cur_dir = *i; | |
169 | ++i; | |
170 | dout(20) << __func__ << " got cur_dir = " << *cur_dir << dendl; | |
171 | } else { | |
172 | bool ready = get_next_cdir(in, &cur_dir); | |
173 | dout(20) << __func__ << " get_next_cdir ready=" << ready << dendl; | |
174 | ||
175 | if (ready && cur_dir) { | |
176 | scrubbing_cdirs.push_back(cur_dir); | |
177 | } else if (!ready) { | |
178 | // We are waiting for load of a frag | |
179 | all_frags_done = false; | |
180 | all_frags_terminal = false; | |
181 | break; | |
182 | } else { | |
183 | // Finished with all frags | |
184 | break; | |
185 | } | |
186 | } | |
187 | // scrub that CDir | |
188 | bool frag_added_children = false; | |
189 | bool frag_terminal = true; | |
190 | bool frag_done = false; | |
191 | scrub_dirfrag(cur_dir, header, | |
192 | &frag_added_children, &frag_terminal, &frag_done); | |
193 | if (frag_done) { | |
194 | cur_dir->inode->scrub_dirfrag_finished(cur_dir->frag); | |
195 | } | |
196 | *added_children |= frag_added_children; | |
197 | all_frags_terminal = all_frags_terminal && frag_terminal; | |
198 | all_frags_done = all_frags_done && frag_done; | |
199 | } | |
200 | ||
201 | dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal | |
202 | << ", all_frags_done=" << all_frags_done << dendl; | |
203 | } else { | |
204 | dout(20) << "!scrub_recursive" << dendl; | |
205 | } | |
206 | ||
207 | if (all_frags_done) { | |
208 | assert (!*added_children); // can't do this if children are still pending | |
209 | ||
210 | // OK, so now I can... fire off a validate on the dir inode, and | |
211 | // when it completes, come through here again, noticing that we've | |
212 | // set a flag to indicate the validate happened, and | |
213 | scrub_dir_inode_final(in); | |
214 | } | |
215 | ||
216 | *terminal = all_frags_terminal; | |
217 | *done = all_frags_done; | |
218 | dout(10) << __func__ << " is exiting " << *terminal << " " << *done << dendl; | |
219 | return; | |
220 | } | |
221 | ||
222 | bool ScrubStack::get_next_cdir(CInode *in, CDir **new_dir) | |
223 | { | |
224 | dout(20) << __func__ << " on " << *in << dendl; | |
225 | frag_t next_frag; | |
226 | int r = in->scrub_dirfrag_next(&next_frag); | |
227 | assert (r >= 0); | |
228 | ||
229 | if (r == 0) { | |
230 | // we got a frag to scrub, otherwise it would be ENOENT | |
231 | dout(25) << "looking up new frag " << next_frag << dendl; | |
232 | CDir *next_dir = in->get_or_open_dirfrag(mdcache, next_frag); | |
233 | if (!next_dir->is_complete()) { | |
234 | scrubs_in_progress++; | |
235 | next_dir->fetch(&scrub_kick); | |
236 | dout(25) << "fetching frag from RADOS" << dendl; | |
237 | return false; | |
238 | } | |
239 | *new_dir = next_dir; | |
240 | dout(25) << "returning dir " << *new_dir << dendl; | |
241 | return true; | |
242 | } | |
243 | assert(r == ENOENT); | |
244 | // there are no dirfrags left | |
245 | *new_dir = NULL; | |
246 | return true; | |
247 | } | |
248 | ||
249 | class C_InodeValidated : public MDSInternalContext | |
250 | { | |
251 | public: | |
252 | ScrubStack *stack; | |
253 | CInode::validated_data result; | |
254 | CInode *target; | |
255 | ||
256 | C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_) | |
257 | : MDSInternalContext(mds), stack(stack_), target(target_) | |
258 | {} | |
259 | ||
260 | void finish(int r) override | |
261 | { | |
262 | stack->_validate_inode_done(target, r, result); | |
263 | } | |
264 | }; | |
265 | ||
266 | ||
267 | void ScrubStack::scrub_dir_inode_final(CInode *in) | |
268 | { | |
269 | dout(20) << __func__ << *in << dendl; | |
270 | ||
271 | // Two passes through this function. First one triggers inode validation, | |
272 | // second one sets finally_done | |
273 | // FIXME: kind of overloading scrub_in_progress here, using it while | |
274 | // dentry is still on stack to indicate that we have finished | |
275 | // doing our validate_disk_state on the inode | |
276 | // FIXME: the magic-constructing scrub_info() is going to leave | |
277 | // an unneeded scrub_infop lying around here | |
278 | if (!in->scrub_info()->children_scrubbed) { | |
279 | if (!in->scrub_info()->on_finish) { | |
280 | scrubs_in_progress++; | |
281 | in->scrub_set_finisher(&scrub_kick); | |
282 | } | |
283 | ||
284 | in->scrub_children_finished(); | |
285 | C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); | |
286 | in->validate_disk_state(&fin->result, fin); | |
287 | } | |
288 | ||
289 | return; | |
290 | } | |
291 | ||
292 | void ScrubStack::scrub_dirfrag(CDir *dir, | |
b32b8144 | 293 | ScrubHeaderRef& header, |
7c673cae FG |
294 | bool *added_children, bool *is_terminal, |
295 | bool *done) | |
296 | { | |
297 | assert(dir != NULL); | |
298 | ||
299 | dout(20) << __func__ << " on " << *dir << dendl; | |
300 | *added_children = false; | |
301 | *is_terminal = false; | |
302 | *done = false; | |
303 | ||
304 | ||
305 | if (!dir->scrub_info()->directory_scrubbing) { | |
306 | // Get the frag complete before calling | |
307 | // scrub initialize, so that it can populate its lists | |
308 | // of dentries. | |
309 | if (!dir->is_complete()) { | |
310 | scrubs_in_progress++; | |
311 | dir->fetch(&scrub_kick); | |
312 | return; | |
313 | } | |
314 | ||
315 | dir->scrub_initialize(header); | |
316 | } | |
317 | ||
318 | int r = 0; | |
319 | while(r == 0) { | |
320 | CDentry *dn = NULL; | |
321 | scrubs_in_progress++; | |
322 | r = dir->scrub_dentry_next(&scrub_kick, &dn); | |
323 | if (r != EAGAIN) { | |
324 | scrubs_in_progress--; | |
325 | } | |
326 | ||
327 | if (r == EAGAIN) { | |
328 | // Drop out, CDir fetcher will call back our kicker context | |
329 | dout(20) << __func__ << " waiting for fetch on " << *dir << dendl; | |
330 | return; | |
331 | } | |
332 | ||
333 | if (r == ENOENT) { | |
334 | // Nothing left to scrub, are we done? | |
335 | std::list<CDentry*> scrubbing; | |
336 | dir->scrub_dentries_scrubbing(&scrubbing); | |
337 | if (scrubbing.empty()) { | |
338 | dout(20) << __func__ << " dirfrag done: " << *dir << dendl; | |
339 | // FIXME: greg: What's the diff meant to be between done and terminal | |
340 | dir->scrub_finished(); | |
341 | *done = true; | |
342 | *is_terminal = true; | |
343 | } else { | |
344 | dout(20) << __func__ << " " << scrubbing.size() << " dentries still " | |
345 | "scrubbing in " << *dir << dendl; | |
346 | } | |
347 | return; | |
348 | } | |
349 | ||
350 | // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should | |
351 | // never get random IO errors here. | |
352 | assert(r == 0); | |
353 | ||
354 | _enqueue_inode(dn->get_projected_inode(), dn, header, NULL, true); | |
355 | ||
356 | *added_children = true; | |
357 | } | |
358 | } | |
359 | ||
360 | void ScrubStack::scrub_file_inode(CInode *in) | |
361 | { | |
362 | C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in); | |
363 | // At this stage the DN is already past scrub_initialize, so | |
364 | // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned | |
365 | in->validate_disk_state(&fin->result, fin); | |
366 | } | |
367 | ||
368 | void ScrubStack::_validate_inode_done(CInode *in, int r, | |
369 | const CInode::validated_data &result) | |
370 | { | |
371 | LogChannelRef clog = mdcache->mds->clog; | |
372 | const ScrubHeaderRefConst header = in->scrub_info()->header; | |
373 | ||
374 | std::string path; | |
375 | if (!result.passed_validation) { | |
376 | // Build path string for use in messages | |
377 | in->make_path_string(path, true); | |
378 | } | |
379 | ||
b32b8144 FG |
380 | if (result.backtrace.checked && !result.backtrace.passed |
381 | && !result.backtrace.repaired) | |
382 | { | |
7c673cae FG |
383 | // Record backtrace fails as remote linkage damage, as |
384 | // we may not be able to resolve hard links to this inode | |
385 | mdcache->mds->damage_table.notify_remote_damaged(in->inode.ino, path); | |
386 | } else if (result.inode.checked && !result.inode.passed) { | |
387 | // Record damaged inode structures as damaged dentries as | |
388 | // that is where they are stored | |
389 | auto parent = in->get_projected_parent_dn(); | |
390 | if (parent) { | |
391 | auto dir = parent->get_dir(); | |
392 | mdcache->mds->damage_table.notify_dentry( | |
94b18763 | 393 | dir->inode->ino(), dir->frag, parent->last, parent->get_name(), path); |
7c673cae FG |
394 | } |
395 | } | |
396 | ||
397 | // Inform the cluster log if we found an error | |
398 | if (!result.passed_validation) { | |
b32b8144 FG |
399 | if (result.all_damage_repaired()) { |
400 | clog->info() << "Scrub repaired inode " << in->ino() | |
401 | << " (" << path << ")"; | |
402 | } else { | |
403 | clog->warn() << "Scrub error on inode " << in->ino() | |
404 | << " (" << path << ") see " << g_conf->name | |
405 | << " log and `damage ls` output for details"; | |
406 | } | |
7c673cae FG |
407 | |
408 | // Put the verbose JSON output into the MDS log for later inspection | |
409 | JSONFormatter f; | |
410 | result.dump(&f); | |
411 | std::ostringstream out; | |
412 | f.flush(out); | |
413 | derr << __func__ << " scrub error on inode " << *in << ": " << out.str() | |
414 | << dendl; | |
415 | } else { | |
416 | dout(10) << __func__ << " scrub passed on inode " << *in << dendl; | |
417 | } | |
418 | ||
419 | MDSInternalContextBase *c = NULL; | |
420 | in->scrub_finished(&c); | |
421 | ||
422 | if (!header->get_recursive() && in == header->get_origin()) { | |
423 | if (r >= 0) { // we got into the scrubbing dump it | |
424 | result.dump(&(header->get_formatter())); | |
425 | } else { // we failed the lookup or something; dump ourselves | |
426 | header->get_formatter().open_object_section("results"); | |
427 | header->get_formatter().dump_int("return_code", r); | |
428 | header->get_formatter().close_section(); // results | |
429 | } | |
430 | } | |
431 | if (c) { | |
432 | finisher->queue(new MDSIOContextWrapper(mdcache->mds, c), 0); | |
433 | } | |
434 | } | |
435 | ||
436 | ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache *mdcache, ScrubStack *s) | |
437 | : MDSInternalContext(mdcache->mds), stack(s) { } |