]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:c++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * ceph - scalable distributed file system | |
5 | * | |
6 | * copyright (c) 2014 john spray <john.spray@inktank.com> | |
7 | * | |
8 | * this is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the gnu lesser general public | |
10 | * license version 2.1, as published by the free software | |
11 | * foundation. see file copying. | |
12 | */ | |
13 | ||
14 | ||
15 | #include "include/rados/librados.hpp" | |
16 | #include "mds/JournalPointer.h" | |
17 | ||
18 | #include "mds/events/ESubtreeMap.h" | |
11fdf7f2 | 19 | #include "mds/PurgeQueue.h" |
7c673cae FG |
20 | |
21 | #include "JournalScanner.h" | |
22 | ||
23 | #define dout_context g_ceph_context | |
24 | #define dout_subsys ceph_subsys_mds | |
25 | ||
26 | /** | |
27 | * Read journal header, followed by sequential scan through journal space. | |
28 | * | |
29 | * Return 0 on success, else error code. Note that success has the special meaning | |
30 | * that we were able to apply our checks, it does *not* mean that the journal is | |
31 | * healthy. | |
32 | */ | |
33 | int JournalScanner::scan(bool const full) | |
34 | { | |
35 | int r = 0; | |
36 | ||
11fdf7f2 | 37 | r = set_journal_ino(); |
7c673cae FG |
38 | if (r < 0) { |
39 | return r; | |
40 | } | |
41 | ||
11fdf7f2 | 42 | if (!is_mdlog || pointer_present) { |
7c673cae FG |
43 | r = scan_header(); |
44 | if (r < 0) { | |
45 | return r; | |
46 | } | |
47 | } | |
48 | ||
49 | if (full && header_present) { | |
50 | r = scan_events(); | |
51 | if (r < 0) { | |
52 | return r; | |
53 | } | |
54 | } | |
55 | ||
56 | return 0; | |
57 | } | |
58 | ||
59 | ||
11fdf7f2 TL |
60 | int JournalScanner::set_journal_ino() |
61 | { | |
62 | int r = 0; | |
63 | if (type == "purge_queue") { | |
64 | ino = MDS_INO_PURGE_QUEUE + rank; | |
65 | } | |
66 | else if (type == "mdlog"){ | |
67 | r = scan_pointer(); | |
68 | is_mdlog = true; | |
69 | } | |
70 | else { | |
71 | ceph_abort(); // should not get here | |
72 | } | |
73 | return r; | |
74 | } | |
75 | ||
7c673cae FG |
76 | int JournalScanner::scan_pointer() |
77 | { | |
78 | // Issue read | |
79 | std::string const pointer_oid = obj_name(MDS_INO_LOG_POINTER_OFFSET + rank, 0); | |
80 | bufferlist pointer_bl; | |
81 | int r = io.read(pointer_oid, pointer_bl, INT_MAX, 0); | |
82 | if (r == -ENOENT) { | |
83 | // 'Successfully' discovered the pointer is missing. | |
84 | derr << "Pointer " << pointer_oid << " is absent" << dendl; | |
85 | return 0; | |
86 | } else if (r < 0) { | |
87 | // Error preventing us interrogating pointer | |
88 | derr << "Pointer " << pointer_oid << " is unreadable" << dendl; | |
89 | return r; | |
90 | } else { | |
91 | dout(4) << "Pointer " << pointer_oid << " is readable" << dendl; | |
92 | pointer_present = true; | |
93 | ||
94 | JournalPointer jp; | |
95 | try { | |
11fdf7f2 | 96 | auto q = pointer_bl.cbegin(); |
7c673cae FG |
97 | jp.decode(q); |
98 | } catch(buffer::error &e) { | |
99 | derr << "Pointer " << pointer_oid << " is corrupt: " << e.what() << dendl; | |
100 | return 0; | |
101 | } | |
102 | ||
103 | pointer_valid = true; | |
104 | ino = jp.front; | |
105 | return 0; | |
106 | } | |
107 | } | |
108 | ||
109 | ||
110 | int JournalScanner::scan_header() | |
111 | { | |
112 | int r; | |
113 | ||
114 | bufferlist header_bl; | |
115 | std::string header_name = obj_name(0); | |
116 | dout(4) << "JournalScanner::scan: reading header object '" << header_name << "'" << dendl; | |
117 | r = io.read(header_name, header_bl, INT_MAX, 0); | |
118 | if (r < 0) { | |
119 | derr << "Header " << header_name << " is unreadable" << dendl; | |
120 | return 0; // "Successfully" found an error | |
121 | } else { | |
122 | header_present = true; | |
123 | } | |
124 | ||
11fdf7f2 | 125 | auto header_bl_i = header_bl.cbegin(); |
7c673cae FG |
126 | header = new Journaler::Header(); |
127 | try | |
128 | { | |
129 | header->decode(header_bl_i); | |
130 | } | |
131 | catch (buffer::error &e) | |
132 | { | |
133 | derr << "Header is corrupt (" << e.what() << ")" << dendl; | |
134 | delete header; | |
135 | header = NULL; | |
136 | return 0; // "Successfully" found an error | |
137 | } | |
138 | ||
139 | if (header->magic != std::string(CEPH_FS_ONDISK_MAGIC)) { | |
140 | derr << "Header is corrupt (bad magic)" << dendl; | |
141 | return 0; // "Successfully" found an error | |
142 | } | |
143 | if (!((header->trimmed_pos <= header->expire_pos) && (header->expire_pos <= header->write_pos))) { | |
144 | derr << "Header is invalid (inconsistent offsets)" << dendl; | |
145 | return 0; // "Successfully" found an error | |
146 | } | |
147 | header_valid = true; | |
148 | ||
149 | return 0; | |
150 | } | |
151 | ||
152 | ||
153 | int JournalScanner::scan_events() | |
154 | { | |
11fdf7f2 | 155 | uint64_t object_size = g_conf()->mds_log_segment_size; |
7c673cae FG |
156 | if (object_size == 0) { |
157 | // Default layout object size | |
158 | object_size = file_layout_t::get_default().object_size; | |
159 | } | |
160 | ||
161 | uint64_t read_offset = header->expire_pos; | |
162 | dout(10) << std::hex << "Header 0x" | |
163 | << header->trimmed_pos << " 0x" | |
164 | << header->expire_pos << " 0x" | |
165 | << header->write_pos << std::dec << dendl; | |
166 | dout(10) << "Starting journal scan from offset 0x" << std::hex << read_offset << std::dec << dendl; | |
167 | ||
168 | // TODO also check for extraneous objects before the trimmed pos or after the write pos, | |
169 | // which would indicate a bogus header. | |
170 | ||
171 | bufferlist read_buf; | |
172 | bool gap = false; | |
173 | uint64_t gap_start = -1; | |
174 | for (uint64_t obj_offset = (read_offset / object_size); ; obj_offset++) { | |
175 | uint64_t offset_in_obj = 0; | |
176 | if (obj_offset * object_size < header->expire_pos) { | |
177 | // Skip up to expire_pos from start of the object | |
178 | // (happens for the first object we read) | |
179 | offset_in_obj = header->expire_pos - obj_offset * object_size; | |
180 | } | |
181 | ||
182 | // Read this journal segment | |
183 | bufferlist this_object; | |
184 | std::string const oid = obj_name(obj_offset); | |
185 | int r = io.read(oid, this_object, INT_MAX, offset_in_obj); | |
186 | ||
187 | // Handle absent journal segments | |
188 | if (r < 0) { | |
189 | if (obj_offset > (header->write_pos / object_size)) { | |
190 | dout(4) << "Reached end of journal objects" << dendl; | |
191 | break; | |
192 | } else { | |
193 | derr << "Missing object " << oid << dendl; | |
194 | } | |
195 | ||
196 | objects_missing.push_back(obj_offset); | |
f91f0fd5 TL |
197 | if (!gap) { |
198 | gap_start = read_offset; | |
199 | gap = true; | |
200 | } | |
201 | if (read_buf.length() > 0) { | |
202 | read_offset += read_buf.length(); | |
203 | read_buf.clear(); | |
204 | } | |
205 | read_offset += object_size - offset_in_obj; | |
7c673cae FG |
206 | continue; |
207 | } else { | |
208 | dout(4) << "Read 0x" << std::hex << this_object.length() << std::dec | |
209 | << " bytes from " << oid << " gap=" << gap << dendl; | |
210 | objects_valid.push_back(oid); | |
9f95a23c | 211 | this_object.begin().copy(this_object.length(), read_buf); |
7c673cae FG |
212 | } |
213 | ||
214 | if (gap) { | |
215 | // No valid data at the current read offset, scan forward until we find something valid looking | |
216 | // or have to drop out to load another object. | |
217 | dout(4) << "Searching for sentinel from 0x" << std::hex << read_offset | |
218 | << ", 0x" << read_buf.length() << std::dec << " bytes available" << dendl; | |
219 | ||
220 | do { | |
11fdf7f2 | 221 | auto p = read_buf.cbegin(); |
7c673cae | 222 | uint64_t candidate_sentinel; |
11fdf7f2 | 223 | decode(candidate_sentinel, p); |
7c673cae FG |
224 | |
225 | dout(4) << "Data at 0x" << std::hex << read_offset << " = 0x" << candidate_sentinel << std::dec << dendl; | |
226 | ||
227 | if (candidate_sentinel == JournalStream::sentinel) { | |
228 | dout(4) << "Found sentinel at 0x" << std::hex << read_offset << std::dec << dendl; | |
229 | ranges_invalid.push_back(Range(gap_start, read_offset)); | |
230 | gap = false; | |
231 | break; | |
232 | } else { | |
233 | // No sentinel, discard this byte | |
234 | read_buf.splice(0, 1); | |
235 | read_offset += 1; | |
236 | } | |
237 | } while (read_buf.length() >= sizeof(JournalStream::sentinel)); | |
238 | dout(4) << "read_buf size is " << read_buf.length() << dendl; | |
f91f0fd5 TL |
239 | } |
240 | { | |
7c673cae FG |
241 | dout(10) << "Parsing data, 0x" << std::hex << read_buf.length() << std::dec << " bytes available" << dendl; |
242 | while(true) { | |
243 | // TODO: detect and handle legacy format journals: can do many things | |
244 | // on them but on read errors have to give up instead of searching | |
245 | // for sentinels. | |
246 | JournalStream journal_stream(JOURNAL_FORMAT_RESILIENT); | |
247 | bool readable = false; | |
248 | try { | |
249 | uint64_t need; | |
250 | readable = journal_stream.readable(read_buf, &need); | |
251 | } catch (buffer::error &e) { | |
252 | readable = false; | |
253 | dout(4) << "Invalid container encoding at 0x" << std::hex << read_offset << std::dec << dendl; | |
254 | gap = true; | |
255 | gap_start = read_offset; | |
256 | read_buf.splice(0, 1); | |
257 | read_offset += 1; | |
258 | break; | |
259 | } | |
260 | ||
261 | if (!readable) { | |
262 | // Out of data, continue to read next object | |
263 | break; | |
264 | } | |
265 | ||
266 | bufferlist le_bl; //< Serialized LogEvent blob | |
267 | dout(10) << "Attempting decode at 0x" << std::hex << read_offset << std::dec << dendl; | |
268 | // This cannot fail to decode because we pre-checked that a serialized entry | |
269 | // blob would be readable. | |
270 | uint64_t start_ptr = 0; | |
271 | uint64_t consumed = journal_stream.read(read_buf, &le_bl, &start_ptr); | |
272 | dout(10) << "Consumed 0x" << std::hex << consumed << std::dec << " bytes" << dendl; | |
273 | if (start_ptr != read_offset) { | |
274 | derr << "Bad entry start ptr (0x" << std::hex << start_ptr << ") at 0x" | |
275 | << read_offset << std::dec << dendl; | |
276 | gap = true; | |
277 | gap_start = read_offset; | |
278 | // FIXME: given that entry was invalid, should we be skipping over it? | |
279 | // maybe push bytes back onto start of read_buf and just advance one byte | |
280 | // to start scanning instead. e.g. if a bogus size value is found it can | |
281 | // cause us to consume and thus skip a bunch of following valid events. | |
282 | read_offset += consumed; | |
283 | break; | |
284 | } | |
11fdf7f2 TL |
285 | bool valid_entry = true; |
286 | if (is_mdlog) { | |
287 | auto le = LogEvent::decode_event(le_bl.cbegin()); | |
288 | ||
289 | if (le) { | |
290 | dout(10) << "Valid entry at 0x" << std::hex << read_offset << std::dec << dendl; | |
291 | ||
292 | if (le->get_type() == EVENT_SUBTREEMAP | |
293 | || le->get_type() == EVENT_SUBTREEMAP_TEST) { | |
294 | auto&& sle = dynamic_cast<ESubtreeMap&>(*le); | |
295 | if (sle.expire_pos > read_offset) { | |
296 | errors.insert(std::make_pair( | |
297 | read_offset, EventError( | |
298 | -ERANGE, | |
299 | "ESubtreeMap has expire_pos ahead of its own position"))); | |
300 | } | |
7c673cae | 301 | } |
7c673cae | 302 | |
11fdf7f2 TL |
303 | if (filter.apply(read_offset, *le)) { |
304 | events.insert_or_assign(read_offset, EventRecord(std::move(le), consumed)); | |
305 | } | |
7c673cae | 306 | } else { |
11fdf7f2 | 307 | valid_entry = false; |
7c673cae | 308 | } |
11fdf7f2 TL |
309 | } else if (type == "purge_queue"){ |
310 | auto pi = std::make_unique<PurgeItem>(); | |
311 | try { | |
312 | auto q = le_bl.cbegin(); | |
313 | pi->decode(q); | |
314 | if (filter.apply(read_offset, *pi)) { | |
315 | events.insert_or_assign(read_offset, EventRecord(std::move(pi), consumed)); | |
316 | } | |
317 | } catch (const buffer::error &err) { | |
318 | valid_entry = false; | |
319 | } | |
7c673cae | 320 | } else { |
11fdf7f2 TL |
321 | ceph_abort(); // should not get here |
322 | } | |
323 | if (!valid_entry) { | |
7c673cae FG |
324 | dout(10) << "Invalid entry at 0x" << std::hex << read_offset << std::dec << dendl; |
325 | gap = true; | |
326 | gap_start = read_offset; | |
327 | read_offset += consumed; | |
b32b8144 | 328 | break; |
11fdf7f2 TL |
329 | } else { |
330 | events_valid.push_back(read_offset); | |
331 | read_offset += consumed; | |
7c673cae FG |
332 | } |
333 | } | |
334 | } | |
335 | } | |
336 | ||
337 | if (gap) { | |
338 | // Ended on a gap, assume it ran to end | |
339 | ranges_invalid.push_back(Range(gap_start, -1)); | |
340 | } | |
341 | ||
342 | dout(4) << "Scanned objects, " << objects_missing.size() << " missing, " << objects_valid.size() << " valid" << dendl; | |
343 | dout(4) << "Events scanned, " << ranges_invalid.size() << " gaps" << dendl; | |
344 | dout(4) << "Found " << events_valid.size() << " valid events" << dendl; | |
345 | dout(4) << "Selected " << events.size() << " events events for processing" << dendl; | |
346 | ||
347 | return 0; | |
348 | } | |
349 | ||
350 | ||
351 | JournalScanner::~JournalScanner() | |
352 | { | |
353 | if (header) { | |
354 | delete header; | |
355 | header = NULL; | |
356 | } | |
357 | dout(4) << events.size() << " events" << dendl; | |
7c673cae FG |
358 | events.clear(); |
359 | } | |
360 | ||
361 | ||
362 | /** | |
363 | * Whether the journal data looks valid and replayable | |
364 | */ | |
365 | bool JournalScanner::is_healthy() const | |
366 | { | |
11fdf7f2 | 367 | return ((!is_mdlog || (pointer_present && pointer_valid)) |
7c673cae FG |
368 | && header_present && header_valid |
369 | && ranges_invalid.empty() | |
370 | && objects_missing.empty()); | |
371 | } | |
372 | ||
373 | ||
374 | /** | |
375 | * Whether the journal data can be read from RADOS | |
376 | */ | |
377 | bool JournalScanner::is_readable() const | |
378 | { | |
379 | return (header_present && header_valid && objects_missing.empty()); | |
380 | } | |
381 | ||
382 | ||
383 | /** | |
384 | * Calculate the object name for a given offset | |
385 | */ | |
386 | std::string JournalScanner::obj_name(inodeno_t ino, uint64_t offset) const | |
387 | { | |
388 | char name[60]; | |
389 | snprintf(name, sizeof(name), "%llx.%08llx", | |
390 | (unsigned long long)(ino), | |
391 | (unsigned long long)offset); | |
392 | return std::string(name); | |
393 | } | |
394 | ||
395 | ||
396 | std::string JournalScanner::obj_name(uint64_t offset) const | |
397 | { | |
398 | return obj_name(ino, offset); | |
399 | } | |
400 | ||
401 | ||
402 | /* | |
403 | * Write a human readable summary of the journal health | |
404 | */ | |
405 | void JournalScanner::report(std::ostream &out) const | |
406 | { | |
407 | out << "Overall journal integrity: " << (is_healthy() ? "OK" : "DAMAGED") << std::endl; | |
408 | ||
11fdf7f2 TL |
409 | if (is_mdlog) { |
410 | if (!pointer_present) { | |
411 | out << "Pointer not found" << std::endl; | |
412 | } else if (!pointer_valid) { | |
413 | out << "Pointer could not be decoded" << std::endl; | |
414 | } | |
7c673cae | 415 | } |
7c673cae FG |
416 | if (!header_present) { |
417 | out << "Header not found" << std::endl; | |
418 | } else if (!header_valid) { | |
419 | out << "Header could not be decoded" << std::endl; | |
420 | } | |
421 | ||
422 | if (objects_missing.size()) { | |
423 | out << "Objects missing:" << std::endl; | |
424 | for (std::vector<uint64_t>::const_iterator om = objects_missing.begin(); | |
425 | om != objects_missing.end(); ++om) { | |
426 | out << " 0x" << std::hex << *om << std::dec << std::endl; | |
427 | } | |
428 | } | |
429 | ||
430 | if (ranges_invalid.size()) { | |
431 | out << "Corrupt regions:" << std::endl; | |
432 | for (std::vector<Range>::const_iterator r = ranges_invalid.begin(); | |
433 | r != ranges_invalid.end(); ++r) { | |
434 | out << " 0x" << std::hex << r->first << "-" << r->second << std::dec << std::endl; | |
435 | } | |
436 | } | |
437 | } | |
438 |