]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef _BACKWARD_BACKWARD_WARNING_H | |
16 | #define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* | |
17 | #endif | |
18 | ||
19 | #include "include/compat.h" | |
20 | #include "include/fs_types.h" | |
21 | #include "common/entity_name.h" | |
22 | #include "common/errno.h" | |
23 | #include "common/safe_io.h" | |
24 | #include "mds/mdstypes.h" | |
25 | #include "mds/LogEvent.h" | |
26 | #include "mds/JournalPointer.h" | |
27 | #include "osdc/Journaler.h" | |
91327a77 | 28 | #include "mon/MonClient.h" |
7c673cae FG |
29 | |
30 | #include "Dumper.h" | |
31 | ||
32 | #define dout_context g_ceph_context | |
33 | #define dout_subsys ceph_subsys_mds | |
34 | ||
35 | #define HEADER_LEN 4096 | |
36 | ||
11fdf7f2 | 37 | int Dumper::init(mds_role_t role_, const std::string &type) |
7c673cae FG |
38 | { |
39 | role = role_; | |
40 | ||
41 | int r = MDSUtility::init(); | |
42 | if (r < 0) { | |
43 | return r; | |
44 | } | |
45 | ||
46 | auto fs = fsmap->get_filesystem(role.fscid); | |
11fdf7f2 TL |
47 | ceph_assert(fs != nullptr); |
48 | ||
49 | if (type == "mdlog") { | |
50 | JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); | |
51 | int jp_load_result = jp.load(objecter); | |
52 | if (jp_load_result != 0) { | |
53 | std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl; | |
54 | return jp_load_result; | |
55 | } else { | |
56 | ino = jp.front; | |
57 | } | |
58 | } else if (type == "purge_queue") { | |
59 | ino = MDS_INO_PURGE_QUEUE + role.rank; | |
7c673cae | 60 | } else { |
11fdf7f2 | 61 | ceph_abort(); // should not get here |
7c673cae | 62 | } |
11fdf7f2 | 63 | return 0; |
7c673cae FG |
64 | } |
65 | ||
66 | ||
67 | int Dumper::recover_journal(Journaler *journaler) | |
68 | { | |
69 | C_SaferCond cond; | |
9f95a23c | 70 | lock.lock(); |
7c673cae | 71 | journaler->recover(&cond); |
9f95a23c | 72 | lock.unlock(); |
d2e6a577 | 73 | const int r = cond.wait(); |
7c673cae FG |
74 | |
75 | if (r < 0) { // Error | |
76 | derr << "error on recovery: " << cpp_strerror(r) << dendl; | |
77 | return r; | |
78 | } else { | |
79 | dout(10) << "completed journal recovery" << dendl; | |
80 | return 0; | |
81 | } | |
82 | } | |
83 | ||
84 | ||
85 | int Dumper::dump(const char *dump_file) | |
86 | { | |
87 | int r = 0; | |
88 | ||
89 | auto fs = fsmap->get_filesystem(role.fscid); | |
11fdf7f2 | 90 | ceph_assert(fs != nullptr); |
7c673cae FG |
91 | |
92 | Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(), | |
93 | CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, | |
94 | &finisher); | |
95 | r = recover_journal(&journaler); | |
96 | if (r) { | |
97 | return r; | |
98 | } | |
99 | uint64_t start = journaler.get_read_pos(); | |
100 | uint64_t end = journaler.get_write_pos(); | |
101 | uint64_t len = end-start; | |
102 | ||
103 | Filer filer(objecter, &finisher); | |
104 | ||
105 | cout << "journal is " << start << "~" << len << std::endl; | |
106 | ||
107 | int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC, 0644); | |
108 | if (fd >= 0) { | |
109 | // include an informative header | |
91327a77 AA |
110 | uuid_d fsid = monc->get_fsid(); |
111 | char fsid_str[40]; | |
112 | fsid.print(fsid_str); | |
7c673cae FG |
113 | char buf[HEADER_LEN]; |
114 | memset(buf, 0, sizeof(buf)); | |
91327a77 AA |
115 | snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\ |
116 | length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\ | |
117 | trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\ | |
118 | object_size %lu (0x%lx)\n fsid %s\n%c", | |
7c673cae FG |
119 | role.rank, |
120 | (unsigned long long)start, (unsigned long long)start, | |
121 | (unsigned long long)len, (unsigned long long)len, | |
122 | (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos, | |
123 | (unsigned long long)journaler.last_committed.stream_format, | |
124 | (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos, | |
91327a77 AA |
125 | (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit, |
126 | (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count, | |
127 | (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size, | |
128 | fsid_str, | |
7c673cae FG |
129 | 4); |
130 | r = safe_write(fd, buf, sizeof(buf)); | |
131 | if (r) { | |
132 | derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl; | |
133 | ::close(fd); | |
134 | return r; | |
135 | } | |
136 | ||
137 | // write the data | |
138 | off64_t seeked = ::lseek64(fd, start, SEEK_SET); | |
139 | if (seeked == (off64_t)-1) { | |
140 | r = errno; | |
141 | derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl; | |
142 | ::close(fd); | |
143 | return r; | |
144 | } | |
145 | ||
146 | ||
147 | // Read and write 32MB chunks. Slower than it could be because we're not | |
148 | // streaming, but that's okay because this is just a debug/disaster tool. | |
149 | const uint32_t chunk_size = 32 * 1024 * 1024; | |
150 | ||
151 | for (uint64_t pos = start; pos < start + len; pos += chunk_size) { | |
152 | bufferlist bl; | |
153 | dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl; | |
154 | ||
11fdf7f2 | 155 | const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos); |
7c673cae FG |
156 | |
157 | C_SaferCond cond; | |
9f95a23c | 158 | lock.lock(); |
7c673cae FG |
159 | filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP, |
160 | pos, read_size, &bl, 0, &cond); | |
9f95a23c | 161 | lock.unlock(); |
7c673cae FG |
162 | r = cond.wait(); |
163 | if (r < 0) { | |
164 | derr << "Error " << r << " (" << cpp_strerror(r) << ") reading " | |
165 | "journal at offset 0x" << std::hex << pos << std::dec << dendl; | |
166 | ::close(fd); | |
167 | return r; | |
168 | } | |
169 | dout(10) << "Got 0x" << std::hex << bl.length() << std::dec | |
170 | << " bytes" << dendl; | |
171 | ||
172 | r = bl.write_fd(fd); | |
173 | if (r) { | |
174 | derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl; | |
175 | ::close(fd); | |
176 | return r; | |
177 | } | |
178 | } | |
179 | ||
180 | r = ::close(fd); | |
181 | if (r) { | |
182 | r = errno; | |
183 | derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl; | |
184 | return r; | |
185 | } | |
186 | ||
187 | cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n" | |
188 | << "NOTE: this is a _sparse_ file; you can\n" | |
189 | << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n" | |
190 | << " to efficiently compress it while preserving sparseness." << std::endl; | |
191 | return 0; | |
192 | } else { | |
193 | int err = errno; | |
194 | derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl; | |
195 | return err; | |
196 | } | |
197 | } | |
198 | ||
91327a77 | 199 | int Dumper::undump(const char *dump_file, bool force) |
7c673cae FG |
200 | { |
201 | cout << "undump " << dump_file << std::endl; | |
202 | ||
203 | auto fs = fsmap->get_filesystem(role.fscid); | |
11fdf7f2 | 204 | ceph_assert(fs != nullptr); |
7c673cae FG |
205 | |
206 | int r = 0; | |
91327a77 AA |
207 | // try get layout info from cluster |
208 | Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(), | |
209 | CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, | |
210 | &finisher); | |
211 | int recovered = recover_journal(&journaler); | |
212 | if (recovered != 0) { | |
213 | derr << "recover_journal failed, try to get header from dump file " << dendl; | |
214 | } | |
215 | ||
7c673cae FG |
216 | int fd = ::open(dump_file, O_RDONLY); |
217 | if (fd < 0) { | |
218 | r = errno; | |
219 | derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl; | |
220 | return r; | |
221 | } | |
222 | ||
223 | // Ceph mds0 journal dump | |
224 | // start offset 232401996 (0xdda2c4c) | |
225 | // length 1097504 (0x10bf20) | |
226 | ||
227 | char buf[HEADER_LEN]; | |
228 | r = safe_read(fd, buf, sizeof(buf)); | |
229 | if (r < 0) { | |
230 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
231 | return r; | |
232 | } | |
233 | ||
234 | long long unsigned start, len, write_pos, format, trimmed_pos; | |
91327a77 | 235 | long unsigned stripe_unit, stripe_count, object_size; |
7c673cae FG |
236 | sscanf(strstr(buf, "start offset"), "start offset %llu", &start); |
237 | sscanf(strstr(buf, "length"), "length %llu", &len); | |
238 | sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos); | |
239 | sscanf(strstr(buf, "format"), "format %llu", &format); | |
91327a77 AA |
240 | |
241 | if (!force) { | |
242 | // need to check if fsid match onlien cluster fsid | |
243 | if (strstr(buf, "fsid")) { | |
244 | uuid_d fsid; | |
245 | char fsid_str[40]; | |
92f5a8d4 | 246 | sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str); |
91327a77 AA |
247 | r = fsid.parse(fsid_str); |
248 | if (!r) { | |
249 | derr << "Invalid fsid" << dendl; | |
250 | ::close(fd); | |
251 | return -EINVAL; | |
252 | } | |
253 | ||
254 | if (fsid != monc->get_fsid()) { | |
255 | derr << "Imported journal fsid does not match online cluster fsid" << dendl; | |
256 | derr << "Use --force to skip fsid check" << dendl; | |
257 | ::close(fd); | |
258 | return -EINVAL; | |
259 | } | |
260 | } else { | |
261 | derr << "Invalid header, no fsid embeded" << dendl; | |
262 | ::close(fd); | |
263 | return -EINVAL; | |
264 | } | |
265 | } | |
266 | ||
267 | if (recovered == 0) { | |
268 | stripe_unit = journaler.last_committed.layout.stripe_unit; | |
269 | stripe_count = journaler.last_committed.layout.stripe_count; | |
270 | object_size = journaler.last_committed.layout.object_size; | |
271 | } else { | |
272 | // try to get layout from dump file header, if failed set layout to default | |
273 | if (strstr(buf, "stripe_unit")) { | |
274 | sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit); | |
275 | } else { | |
276 | stripe_unit = file_layout_t::get_default().stripe_unit; | |
277 | } | |
278 | if (strstr(buf, "stripe_count")) { | |
279 | sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count); | |
280 | } else { | |
281 | stripe_count = file_layout_t::get_default().stripe_count; | |
282 | } | |
283 | if (strstr(buf, "object_size")) { | |
284 | sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size); | |
285 | } else { | |
286 | object_size = file_layout_t::get_default().object_size; | |
287 | } | |
288 | } | |
289 | ||
7c673cae FG |
290 | if (strstr(buf, "trimmed_pos")) { |
291 | sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos); | |
292 | } else { | |
293 | // Old format dump, any untrimmed objects before expire_pos will | |
294 | // be discarded as trash. | |
91327a77 | 295 | trimmed_pos = start - (start % object_size); |
7c673cae FG |
296 | } |
297 | ||
298 | if (trimmed_pos > start) { | |
299 | derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos | |
300 | << " > expire 0x" << start << std::dec << dendl; | |
301 | ::close(fd); | |
302 | return -EINVAL; | |
303 | } | |
304 | ||
305 | if (start > write_pos) { | |
306 | derr << std::hex << "Invalid header (expire 0x" << start | |
307 | << " > write 0x" << write_pos << std::dec << dendl; | |
308 | ::close(fd); | |
309 | return -EINVAL; | |
310 | } | |
311 | ||
312 | cout << "start " << start << | |
313 | " len " << len << | |
314 | " write_pos " << write_pos << | |
315 | " format " << format << | |
91327a77 AA |
316 | " trimmed_pos " << trimmed_pos << |
317 | " stripe_unit " << stripe_unit << | |
318 | " stripe_count " << stripe_count << | |
319 | " object_size " << object_size << std::endl; | |
7c673cae FG |
320 | |
321 | Journaler::Header h; | |
322 | h.trimmed_pos = trimmed_pos; | |
323 | h.expire_pos = start; | |
324 | h.write_pos = write_pos; | |
325 | h.stream_format = format; | |
326 | h.magic = CEPH_FS_ONDISK_MAGIC; | |
327 | ||
91327a77 AA |
328 | h.layout.stripe_unit = stripe_unit; |
329 | h.layout.stripe_count = stripe_count; | |
330 | h.layout.object_size = object_size; | |
7c673cae FG |
331 | h.layout.pool_id = fs->mds_map.get_metadata_pool(); |
332 | ||
333 | bufferlist hbl; | |
11fdf7f2 | 334 | encode(h, hbl); |
7c673cae FG |
335 | |
336 | object_t oid = file_object_t(ino, 0); | |
337 | object_locator_t oloc(fs->mds_map.get_metadata_pool()); | |
338 | SnapContext snapc; | |
339 | ||
340 | cout << "writing header " << oid << std::endl; | |
341 | C_SaferCond header_cond; | |
9f95a23c | 342 | lock.lock(); |
7c673cae FG |
343 | objecter->write_full(oid, oloc, snapc, hbl, |
344 | ceph::real_clock::now(), 0, | |
345 | &header_cond); | |
9f95a23c | 346 | lock.unlock(); |
7c673cae FG |
347 | |
348 | r = header_cond.wait(); | |
349 | if (r != 0) { | |
350 | derr << "Failed to write header: " << cpp_strerror(r) << dendl; | |
351 | ::close(fd); | |
352 | return r; | |
353 | } | |
354 | ||
355 | Filer filer(objecter, &finisher); | |
356 | ||
357 | /* Erase any objects at the end of the region to which we shall write | |
358 | * the new log data. This is to avoid leaving trailing junk after | |
359 | * the newly written data. Any junk more than one object ahead | |
360 | * will be taken care of during normal operation by Journaler's | |
361 | * prezeroing behaviour */ | |
362 | { | |
363 | uint32_t const object_size = h.layout.object_size; | |
11fdf7f2 | 364 | ceph_assert(object_size > 0); |
1adf2230 AA |
365 | uint64_t last_obj = h.write_pos / object_size; |
366 | uint64_t purge_count = 2; | |
367 | /* When the length is zero, the last_obj should be zeroed | |
368 | * from the offset determined by the new write_pos instead of being purged. | |
369 | */ | |
370 | if (!len) { | |
371 | purge_count = 1; | |
372 | ++last_obj; | |
373 | } | |
7c673cae FG |
374 | C_SaferCond purge_cond; |
375 | cout << "Purging " << purge_count << " objects from " << last_obj << std::endl; | |
9f95a23c | 376 | lock.lock(); |
7c673cae FG |
377 | filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count, |
378 | ceph::real_clock::now(), 0, &purge_cond); | |
9f95a23c | 379 | lock.unlock(); |
7c673cae FG |
380 | purge_cond.wait(); |
381 | } | |
1adf2230 AA |
382 | /* When the length is zero, zero the last object |
383 | * from the offset determined by the new write_pos. | |
384 | */ | |
385 | if (!len) { | |
386 | uint64_t offset_in_obj = h.write_pos % h.layout.object_size; | |
387 | uint64_t len = h.layout.object_size - offset_in_obj; | |
388 | C_SaferCond zero_cond; | |
389 | cout << "Zeroing " << len << " bytes in the last object." << std::endl; | |
390 | ||
9f95a23c | 391 | lock.lock(); |
1adf2230 | 392 | filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond); |
9f95a23c | 393 | lock.unlock(); |
1adf2230 AA |
394 | zero_cond.wait(); |
395 | } | |
7c673cae FG |
396 | |
397 | // Stream from `fd` to `filer` | |
398 | uint64_t pos = start; | |
399 | uint64_t left = len; | |
400 | while (left > 0) { | |
401 | // Read | |
402 | bufferlist j; | |
403 | lseek64(fd, pos, SEEK_SET); | |
11fdf7f2 | 404 | uint64_t l = std::min<uint64_t>(left, 1024*1024); |
7c673cae FG |
405 | j.read_fd(fd, l); |
406 | ||
407 | // Write | |
408 | cout << " writing " << pos << "~" << l << std::endl; | |
409 | C_SaferCond write_cond; | |
9f95a23c | 410 | lock.lock(); |
7c673cae FG |
411 | filer.write(ino, &h.layout, snapc, pos, l, j, |
412 | ceph::real_clock::now(), 0, &write_cond); | |
9f95a23c | 413 | lock.unlock(); |
7c673cae FG |
414 | |
415 | r = write_cond.wait(); | |
416 | if (r != 0) { | |
417 | derr << "Failed to write header: " << cpp_strerror(r) << dendl; | |
418 | ::close(fd); | |
419 | return r; | |
420 | } | |
421 | ||
422 | // Advance | |
423 | pos += l; | |
424 | left -= l; | |
425 | } | |
426 | ||
427 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
428 | cout << "done." << std::endl; | |
429 | return 0; | |
430 | } | |
431 |