]>
Commit | Line | Data |
---|---|---|
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- | |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef _BACKWARD_BACKWARD_WARNING_H | |
16 | #define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_* | |
17 | #endif | |
18 | ||
19 | #include "include/compat.h" | |
20 | #include "include/fs_types.h" | |
21 | #include "common/entity_name.h" | |
22 | #include "common/errno.h" | |
23 | #include "common/safe_io.h" | |
24 | #include "mds/mdstypes.h" | |
25 | #include "mds/LogEvent.h" | |
26 | #include "mds/JournalPointer.h" | |
27 | #include "osdc/Journaler.h" | |
28 | #include "mon/MonClient.h" | |
29 | ||
30 | #include "Dumper.h" | |
31 | ||
32 | #define dout_context g_ceph_context | |
33 | #define dout_subsys ceph_subsys_mds | |
34 | ||
35 | #define HEADER_LEN 4096 | |
36 | ||
37 | int Dumper::init(mds_role_t role_) | |
38 | { | |
39 | role = role_; | |
40 | ||
41 | int r = MDSUtility::init(); | |
42 | if (r < 0) { | |
43 | return r; | |
44 | } | |
45 | ||
46 | auto fs = fsmap->get_filesystem(role.fscid); | |
47 | assert(fs != nullptr); | |
48 | ||
49 | JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool()); | |
50 | int jp_load_result = jp.load(objecter); | |
51 | if (jp_load_result != 0) { | |
52 | std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl; | |
53 | return jp_load_result; | |
54 | } else { | |
55 | ino = jp.front; | |
56 | return 0; | |
57 | } | |
58 | } | |
59 | ||
60 | ||
61 | int Dumper::recover_journal(Journaler *journaler) | |
62 | { | |
63 | C_SaferCond cond; | |
64 | lock.Lock(); | |
65 | journaler->recover(&cond); | |
66 | lock.Unlock(); | |
67 | const int r = cond.wait(); | |
68 | ||
69 | if (r < 0) { // Error | |
70 | derr << "error on recovery: " << cpp_strerror(r) << dendl; | |
71 | return r; | |
72 | } else { | |
73 | dout(10) << "completed journal recovery" << dendl; | |
74 | return 0; | |
75 | } | |
76 | } | |
77 | ||
78 | ||
79 | int Dumper::dump(const char *dump_file) | |
80 | { | |
81 | int r = 0; | |
82 | ||
83 | auto fs = fsmap->get_filesystem(role.fscid); | |
84 | assert(fs != nullptr); | |
85 | ||
86 | Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(), | |
87 | CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, | |
88 | &finisher); | |
89 | r = recover_journal(&journaler); | |
90 | if (r) { | |
91 | return r; | |
92 | } | |
93 | uint64_t start = journaler.get_read_pos(); | |
94 | uint64_t end = journaler.get_write_pos(); | |
95 | uint64_t len = end-start; | |
96 | ||
97 | Filer filer(objecter, &finisher); | |
98 | ||
99 | cout << "journal is " << start << "~" << len << std::endl; | |
100 | ||
101 | int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC, 0644); | |
102 | if (fd >= 0) { | |
103 | // include an informative header | |
104 | uuid_d fsid = monc->get_fsid(); | |
105 | char fsid_str[40]; | |
106 | fsid.print(fsid_str); | |
107 | char buf[HEADER_LEN]; | |
108 | memset(buf, 0, sizeof(buf)); | |
109 | snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\ | |
110 | length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\ | |
111 | trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\ | |
112 | object_size %lu (0x%lx)\n fsid %s\n%c", | |
113 | role.rank, | |
114 | (unsigned long long)start, (unsigned long long)start, | |
115 | (unsigned long long)len, (unsigned long long)len, | |
116 | (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos, | |
117 | (unsigned long long)journaler.last_committed.stream_format, | |
118 | (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos, | |
119 | (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit, | |
120 | (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count, | |
121 | (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size, | |
122 | fsid_str, | |
123 | 4); | |
124 | r = safe_write(fd, buf, sizeof(buf)); | |
125 | if (r) { | |
126 | derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl; | |
127 | ::close(fd); | |
128 | return r; | |
129 | } | |
130 | ||
131 | // write the data | |
132 | off64_t seeked = ::lseek64(fd, start, SEEK_SET); | |
133 | if (seeked == (off64_t)-1) { | |
134 | r = errno; | |
135 | derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl; | |
136 | ::close(fd); | |
137 | return r; | |
138 | } | |
139 | ||
140 | ||
141 | // Read and write 32MB chunks. Slower than it could be because we're not | |
142 | // streaming, but that's okay because this is just a debug/disaster tool. | |
143 | const uint32_t chunk_size = 32 * 1024 * 1024; | |
144 | ||
145 | for (uint64_t pos = start; pos < start + len; pos += chunk_size) { | |
146 | bufferlist bl; | |
147 | dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl; | |
148 | ||
149 | const uint32_t read_size = MIN(chunk_size, end - pos); | |
150 | ||
151 | C_SaferCond cond; | |
152 | lock.Lock(); | |
153 | filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP, | |
154 | pos, read_size, &bl, 0, &cond); | |
155 | lock.Unlock(); | |
156 | r = cond.wait(); | |
157 | if (r < 0) { | |
158 | derr << "Error " << r << " (" << cpp_strerror(r) << ") reading " | |
159 | "journal at offset 0x" << std::hex << pos << std::dec << dendl; | |
160 | ::close(fd); | |
161 | return r; | |
162 | } | |
163 | dout(10) << "Got 0x" << std::hex << bl.length() << std::dec | |
164 | << " bytes" << dendl; | |
165 | ||
166 | r = bl.write_fd(fd); | |
167 | if (r) { | |
168 | derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl; | |
169 | ::close(fd); | |
170 | return r; | |
171 | } | |
172 | } | |
173 | ||
174 | r = ::close(fd); | |
175 | if (r) { | |
176 | r = errno; | |
177 | derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl; | |
178 | return r; | |
179 | } | |
180 | ||
181 | cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n" | |
182 | << "NOTE: this is a _sparse_ file; you can\n" | |
183 | << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n" | |
184 | << " to efficiently compress it while preserving sparseness." << std::endl; | |
185 | return 0; | |
186 | } else { | |
187 | int err = errno; | |
188 | derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl; | |
189 | return err; | |
190 | } | |
191 | } | |
192 | ||
193 | int Dumper::undump(const char *dump_file, bool force) | |
194 | { | |
195 | cout << "undump " << dump_file << std::endl; | |
196 | ||
197 | auto fs = fsmap->get_filesystem(role.fscid); | |
198 | assert(fs != nullptr); | |
199 | ||
200 | int r = 0; | |
201 | // try get layout info from cluster | |
202 | Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(), | |
203 | CEPH_FS_ONDISK_MAGIC, objecter, 0, 0, | |
204 | &finisher); | |
205 | int recovered = recover_journal(&journaler); | |
206 | if (recovered != 0) { | |
207 | derr << "recover_journal failed, try to get header from dump file " << dendl; | |
208 | } | |
209 | ||
210 | int fd = ::open(dump_file, O_RDONLY); | |
211 | if (fd < 0) { | |
212 | r = errno; | |
213 | derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl; | |
214 | return r; | |
215 | } | |
216 | ||
217 | // Ceph mds0 journal dump | |
218 | // start offset 232401996 (0xdda2c4c) | |
219 | // length 1097504 (0x10bf20) | |
220 | ||
221 | char buf[HEADER_LEN]; | |
222 | r = safe_read(fd, buf, sizeof(buf)); | |
223 | if (r < 0) { | |
224 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
225 | return r; | |
226 | } | |
227 | ||
228 | long long unsigned start, len, write_pos, format, trimmed_pos; | |
229 | long unsigned stripe_unit, stripe_count, object_size; | |
230 | sscanf(strstr(buf, "start offset"), "start offset %llu", &start); | |
231 | sscanf(strstr(buf, "length"), "length %llu", &len); | |
232 | sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos); | |
233 | sscanf(strstr(buf, "format"), "format %llu", &format); | |
234 | ||
235 | if (!force) { | |
236 | // need to check if fsid match onlien cluster fsid | |
237 | if (strstr(buf, "fsid")) { | |
238 | uuid_d fsid; | |
239 | char fsid_str[40]; | |
240 | sscanf(strstr(buf, "fsid"), "fsid %s", fsid_str); | |
241 | r = fsid.parse(fsid_str); | |
242 | if (!r) { | |
243 | derr << "Invalid fsid" << dendl; | |
244 | ::close(fd); | |
245 | return -EINVAL; | |
246 | } | |
247 | ||
248 | if (fsid != monc->get_fsid()) { | |
249 | derr << "Imported journal fsid does not match online cluster fsid" << dendl; | |
250 | derr << "Use --force to skip fsid check" << dendl; | |
251 | ::close(fd); | |
252 | return -EINVAL; | |
253 | } | |
254 | } else { | |
255 | derr << "Invalid header, no fsid embeded" << dendl; | |
256 | ::close(fd); | |
257 | return -EINVAL; | |
258 | } | |
259 | } | |
260 | ||
261 | if (recovered == 0) { | |
262 | stripe_unit = journaler.last_committed.layout.stripe_unit; | |
263 | stripe_count = journaler.last_committed.layout.stripe_count; | |
264 | object_size = journaler.last_committed.layout.object_size; | |
265 | } else { | |
266 | // try to get layout from dump file header, if failed set layout to default | |
267 | if (strstr(buf, "stripe_unit")) { | |
268 | sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit); | |
269 | } else { | |
270 | stripe_unit = file_layout_t::get_default().stripe_unit; | |
271 | } | |
272 | if (strstr(buf, "stripe_count")) { | |
273 | sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count); | |
274 | } else { | |
275 | stripe_count = file_layout_t::get_default().stripe_count; | |
276 | } | |
277 | if (strstr(buf, "object_size")) { | |
278 | sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size); | |
279 | } else { | |
280 | object_size = file_layout_t::get_default().object_size; | |
281 | } | |
282 | } | |
283 | ||
284 | if (strstr(buf, "trimmed_pos")) { | |
285 | sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos); | |
286 | } else { | |
287 | // Old format dump, any untrimmed objects before expire_pos will | |
288 | // be discarded as trash. | |
289 | trimmed_pos = start - (start % object_size); | |
290 | } | |
291 | ||
292 | if (trimmed_pos > start) { | |
293 | derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos | |
294 | << " > expire 0x" << start << std::dec << dendl; | |
295 | ::close(fd); | |
296 | return -EINVAL; | |
297 | } | |
298 | ||
299 | if (start > write_pos) { | |
300 | derr << std::hex << "Invalid header (expire 0x" << start | |
301 | << " > write 0x" << write_pos << std::dec << dendl; | |
302 | ::close(fd); | |
303 | return -EINVAL; | |
304 | } | |
305 | ||
306 | cout << "start " << start << | |
307 | " len " << len << | |
308 | " write_pos " << write_pos << | |
309 | " format " << format << | |
310 | " trimmed_pos " << trimmed_pos << | |
311 | " stripe_unit " << stripe_unit << | |
312 | " stripe_count " << stripe_count << | |
313 | " object_size " << object_size << std::endl; | |
314 | ||
315 | Journaler::Header h; | |
316 | h.trimmed_pos = trimmed_pos; | |
317 | h.expire_pos = start; | |
318 | h.write_pos = write_pos; | |
319 | h.stream_format = format; | |
320 | h.magic = CEPH_FS_ONDISK_MAGIC; | |
321 | ||
322 | h.layout.stripe_unit = stripe_unit; | |
323 | h.layout.stripe_count = stripe_count; | |
324 | h.layout.object_size = object_size; | |
325 | h.layout.pool_id = fs->mds_map.get_metadata_pool(); | |
326 | ||
327 | bufferlist hbl; | |
328 | ::encode(h, hbl); | |
329 | ||
330 | object_t oid = file_object_t(ino, 0); | |
331 | object_locator_t oloc(fs->mds_map.get_metadata_pool()); | |
332 | SnapContext snapc; | |
333 | ||
334 | cout << "writing header " << oid << std::endl; | |
335 | C_SaferCond header_cond; | |
336 | lock.Lock(); | |
337 | objecter->write_full(oid, oloc, snapc, hbl, | |
338 | ceph::real_clock::now(), 0, | |
339 | &header_cond); | |
340 | lock.Unlock(); | |
341 | ||
342 | r = header_cond.wait(); | |
343 | if (r != 0) { | |
344 | derr << "Failed to write header: " << cpp_strerror(r) << dendl; | |
345 | ::close(fd); | |
346 | return r; | |
347 | } | |
348 | ||
349 | Filer filer(objecter, &finisher); | |
350 | ||
351 | /* Erase any objects at the end of the region to which we shall write | |
352 | * the new log data. This is to avoid leaving trailing junk after | |
353 | * the newly written data. Any junk more than one object ahead | |
354 | * will be taken care of during normal operation by Journaler's | |
355 | * prezeroing behaviour */ | |
356 | { | |
357 | uint32_t const object_size = h.layout.object_size; | |
358 | assert(object_size > 0); | |
359 | uint64_t last_obj = h.write_pos / object_size; | |
360 | uint64_t purge_count = 2; | |
361 | /* When the length is zero, the last_obj should be zeroed | |
362 | * from the offset determined by the new write_pos instead of being purged. | |
363 | */ | |
364 | if (!len) { | |
365 | purge_count = 1; | |
366 | ++last_obj; | |
367 | } | |
368 | C_SaferCond purge_cond; | |
369 | cout << "Purging " << purge_count << " objects from " << last_obj << std::endl; | |
370 | lock.Lock(); | |
371 | filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count, | |
372 | ceph::real_clock::now(), 0, &purge_cond); | |
373 | lock.Unlock(); | |
374 | purge_cond.wait(); | |
375 | } | |
376 | /* When the length is zero, zero the last object | |
377 | * from the offset determined by the new write_pos. | |
378 | */ | |
379 | if (!len) { | |
380 | uint64_t offset_in_obj = h.write_pos % h.layout.object_size; | |
381 | uint64_t len = h.layout.object_size - offset_in_obj; | |
382 | C_SaferCond zero_cond; | |
383 | cout << "Zeroing " << len << " bytes in the last object." << std::endl; | |
384 | ||
385 | lock.Lock(); | |
386 | filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond); | |
387 | lock.Unlock(); | |
388 | zero_cond.wait(); | |
389 | } | |
390 | ||
391 | // Stream from `fd` to `filer` | |
392 | uint64_t pos = start; | |
393 | uint64_t left = len; | |
394 | while (left > 0) { | |
395 | // Read | |
396 | bufferlist j; | |
397 | lseek64(fd, pos, SEEK_SET); | |
398 | uint64_t l = MIN(left, 1024*1024); | |
399 | j.read_fd(fd, l); | |
400 | ||
401 | // Write | |
402 | cout << " writing " << pos << "~" << l << std::endl; | |
403 | C_SaferCond write_cond; | |
404 | lock.Lock(); | |
405 | filer.write(ino, &h.layout, snapc, pos, l, j, | |
406 | ceph::real_clock::now(), 0, &write_cond); | |
407 | lock.Unlock(); | |
408 | ||
409 | r = write_cond.wait(); | |
410 | if (r != 0) { | |
411 | derr << "Failed to write header: " << cpp_strerror(r) << dendl; | |
412 | ::close(fd); | |
413 | return r; | |
414 | } | |
415 | ||
416 | // Advance | |
417 | pos += l; | |
418 | left -= l; | |
419 | } | |
420 | ||
421 | VOID_TEMP_FAILURE_RETRY(::close(fd)); | |
422 | cout << "done." << std::endl; | |
423 | return 0; | |
424 | } | |
425 |