]> git.proxmox.com Git - ceph.git/blame - ceph/src/tools/cephfs/Dumper.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / tools / cephfs / Dumper.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2010 Greg Farnum <gregf@hq.newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef _BACKWARD_BACKWARD_WARNING_H
16#define _BACKWARD_BACKWARD_WARNING_H // make gcc 4.3 shut up about hash_*
17#endif
18
19#include "include/compat.h"
20#include "include/fs_types.h"
21#include "common/entity_name.h"
22#include "common/errno.h"
23#include "common/safe_io.h"
24#include "mds/mdstypes.h"
25#include "mds/LogEvent.h"
26#include "mds/JournalPointer.h"
27#include "osdc/Journaler.h"
91327a77 28#include "mon/MonClient.h"
7c673cae
FG
29
30#include "Dumper.h"
31
32#define dout_context g_ceph_context
33#define dout_subsys ceph_subsys_mds
34
35#define HEADER_LEN 4096
36
11fdf7f2 37int Dumper::init(mds_role_t role_, const std::string &type)
7c673cae
FG
38{
39 role = role_;
40
41 int r = MDSUtility::init();
42 if (r < 0) {
43 return r;
44 }
45
46 auto fs = fsmap->get_filesystem(role.fscid);
11fdf7f2
TL
47 ceph_assert(fs != nullptr);
48
49 if (type == "mdlog") {
50 JournalPointer jp(role.rank, fs->mds_map.get_metadata_pool());
51 int jp_load_result = jp.load(objecter);
52 if (jp_load_result != 0) {
53 std::cerr << "Error loading journal: " << cpp_strerror(jp_load_result) << std::endl;
54 return jp_load_result;
55 } else {
56 ino = jp.front;
57 }
58 } else if (type == "purge_queue") {
59 ino = MDS_INO_PURGE_QUEUE + role.rank;
7c673cae 60 } else {
11fdf7f2 61 ceph_abort(); // should not get here
7c673cae 62 }
11fdf7f2 63 return 0;
7c673cae
FG
64}
65
66
67int Dumper::recover_journal(Journaler *journaler)
68{
69 C_SaferCond cond;
9f95a23c 70 lock.lock();
7c673cae 71 journaler->recover(&cond);
9f95a23c 72 lock.unlock();
d2e6a577 73 const int r = cond.wait();
7c673cae
FG
74
75 if (r < 0) { // Error
76 derr << "error on recovery: " << cpp_strerror(r) << dendl;
77 return r;
78 } else {
79 dout(10) << "completed journal recovery" << dendl;
80 return 0;
81 }
82}
83
84
85int Dumper::dump(const char *dump_file)
86{
87 int r = 0;
88
89 auto fs = fsmap->get_filesystem(role.fscid);
11fdf7f2 90 ceph_assert(fs != nullptr);
7c673cae
FG
91
92 Journaler journaler("dumper", ino, fs->mds_map.get_metadata_pool(),
93 CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
94 &finisher);
95 r = recover_journal(&journaler);
96 if (r) {
97 return r;
98 }
99 uint64_t start = journaler.get_read_pos();
100 uint64_t end = journaler.get_write_pos();
101 uint64_t len = end-start;
102
103 Filer filer(objecter, &finisher);
104
105 cout << "journal is " << start << "~" << len << std::endl;
106
107 int fd = ::open(dump_file, O_WRONLY|O_CREAT|O_TRUNC, 0644);
108 if (fd >= 0) {
109 // include an informative header
91327a77
AA
110 uuid_d fsid = monc->get_fsid();
111 char fsid_str[40];
112 fsid.print(fsid_str);
7c673cae
FG
113 char buf[HEADER_LEN];
114 memset(buf, 0, sizeof(buf));
91327a77
AA
115 snprintf(buf, HEADER_LEN, "Ceph mds%d journal dump\n start offset %llu (0x%llx)\n\
116 length %llu (0x%llx)\n write_pos %llu (0x%llx)\n format %llu\n\
117 trimmed_pos %llu (0x%llx)\n stripe_unit %lu (0x%lx)\n stripe_count %lu (0x%lx)\n\
118 object_size %lu (0x%lx)\n fsid %s\n%c",
7c673cae
FG
119 role.rank,
120 (unsigned long long)start, (unsigned long long)start,
121 (unsigned long long)len, (unsigned long long)len,
122 (unsigned long long)journaler.last_committed.write_pos, (unsigned long long)journaler.last_committed.write_pos,
123 (unsigned long long)journaler.last_committed.stream_format,
124 (unsigned long long)journaler.last_committed.trimmed_pos, (unsigned long long)journaler.last_committed.trimmed_pos,
91327a77
AA
125 (unsigned long)journaler.last_committed.layout.stripe_unit, (unsigned long)journaler.last_committed.layout.stripe_unit,
126 (unsigned long)journaler.last_committed.layout.stripe_count, (unsigned long)journaler.last_committed.layout.stripe_count,
127 (unsigned long)journaler.last_committed.layout.object_size, (unsigned long)journaler.last_committed.layout.object_size,
128 fsid_str,
7c673cae
FG
129 4);
130 r = safe_write(fd, buf, sizeof(buf));
131 if (r) {
132 derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file header" << dendl;
133 ::close(fd);
134 return r;
135 }
136
137 // write the data
138 off64_t seeked = ::lseek64(fd, start, SEEK_SET);
139 if (seeked == (off64_t)-1) {
140 r = errno;
141 derr << "Error " << r << " (" << cpp_strerror(r) << ") seeking to 0x" << std::hex << start << std::dec << dendl;
142 ::close(fd);
143 return r;
144 }
145
146
147 // Read and write 32MB chunks. Slower than it could be because we're not
148 // streaming, but that's okay because this is just a debug/disaster tool.
149 const uint32_t chunk_size = 32 * 1024 * 1024;
150
151 for (uint64_t pos = start; pos < start + len; pos += chunk_size) {
152 bufferlist bl;
153 dout(10) << "Reading at pos=0x" << std::hex << pos << std::dec << dendl;
154
11fdf7f2 155 const uint32_t read_size = std::min<uint64_t>(chunk_size, end - pos);
7c673cae
FG
156
157 C_SaferCond cond;
9f95a23c 158 lock.lock();
7c673cae
FG
159 filer.read(ino, &journaler.get_layout(), CEPH_NOSNAP,
160 pos, read_size, &bl, 0, &cond);
9f95a23c 161 lock.unlock();
7c673cae
FG
162 r = cond.wait();
163 if (r < 0) {
164 derr << "Error " << r << " (" << cpp_strerror(r) << ") reading "
165 "journal at offset 0x" << std::hex << pos << std::dec << dendl;
166 ::close(fd);
167 return r;
168 }
169 dout(10) << "Got 0x" << std::hex << bl.length() << std::dec
170 << " bytes" << dendl;
171
172 r = bl.write_fd(fd);
173 if (r) {
174 derr << "Error " << r << " (" << cpp_strerror(r) << ") writing journal file" << dendl;
175 ::close(fd);
176 return r;
177 }
178 }
179
180 r = ::close(fd);
181 if (r) {
182 r = errno;
183 derr << "Error " << r << " (" << cpp_strerror(r) << ") closing journal file" << dendl;
184 return r;
185 }
186
187 cout << "wrote " << len << " bytes at offset " << start << " to " << dump_file << "\n"
188 << "NOTE: this is a _sparse_ file; you can\n"
189 << "\t$ tar cSzf " << dump_file << ".tgz " << dump_file << "\n"
190 << " to efficiently compress it while preserving sparseness." << std::endl;
191 return 0;
192 } else {
193 int err = errno;
194 derr << "unable to open " << dump_file << ": " << cpp_strerror(err) << dendl;
195 return err;
196 }
197}
198
91327a77 199int Dumper::undump(const char *dump_file, bool force)
7c673cae
FG
200{
201 cout << "undump " << dump_file << std::endl;
202
203 auto fs = fsmap->get_filesystem(role.fscid);
11fdf7f2 204 ceph_assert(fs != nullptr);
7c673cae
FG
205
206 int r = 0;
91327a77
AA
207 // try get layout info from cluster
208 Journaler journaler("umdumper", ino, fs->mds_map.get_metadata_pool(),
209 CEPH_FS_ONDISK_MAGIC, objecter, 0, 0,
210 &finisher);
211 int recovered = recover_journal(&journaler);
212 if (recovered != 0) {
213 derr << "recover_journal failed, try to get header from dump file " << dendl;
214 }
215
7c673cae
FG
216 int fd = ::open(dump_file, O_RDONLY);
217 if (fd < 0) {
218 r = errno;
219 derr << "couldn't open " << dump_file << ": " << cpp_strerror(r) << dendl;
220 return r;
221 }
222
223 // Ceph mds0 journal dump
224 // start offset 232401996 (0xdda2c4c)
225 // length 1097504 (0x10bf20)
226
227 char buf[HEADER_LEN];
228 r = safe_read(fd, buf, sizeof(buf));
229 if (r < 0) {
230 VOID_TEMP_FAILURE_RETRY(::close(fd));
231 return r;
232 }
233
234 long long unsigned start, len, write_pos, format, trimmed_pos;
91327a77 235 long unsigned stripe_unit, stripe_count, object_size;
7c673cae
FG
236 sscanf(strstr(buf, "start offset"), "start offset %llu", &start);
237 sscanf(strstr(buf, "length"), "length %llu", &len);
238 sscanf(strstr(buf, "write_pos"), "write_pos %llu", &write_pos);
239 sscanf(strstr(buf, "format"), "format %llu", &format);
91327a77
AA
240
241 if (!force) {
242 // need to check if fsid match onlien cluster fsid
243 if (strstr(buf, "fsid")) {
244 uuid_d fsid;
245 char fsid_str[40];
92f5a8d4 246 sscanf(strstr(buf, "fsid"), "fsid %39s", fsid_str);
91327a77
AA
247 r = fsid.parse(fsid_str);
248 if (!r) {
249 derr << "Invalid fsid" << dendl;
250 ::close(fd);
251 return -EINVAL;
252 }
253
254 if (fsid != monc->get_fsid()) {
255 derr << "Imported journal fsid does not match online cluster fsid" << dendl;
256 derr << "Use --force to skip fsid check" << dendl;
257 ::close(fd);
258 return -EINVAL;
259 }
260 } else {
261 derr << "Invalid header, no fsid embeded" << dendl;
262 ::close(fd);
263 return -EINVAL;
264 }
265 }
266
267 if (recovered == 0) {
268 stripe_unit = journaler.last_committed.layout.stripe_unit;
269 stripe_count = journaler.last_committed.layout.stripe_count;
270 object_size = journaler.last_committed.layout.object_size;
271 } else {
272 // try to get layout from dump file header, if failed set layout to default
273 if (strstr(buf, "stripe_unit")) {
274 sscanf(strstr(buf, "stripe_unit"), "stripe_unit %lu", &stripe_unit);
275 } else {
276 stripe_unit = file_layout_t::get_default().stripe_unit;
277 }
278 if (strstr(buf, "stripe_count")) {
279 sscanf(strstr(buf, "stripe_count"), "stripe_count %lu", &stripe_count);
280 } else {
281 stripe_count = file_layout_t::get_default().stripe_count;
282 }
283 if (strstr(buf, "object_size")) {
284 sscanf(strstr(buf, "object_size"), "object_size %lu", &object_size);
285 } else {
286 object_size = file_layout_t::get_default().object_size;
287 }
288 }
289
7c673cae
FG
290 if (strstr(buf, "trimmed_pos")) {
291 sscanf(strstr(buf, "trimmed_pos"), "trimmed_pos %llu", &trimmed_pos);
292 } else {
293 // Old format dump, any untrimmed objects before expire_pos will
294 // be discarded as trash.
91327a77 295 trimmed_pos = start - (start % object_size);
7c673cae
FG
296 }
297
298 if (trimmed_pos > start) {
299 derr << std::hex << "Invalid header (trimmed 0x" << trimmed_pos
300 << " > expire 0x" << start << std::dec << dendl;
301 ::close(fd);
302 return -EINVAL;
303 }
304
305 if (start > write_pos) {
306 derr << std::hex << "Invalid header (expire 0x" << start
307 << " > write 0x" << write_pos << std::dec << dendl;
308 ::close(fd);
309 return -EINVAL;
310 }
311
312 cout << "start " << start <<
313 " len " << len <<
314 " write_pos " << write_pos <<
315 " format " << format <<
91327a77
AA
316 " trimmed_pos " << trimmed_pos <<
317 " stripe_unit " << stripe_unit <<
318 " stripe_count " << stripe_count <<
319 " object_size " << object_size << std::endl;
7c673cae
FG
320
321 Journaler::Header h;
322 h.trimmed_pos = trimmed_pos;
323 h.expire_pos = start;
324 h.write_pos = write_pos;
325 h.stream_format = format;
326 h.magic = CEPH_FS_ONDISK_MAGIC;
327
91327a77
AA
328 h.layout.stripe_unit = stripe_unit;
329 h.layout.stripe_count = stripe_count;
330 h.layout.object_size = object_size;
7c673cae
FG
331 h.layout.pool_id = fs->mds_map.get_metadata_pool();
332
333 bufferlist hbl;
11fdf7f2 334 encode(h, hbl);
7c673cae
FG
335
336 object_t oid = file_object_t(ino, 0);
337 object_locator_t oloc(fs->mds_map.get_metadata_pool());
338 SnapContext snapc;
339
340 cout << "writing header " << oid << std::endl;
341 C_SaferCond header_cond;
9f95a23c 342 lock.lock();
7c673cae
FG
343 objecter->write_full(oid, oloc, snapc, hbl,
344 ceph::real_clock::now(), 0,
345 &header_cond);
9f95a23c 346 lock.unlock();
7c673cae
FG
347
348 r = header_cond.wait();
349 if (r != 0) {
350 derr << "Failed to write header: " << cpp_strerror(r) << dendl;
351 ::close(fd);
352 return r;
353 }
354
355 Filer filer(objecter, &finisher);
356
357 /* Erase any objects at the end of the region to which we shall write
358 * the new log data. This is to avoid leaving trailing junk after
359 * the newly written data. Any junk more than one object ahead
360 * will be taken care of during normal operation by Journaler's
361 * prezeroing behaviour */
362 {
363 uint32_t const object_size = h.layout.object_size;
11fdf7f2 364 ceph_assert(object_size > 0);
1adf2230
AA
365 uint64_t last_obj = h.write_pos / object_size;
366 uint64_t purge_count = 2;
367 /* When the length is zero, the last_obj should be zeroed
368 * from the offset determined by the new write_pos instead of being purged.
369 */
370 if (!len) {
371 purge_count = 1;
372 ++last_obj;
373 }
7c673cae
FG
374 C_SaferCond purge_cond;
375 cout << "Purging " << purge_count << " objects from " << last_obj << std::endl;
9f95a23c 376 lock.lock();
7c673cae
FG
377 filer.purge_range(ino, &h.layout, snapc, last_obj, purge_count,
378 ceph::real_clock::now(), 0, &purge_cond);
9f95a23c 379 lock.unlock();
7c673cae
FG
380 purge_cond.wait();
381 }
1adf2230
AA
382 /* When the length is zero, zero the last object
383 * from the offset determined by the new write_pos.
384 */
385 if (!len) {
386 uint64_t offset_in_obj = h.write_pos % h.layout.object_size;
387 uint64_t len = h.layout.object_size - offset_in_obj;
388 C_SaferCond zero_cond;
389 cout << "Zeroing " << len << " bytes in the last object." << std::endl;
390
9f95a23c 391 lock.lock();
1adf2230 392 filer.zero(ino, &h.layout, snapc, h.write_pos, len, ceph::real_clock::now(), 0, &zero_cond);
9f95a23c 393 lock.unlock();
1adf2230
AA
394 zero_cond.wait();
395 }
7c673cae
FG
396
397 // Stream from `fd` to `filer`
398 uint64_t pos = start;
399 uint64_t left = len;
400 while (left > 0) {
401 // Read
402 bufferlist j;
403 lseek64(fd, pos, SEEK_SET);
11fdf7f2 404 uint64_t l = std::min<uint64_t>(left, 1024*1024);
7c673cae
FG
405 j.read_fd(fd, l);
406
407 // Write
408 cout << " writing " << pos << "~" << l << std::endl;
409 C_SaferCond write_cond;
9f95a23c 410 lock.lock();
7c673cae
FG
411 filer.write(ino, &h.layout, snapc, pos, l, j,
412 ceph::real_clock::now(), 0, &write_cond);
9f95a23c 413 lock.unlock();
7c673cae
FG
414
415 r = write_cond.wait();
416 if (r != 0) {
417 derr << "Failed to write header: " << cpp_strerror(r) << dendl;
418 ::close(fd);
419 return r;
420 }
421
422 // Advance
423 pos += l;
424 left -= l;
425 }
426
427 VOID_TEMP_FAILURE_RETRY(::close(fd));
428 cout << "done." << std::endl;
429 return 0;
430}
431