]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.cc
157b2422ab3989371bb7c54a850742faec30e295
[ceph.git] / ceph / src / osd / PGBackend.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
25 #include "OSD.h"
26 #include "erasure-code/ErasureCodePlugin.h"
27 #include "OSDMap.h"
28 #include "PGLog.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
32
33 #define dout_context cct
34 #define dout_subsys ceph_subsys_osd
35 #define DOUT_PREFIX_ARGS this
36 #undef dout_prefix
37 #define dout_prefix _prefix(_dout, this)
38 static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
39 return *_dout << pgb->get_parent()->gen_dbg_prefix();
40 }
41
42 void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
43 RecoveryHandle *h)
44 {
45 assert(get_parent()->get_actingbackfill_shards().size() > 0);
46 for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
47 if (shard == get_parent()->whoami_shard())
48 continue;
49 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
50 dout(20) << __func__ << " will remove " << oid << " " << v << " from "
51 << shard << dendl;
52 h->deletes[shard].push_back(make_pair(oid, v));
53 get_parent()->begin_peer_recover(shard, oid);
54 }
55 }
56 }
57
58 void PGBackend::send_recovery_deletes(int prio,
59 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
60 {
61 epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
62 for (const auto& p : deletes) {
63 const auto& shard = p.first;
64 const auto& objects = p.second;
65 ConnectionRef con = get_parent()->get_con_osd_cluster(
66 shard.osd,
67 get_osdmap()->get_epoch());
68 if (!con)
69 continue;
70 auto it = objects.begin();
71 while (it != objects.end()) {
72 uint64_t cost = 0;
73 uint64_t deletes = 0;
74 spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
75 MOSDPGRecoveryDelete *msg =
76 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
77 target_pg,
78 get_osdmap()->get_epoch(),
79 min_epoch);
80 msg->set_priority(prio);
81
82 while (it != objects.end() &&
83 cost < cct->_conf->osd_max_push_cost &&
84 deletes < cct->_conf->osd_max_push_objects) {
85 dout(20) << __func__ << ": sending recovery delete << " << it->first
86 << " " << it->second << " to osd." << shard << dendl;
87 msg->objects.push_back(*it);
88 cost += cct->_conf->osd_push_per_object_cost;
89 ++deletes;
90 ++it;
91 }
92
93 msg->set_cost(cost);
94 get_parent()->send_message_osd_cluster(msg, con);
95 }
96 }
97 }
98
99 bool PGBackend::handle_message(OpRequestRef op)
100 {
101 switch (op->get_req()->get_type()) {
102 case MSG_OSD_PG_RECOVERY_DELETE:
103 handle_recovery_delete(op);
104 return true;
105
106 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
107 handle_recovery_delete_reply(op);
108 return true;
109
110 default:
111 break;
112 }
113
114 return _handle_message(op);
115 }
116
117 void PGBackend::handle_recovery_delete(OpRequestRef op)
118 {
119 const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req());
120 assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
121 dout(20) << __func__ << " " << op << dendl;
122
123 op->mark_started();
124
125 C_GatherBuilder gather(cct);
126 for (const auto &p : m->objects) {
127 get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
128 }
129
130 MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
131 reply->from = get_parent()->whoami_shard();
132 reply->set_priority(m->get_priority());
133 reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
134 reply->map_epoch = m->map_epoch;
135 reply->min_epoch = m->min_epoch;
136 reply->objects = m->objects;
137 ConnectionRef conn = m->get_connection();
138
139 gather.set_finisher(new FunctionContext(
140 [=](int r) {
141 if (r != -EAGAIN) {
142 get_parent()->send_message_osd_cluster(reply, conn.get());
143 }
144 }));
145 gather.activate();
146 }
147
148 void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
149 {
150 const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req());
151 assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
152 dout(20) << __func__ << " " << op << dendl;
153
154 for (const auto &p : m->objects) {
155 ObjectRecoveryInfo recovery_info;
156 hobject_t oid = p.first;
157 recovery_info.version = p.second;
158 get_parent()->on_peer_recover(m->from, oid, recovery_info);
159 bool peers_recovered = true;
160 for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
161 if (shard == get_parent()->whoami_shard())
162 continue;
163 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
164 dout(20) << __func__ << " " << oid << " still missing on at least "
165 << shard << dendl;
166 peers_recovered = false;
167 break;
168 }
169 }
170 if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
171 dout(20) << __func__ << " completed recovery, local_missing = "
172 << get_parent()->get_local_missing() << dendl;
173 object_stat_sum_t stat_diff;
174 stat_diff.num_objects_recovered = 1;
175 get_parent()->on_global_recover(p.first, stat_diff, true);
176 }
177 }
178 }
179
180 void PGBackend::rollback(
181 const pg_log_entry_t &entry,
182 ObjectStore::Transaction *t)
183 {
184
185 struct RollbackVisitor : public ObjectModDesc::Visitor {
186 const hobject_t &hoid;
187 PGBackend *pg;
188 ObjectStore::Transaction t;
189 RollbackVisitor(
190 const hobject_t &hoid,
191 PGBackend *pg) : hoid(hoid), pg(pg) {}
192 void append(uint64_t old_size) override {
193 ObjectStore::Transaction temp;
194 pg->rollback_append(hoid, old_size, &temp);
195 temp.append(t);
196 temp.swap(t);
197 }
198 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
199 ObjectStore::Transaction temp;
200 pg->rollback_setattrs(hoid, attrs, &temp);
201 temp.append(t);
202 temp.swap(t);
203 }
204 void rmobject(version_t old_version) override {
205 ObjectStore::Transaction temp;
206 pg->rollback_stash(hoid, old_version, &temp);
207 temp.append(t);
208 temp.swap(t);
209 }
210 void try_rmobject(version_t old_version) override {
211 ObjectStore::Transaction temp;
212 pg->rollback_try_stash(hoid, old_version, &temp);
213 temp.append(t);
214 temp.swap(t);
215 }
216 void create() override {
217 ObjectStore::Transaction temp;
218 pg->rollback_create(hoid, &temp);
219 temp.append(t);
220 temp.swap(t);
221 }
222 void update_snaps(const set<snapid_t> &snaps) override {
223 ObjectStore::Transaction temp;
224 pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
225 temp.append(t);
226 temp.swap(t);
227 }
228 void rollback_extents(
229 version_t gen,
230 const vector<pair<uint64_t, uint64_t> > &extents) override {
231 ObjectStore::Transaction temp;
232 pg->rollback_extents(gen, extents, hoid, &temp);
233 temp.append(t);
234 temp.swap(t);
235 }
236 };
237
238 assert(entry.mod_desc.can_rollback());
239 RollbackVisitor vis(entry.soid, this);
240 entry.mod_desc.visit(&vis);
241 t->append(vis.t);
242 }
243
244 struct Trimmer : public ObjectModDesc::Visitor {
245 const hobject_t &soid;
246 PGBackend *pg;
247 ObjectStore::Transaction *t;
248 Trimmer(
249 const hobject_t &soid,
250 PGBackend *pg,
251 ObjectStore::Transaction *t)
252 : soid(soid), pg(pg), t(t) {}
253 void rmobject(version_t old_version) override {
254 pg->trim_rollback_object(
255 soid,
256 old_version,
257 t);
258 }
259 // try_rmobject defaults to rmobject
260 void rollback_extents(
261 version_t gen,
262 const vector<pair<uint64_t, uint64_t> > &extents) override {
263 pg->trim_rollback_object(
264 soid,
265 gen,
266 t);
267 }
268 };
269
270 void PGBackend::rollforward(
271 const pg_log_entry_t &entry,
272 ObjectStore::Transaction *t)
273 {
274 auto dpp = get_parent()->get_dpp();
275 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
276 if (!entry.can_rollback())
277 return;
278 Trimmer trimmer(entry.soid, this, t);
279 entry.mod_desc.visit(&trimmer);
280 }
281
282 void PGBackend::trim(
283 const pg_log_entry_t &entry,
284 ObjectStore::Transaction *t)
285 {
286 if (!entry.can_rollback())
287 return;
288 Trimmer trimmer(entry.soid, this, t);
289 entry.mod_desc.visit(&trimmer);
290 }
291
292 void PGBackend::try_stash(
293 const hobject_t &hoid,
294 version_t v,
295 ObjectStore::Transaction *t)
296 {
297 t->try_rename(
298 coll,
299 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
300 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
301 }
302
303 void PGBackend::remove(
304 const hobject_t &hoid,
305 ObjectStore::Transaction *t) {
306 assert(!hoid.is_temp());
307 t->remove(
308 coll,
309 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
310 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
311 }
312
313 void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
314 {
315 dout(10) << __func__ << dendl;
316 // clear temp
317 for (set<hobject_t>::iterator i = temp_contents.begin();
318 i != temp_contents.end();
319 ++i) {
320 dout(10) << __func__ << ": Removing oid "
321 << *i << " from the temp collection" << dendl;
322 t->remove(
323 coll,
324 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
325 }
326 temp_contents.clear();
327 }
328
329 int PGBackend::objects_list_partial(
330 const hobject_t &begin,
331 int min,
332 int max,
333 vector<hobject_t> *ls,
334 hobject_t *next)
335 {
336 assert(ls);
337 // Starts with the smallest generation to make sure the result list
338 // has the marker object (it might have multiple generations
339 // though, which would be filtered).
340 ghobject_t _next;
341 if (!begin.is_min())
342 _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
343 ls->reserve(max);
344 int r = 0;
345
346 if (min > max)
347 min = max;
348
349 while (!_next.is_max() && ls->size() < (unsigned)min) {
350 vector<ghobject_t> objects;
351 r = store->collection_list(
352 ch,
353 _next,
354 ghobject_t::get_max(),
355 max - ls->size(),
356 &objects,
357 &_next);
358 if (r != 0) {
359 derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
360 break;
361 }
362 for (vector<ghobject_t>::iterator i = objects.begin();
363 i != objects.end();
364 ++i) {
365 if (i->is_pgmeta() || i->hobj.is_temp()) {
366 continue;
367 }
368 if (i->is_no_gen()) {
369 ls->push_back(i->hobj);
370 }
371 }
372 }
373 if (r == 0)
374 *next = _next.hobj;
375 return r;
376 }
377
378 int PGBackend::objects_list_range(
379 const hobject_t &start,
380 const hobject_t &end,
381 snapid_t seq,
382 vector<hobject_t> *ls,
383 vector<ghobject_t> *gen_obs)
384 {
385 assert(ls);
386 vector<ghobject_t> objects;
387 int r = store->collection_list(
388 ch,
389 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
390 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
391 INT_MAX,
392 &objects,
393 NULL);
394 ls->reserve(objects.size());
395 for (vector<ghobject_t>::iterator i = objects.begin();
396 i != objects.end();
397 ++i) {
398 if (i->is_pgmeta() || i->hobj.is_temp()) {
399 continue;
400 }
401 if (i->is_no_gen()) {
402 ls->push_back(i->hobj);
403 } else if (gen_obs) {
404 gen_obs->push_back(*i);
405 }
406 }
407 return r;
408 }
409
410 int PGBackend::objects_get_attr(
411 const hobject_t &hoid,
412 const string &attr,
413 bufferlist *out)
414 {
415 bufferptr bp;
416 int r = store->getattr(
417 ch,
418 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
419 attr.c_str(),
420 bp);
421 if (r >= 0 && out) {
422 out->clear();
423 out->push_back(std::move(bp));
424 }
425 return r;
426 }
427
428 int PGBackend::objects_get_attrs(
429 const hobject_t &hoid,
430 map<string, bufferlist> *out)
431 {
432 return store->getattrs(
433 ch,
434 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
435 *out);
436 }
437
438 void PGBackend::rollback_setattrs(
439 const hobject_t &hoid,
440 map<string, boost::optional<bufferlist> > &old_attrs,
441 ObjectStore::Transaction *t) {
442 map<string, bufferlist> to_set;
443 assert(!hoid.is_temp());
444 for (map<string, boost::optional<bufferlist> >::iterator i = old_attrs.begin();
445 i != old_attrs.end();
446 ++i) {
447 if (i->second) {
448 to_set[i->first] = i->second.get();
449 } else {
450 t->rmattr(
451 coll,
452 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
453 i->first);
454 }
455 }
456 t->setattrs(
457 coll,
458 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
459 to_set);
460 }
461
462 void PGBackend::rollback_append(
463 const hobject_t &hoid,
464 uint64_t old_size,
465 ObjectStore::Transaction *t) {
466 assert(!hoid.is_temp());
467 t->truncate(
468 coll,
469 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
470 old_size);
471 }
472
473 void PGBackend::rollback_stash(
474 const hobject_t &hoid,
475 version_t old_version,
476 ObjectStore::Transaction *t) {
477 assert(!hoid.is_temp());
478 t->remove(
479 coll,
480 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
481 t->collection_move_rename(
482 coll,
483 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
484 coll,
485 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
486 }
487
488 void PGBackend::rollback_try_stash(
489 const hobject_t &hoid,
490 version_t old_version,
491 ObjectStore::Transaction *t) {
492 assert(!hoid.is_temp());
493 t->remove(
494 coll,
495 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
496 t->try_rename(
497 coll,
498 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
499 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
500 }
501
502 void PGBackend::rollback_extents(
503 version_t gen,
504 const vector<pair<uint64_t, uint64_t> > &extents,
505 const hobject_t &hoid,
506 ObjectStore::Transaction *t) {
507 auto shard = get_parent()->whoami_shard().shard;
508 for (auto &&extent: extents) {
509 t->clone_range(
510 coll,
511 ghobject_t(hoid, gen, shard),
512 ghobject_t(hoid, ghobject_t::NO_GEN, shard),
513 extent.first,
514 extent.second,
515 extent.first);
516 }
517 t->remove(
518 coll,
519 ghobject_t(hoid, gen, shard));
520 }
521
522 void PGBackend::trim_rollback_object(
523 const hobject_t &hoid,
524 version_t old_version,
525 ObjectStore::Transaction *t) {
526 assert(!hoid.is_temp());
527 t->remove(
528 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
529 }
530
531 PGBackend *PGBackend::build_pg_backend(
532 const pg_pool_t &pool,
533 const OSDMapRef curmap,
534 Listener *l,
535 coll_t coll,
536 ObjectStore::CollectionHandle &ch,
537 ObjectStore *store,
538 CephContext *cct)
539 {
540 switch (pool.type) {
541 case pg_pool_t::TYPE_REPLICATED: {
542 return new ReplicatedBackend(l, coll, ch, store, cct);
543 }
544 case pg_pool_t::TYPE_ERASURE: {
545 ErasureCodeInterfaceRef ec_impl;
546 ErasureCodeProfile profile = curmap->get_erasure_code_profile(pool.erasure_code_profile);
547 assert(profile.count("plugin"));
548 stringstream ss;
549 ceph::ErasureCodePluginRegistry::instance().factory(
550 profile.find("plugin")->second,
551 cct->_conf->get_val<std::string>("erasure_code_dir"),
552 profile,
553 &ec_impl,
554 &ss);
555 assert(ec_impl);
556 return new ECBackend(
557 l,
558 coll,
559 ch,
560 store,
561 cct,
562 ec_impl,
563 pool.stripe_width);
564 }
565 default:
566 ceph_abort();
567 return NULL;
568 }
569 }
570
571 /*
572 * pg lock may or may not be held
573 */
574 void PGBackend::be_scan_list(
575 ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
576 ThreadPool::TPHandle &handle)
577 {
578 dout(10) << __func__ << " scanning " << ls.size() << " objects"
579 << (deep ? " deeply" : "") << dendl;
580 int i = 0;
581 for (vector<hobject_t>::const_iterator p = ls.begin();
582 p != ls.end();
583 ++p, i++) {
584 handle.reset_tp_timeout();
585 hobject_t poid = *p;
586
587 struct stat st;
588 int r = store->stat(
589 ch,
590 ghobject_t(
591 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
592 &st,
593 true);
594 if (r == 0) {
595 ScrubMap::object &o = map.objects[poid];
596 o.size = st.st_size;
597 assert(!o.negative);
598 store->getattrs(
599 ch,
600 ghobject_t(
601 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
602 o.attrs);
603
604 // calculate the CRC32 on deep scrubs
605 if (deep) {
606 be_deep_scrub(*p, seed, o, handle);
607 }
608
609 dout(25) << __func__ << " " << poid << dendl;
610 } else if (r == -ENOENT) {
611 dout(25) << __func__ << " " << poid << " got " << r
612 << ", skipping" << dendl;
613 } else if (r == -EIO) {
614 dout(25) << __func__ << " " << poid << " got " << r
615 << ", stat_error" << dendl;
616 ScrubMap::object &o = map.objects[poid];
617 o.stat_error = true;
618 } else {
619 derr << __func__ << " got: " << cpp_strerror(r) << dendl;
620 ceph_abort();
621 }
622 }
623 }
624
625 bool PGBackend::be_compare_scrub_objects(
626 pg_shard_t auth_shard,
627 const ScrubMap::object &auth,
628 const object_info_t& auth_oi,
629 const ScrubMap::object &candidate,
630 shard_info_wrapper &shard_result,
631 inconsistent_obj_wrapper &obj_result,
632 ostream &errorstream)
633 {
634 enum { CLEAN, FOUND_ERROR } error = CLEAN;
635 if (candidate.stat_error) {
636 assert(shard_result.has_stat_error());
637 error = FOUND_ERROR;
638 errorstream << "candidate had a stat error";
639 }
640 if (candidate.read_error || candidate.ec_hash_mismatch || candidate.ec_size_mismatch) {
641 error = FOUND_ERROR;
642 errorstream << "candidate had a read error";
643 }
644 if (auth.digest_present && candidate.digest_present) {
645 if (auth.digest != candidate.digest) {
646 if (error != CLEAN)
647 errorstream << ", ";
648 error = FOUND_ERROR;
649 errorstream << "data_digest 0x" << std::hex << candidate.digest
650 << " != data_digest 0x" << auth.digest << std::dec
651 << " from shard " << auth_shard;
652 obj_result.set_data_digest_mismatch();
653 }
654 }
655 if (auth.omap_digest_present && candidate.omap_digest_present) {
656 if (auth.omap_digest != candidate.omap_digest) {
657 if (error != CLEAN)
658 errorstream << ", ";
659 error = FOUND_ERROR;
660 errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
661 << " != omap_digest 0x" << auth.omap_digest << std::dec
662 << " from shard " << auth_shard;
663 obj_result.set_omap_digest_mismatch();
664 }
665 }
666 if (parent->get_pool().is_replicated()) {
667 if (auth_oi.is_data_digest() && candidate.digest_present) {
668 if (auth_oi.data_digest != candidate.digest) {
669 if (error != CLEAN)
670 errorstream << ", ";
671 error = FOUND_ERROR;
672 errorstream << "data_digest 0x" << std::hex << candidate.digest
673 << " != data_digest 0x" << auth_oi.data_digest << std::dec
674 << " from auth oi " << auth_oi;
675 shard_result.set_data_digest_mismatch_oi();
676 }
677 }
678 if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
679 if (auth_oi.omap_digest != candidate.omap_digest) {
680 if (error != CLEAN)
681 errorstream << ", ";
682 error = FOUND_ERROR;
683 errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
684 << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
685 << " from auth oi " << auth_oi;
686 shard_result.set_omap_digest_mismatch_oi();
687 }
688 }
689 }
690 if (candidate.stat_error)
691 return error == FOUND_ERROR;
692 uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
693 if (oi_size != candidate.size) {
694 if (error != CLEAN)
695 errorstream << ", ";
696 error = FOUND_ERROR;
697 errorstream << "size " << candidate.size
698 << " != size " << oi_size
699 << " from auth oi " << auth_oi;
700 shard_result.set_size_mismatch_oi();
701 }
702 if (auth.size != candidate.size) {
703 if (error != CLEAN)
704 errorstream << ", ";
705 error = FOUND_ERROR;
706 errorstream << "size " << candidate.size
707 << " != size " << auth.size
708 << " from shard " << auth_shard;
709 obj_result.set_size_mismatch();
710 }
711 for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
712 i != auth.attrs.end();
713 ++i) {
714 if (!candidate.attrs.count(i->first)) {
715 if (error != CLEAN)
716 errorstream << ", ";
717 error = FOUND_ERROR;
718 errorstream << "attr name mismatch '" << i->first << "'";
719 obj_result.set_attr_name_mismatch();
720 } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
721 if (error != CLEAN)
722 errorstream << ", ";
723 error = FOUND_ERROR;
724 errorstream << "attr value mismatch '" << i->first << "'";
725 obj_result.set_attr_value_mismatch();
726 }
727 }
728 for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
729 i != candidate.attrs.end();
730 ++i) {
731 if (!auth.attrs.count(i->first)) {
732 if (error != CLEAN)
733 errorstream << ", ";
734 error = FOUND_ERROR;
735 errorstream << "attr name mismatch '" << i->first << "'";
736 obj_result.set_attr_name_mismatch();
737 }
738 }
739 return error == FOUND_ERROR;
740 }
741
742 static int dcount(const object_info_t &oi)
743 {
744 int count = 0;
745 if (oi.is_data_digest())
746 count++;
747 if (oi.is_omap_digest())
748 count++;
749 return count;
750 }
751
752 map<pg_shard_t, ScrubMap *>::const_iterator
753 PGBackend::be_select_auth_object(
754 const hobject_t &obj,
755 const map<pg_shard_t,ScrubMap*> &maps,
756 object_info_t *auth_oi,
757 map<pg_shard_t, shard_info_wrapper> &shard_map,
758 inconsistent_obj_wrapper &object_error)
759 {
760 eversion_t auth_version;
761 bufferlist auth_bl;
762
763 // Create list of shards with primary last so it will be auth copy all
764 // other things being equal.
765 list<pg_shard_t> shards;
766 for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
767 j != maps.end();
768 ++j) {
769 if (j->first == get_parent()->whoami_shard())
770 continue;
771 shards.push_back(j->first);
772 }
773 shards.push_back(get_parent()->whoami_shard());
774
775 map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
776 for (auto &l : shards) {
777 map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
778 map<hobject_t, ScrubMap::object>::iterator i =
779 j->second->objects.find(obj);
780 if (i == j->second->objects.end()) {
781 continue;
782 }
783 string error_string;
784 auto& shard_info = shard_map[j->first];
785 if (i->second.read_error) {
786 shard_info.set_read_error();
787 error_string += " read_error";
788 }
789 if (i->second.ec_hash_mismatch) {
790 shard_info.set_ec_hash_mismatch();
791 error_string += " ec_hash_mismatch";
792 }
793 if (i->second.ec_size_mismatch) {
794 shard_info.set_ec_size_mismatch();
795 error_string += " ec_size_mismatch";
796 }
797
798 object_info_t oi;
799 bufferlist bl;
800 map<string, bufferptr>::iterator k;
801 SnapSet ss;
802 bufferlist ss_bl;
803
804 if (i->second.stat_error) {
805 shard_info.set_stat_error();
806 error_string += " stat_error";
807 // With stat_error no further checking
808 // We don't need to also see a missing_object_info_attr
809 goto out;
810 }
811
812 k = i->second.attrs.find(OI_ATTR);
813 if (k == i->second.attrs.end()) {
814 // no object info on object, probably corrupt
815 shard_info.set_oi_attr_missing();
816 error_string += " oi_attr_missing";
817 goto out;
818 }
819 bl.push_back(k->second);
820 try {
821 bufferlist::iterator bliter = bl.begin();
822 ::decode(oi, bliter);
823 } catch (...) {
824 // invalid object info, probably corrupt
825 shard_info.set_oi_attr_corrupted();
826 error_string += " oi_attr_corrupted";
827 goto out;
828 }
829
830 if (oi.soid != obj) {
831 shard_info.set_oi_attr_corrupted();
832 error_string += " oi_attr_corrupted";
833 goto out;
834 }
835
836 if (auth_version != eversion_t()) {
837 if (!object_error.has_object_info_inconsistency() && !(bl == auth_bl)) {
838 object_error.set_object_info_inconsistency();
839 error_string += " object_info_inconsistency";
840 }
841 }
842
843 // Don't use this particular shard because it won't be able to repair data
844 // XXX: For now we can't pick one shard for repair and another's object info
845 if (i->second.read_error || i->second.ec_hash_mismatch || i->second.ec_size_mismatch)
846 goto out;
847
848 // We don't set errors here for snapset, but we won't pick an auth copy if the
849 // snapset is missing or won't decode.
850 if (obj.is_head() || obj.is_snapdir()) {
851 k = i->second.attrs.find(SS_ATTR);
852 if (k == i->second.attrs.end()) {
853 goto out;
854 }
855 ss_bl.push_back(k->second);
856 try {
857 bufferlist::iterator bliter = ss_bl.begin();
858 ::decode(ss, bliter);
859 } catch (...) {
860 // invalid snapset, probably corrupt
861 goto out;
862 }
863 }
864
865 if (auth_version == eversion_t() || oi.version > auth_version ||
866 (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
867 auth = j;
868 *auth_oi = oi;
869 auth_version = oi.version;
870 auth_bl.clear();
871 auth_bl.append(bl);
872 }
873
874 out:
875 // Check error_string because some errors already generated messages
876 if (error_string != "") {
877 dout(10) << __func__ << ": error(s) osd " << j->first
878 << " for obj " << obj
879 << "," << error_string
880 << dendl;
881 }
882 // Keep scanning other shards
883 }
884 dout(10) << __func__ << ": selecting osd " << auth->first
885 << " for obj " << obj
886 << " with oi " << *auth_oi
887 << dendl;
888 return auth;
889 }
890
891 void PGBackend::be_compare_scrubmaps(
892 const map<pg_shard_t,ScrubMap*> &maps,
893 bool repair,
894 map<hobject_t, set<pg_shard_t>> &missing,
895 map<hobject_t, set<pg_shard_t>> &inconsistent,
896 map<hobject_t, list<pg_shard_t>> &authoritative,
897 map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
898 int &shallow_errors, int &deep_errors,
899 Scrub::Store *store,
900 const spg_t& pgid,
901 const vector<int> &acting,
902 ostream &errorstream)
903 {
904 map<hobject_t,ScrubMap::object>::const_iterator i;
905 map<pg_shard_t, ScrubMap *>::const_iterator j;
906 set<hobject_t> master_set;
907 utime_t now = ceph_clock_now();
908
909 // Construct master set
910 for (j = maps.begin(); j != maps.end(); ++j) {
911 for (i = j->second->objects.begin(); i != j->second->objects.end(); ++i) {
912 master_set.insert(i->first);
913 }
914 }
915
916 // Check maps against master set and each other
917 for (set<hobject_t>::const_iterator k = master_set.begin();
918 k != master_set.end();
919 ++k) {
920 object_info_t auth_oi;
921 map<pg_shard_t, shard_info_wrapper> shard_map;
922
923 inconsistent_obj_wrapper object_error{*k};
924
925 map<pg_shard_t, ScrubMap *>::const_iterator auth =
926 be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error);
927
928 list<pg_shard_t> auth_list;
929 set<pg_shard_t> object_errors;
930 if (auth == maps.end()) {
931 object_error.set_version(0);
932 object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, deep_errors);
933 if (object_error.has_deep_errors())
934 ++deep_errors;
935 else if (object_error.has_shallow_errors())
936 ++shallow_errors;
937 store->add_object_error(k->pool, object_error);
938 errorstream << pgid.pgid << " soid " << *k
939 << ": failed to pick suitable object info\n";
940 continue;
941 }
942 object_error.set_version(auth_oi.user_version);
943 ScrubMap::object& auth_object = auth->second->objects[*k];
944 set<pg_shard_t> cur_missing;
945 set<pg_shard_t> cur_inconsistent;
946
947 for (j = maps.begin(); j != maps.end(); ++j) {
948 if (j == auth)
949 shard_map[auth->first].selected_oi = true;
950 if (j->second->objects.count(*k)) {
951 shard_map[j->first].set_object(j->second->objects[*k]);
952 // Compare
953 stringstream ss;
954 bool found = be_compare_scrub_objects(auth->first,
955 auth_object,
956 auth_oi,
957 j->second->objects[*k],
958 shard_map[j->first],
959 object_error,
960 ss);
961 // Some errors might have already been set in be_select_auth_object()
962 if (shard_map[j->first].errors != 0) {
963 cur_inconsistent.insert(j->first);
964 if (shard_map[j->first].has_deep_errors())
965 ++deep_errors;
966 else
967 ++shallow_errors;
968 // Only true if be_compare_scrub_objects() found errors and put something
969 // in ss.
970 if (found)
971 errorstream << pgid << " shard " << j->first << ": soid " << *k
972 << " " << ss.str() << "\n";
973 } else if (found) {
974 // Track possible shard to use as authoritative, if needed
975 // There are errors, without identifying the shard
976 object_errors.insert(j->first);
977 } else {
978 // XXX: The auth shard might get here that we don't know
979 // that it has the "correct" data.
980 auth_list.push_back(j->first);
981 }
982 } else {
983 cur_missing.insert(j->first);
984 shard_map[j->first].set_missing();
985 // Can't have any other errors if there is no information available
986 ++shallow_errors;
987 errorstream << pgid << " shard " << j->first << " missing " << *k
988 << "\n";
989 }
990 object_error.add_shard(j->first, shard_map[j->first]);
991 }
992
993 if (auth_list.empty()) {
994 if (object_errors.empty()) {
995 errorstream << pgid.pgid << " soid " << *k
996 << ": failed to pick suitable auth object\n";
997 goto out;
998 }
999 // Object errors exist and nothing in auth_list
1000 // Prefer the auth shard otherwise take first from list.
1001 pg_shard_t shard;
1002 if (object_errors.count(auth->first)) {
1003 shard = auth->first;
1004 } else {
1005 shard = *(object_errors.begin());
1006 }
1007 auth_list.push_back(shard);
1008 object_errors.erase(shard);
1009 }
1010 // At this point auth_list is populated, so we add the object errors shards
1011 // as inconsistent.
1012 cur_inconsistent.insert(object_errors.begin(), object_errors.end());
1013 if (!cur_missing.empty()) {
1014 missing[*k] = cur_missing;
1015 }
1016 if (!cur_inconsistent.empty()) {
1017 inconsistent[*k] = cur_inconsistent;
1018 }
1019 if (!cur_inconsistent.empty() || !cur_missing.empty()) {
1020 authoritative[*k] = auth_list;
1021 } else if (parent->get_pool().is_replicated()) {
1022 enum {
1023 NO = 0,
1024 MAYBE = 1,
1025 FORCE = 2,
1026 } update = NO;
1027
1028 if (auth_object.digest_present && auth_object.omap_digest_present &&
1029 (!auth_oi.is_data_digest() || !auth_oi.is_omap_digest())) {
1030 dout(20) << __func__ << " missing digest on " << *k << dendl;
1031 update = MAYBE;
1032 }
1033 if (auth_object.digest_present && auth_object.omap_digest_present &&
1034 cct->_conf->osd_debug_scrub_chance_rewrite_digest &&
1035 (((unsigned)rand() % 100) >
1036 cct->_conf->osd_debug_scrub_chance_rewrite_digest)) {
1037 dout(20) << __func__ << " randomly updating digest on " << *k << dendl;
1038 update = MAYBE;
1039 }
1040
1041 // recorded digest != actual digest?
1042 if (auth_oi.is_data_digest() && auth_object.digest_present &&
1043 auth_oi.data_digest != auth_object.digest) {
1044 assert(shard_map[auth->first].has_data_digest_mismatch_oi());
1045 errorstream << pgid << " recorded data digest 0x"
1046 << std::hex << auth_oi.data_digest << " != on disk 0x"
1047 << auth_object.digest << std::dec << " on " << auth_oi.soid
1048 << "\n";
1049 if (repair)
1050 update = FORCE;
1051 }
1052 if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
1053 auth_oi.omap_digest != auth_object.omap_digest) {
1054 assert(shard_map[auth->first].has_omap_digest_mismatch_oi());
1055 errorstream << pgid << " recorded omap digest 0x"
1056 << std::hex << auth_oi.omap_digest << " != on disk 0x"
1057 << auth_object.omap_digest << std::dec
1058 << " on " << auth_oi.soid << "\n";
1059 if (repair)
1060 update = FORCE;
1061 }
1062
1063 if (update != NO) {
1064 utime_t age = now - auth_oi.local_mtime;
1065 if (update == FORCE ||
1066 age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
1067 dout(20) << __func__ << " will update digest on " << *k << dendl;
1068 missing_digest[*k] = make_pair(auth_object.digest,
1069 auth_object.omap_digest);
1070 } else {
1071 dout(20) << __func__ << " missing digest but age " << age
1072 << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
1073 << " on " << *k << dendl;
1074 }
1075 }
1076 }
1077 out:
1078 if (object_error.has_deep_errors())
1079 ++deep_errors;
1080 else if (object_error.has_shallow_errors())
1081 ++shallow_errors;
1082 if (object_error.errors || object_error.union_shards.errors) {
1083 store->add_object_error(k->pool, object_error);
1084 }
1085 }
1086 }