// ceph/src/osd/PGBackend.cc  (ceph.git, sources updated to 12.2.8)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013,2014 Inktank Storage, Inc.
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#include "common/errno.h"
#include "common/scrub_types.h"
#include "ReplicatedBackend.h"
#include "ScrubStore.h"
#include "ECBackend.h"
#include "PGBackend.h"
#include "OSD.h"
#include "erasure-code/ErasureCodePlugin.h"
#include "OSDMap.h"
#include "PGLog.h"
#include "common/LogClient.h"
#include "messages/MOSDPGRecoveryDelete.h"
#include "messages/MOSDPGRecoveryDeleteReply.h"

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
  return *_dout << pgb->get_parent()->gen_dbg_prefix();
}

void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
                                      RecoveryHandle *h)
{
  assert(get_parent()->get_actingbackfill_shards().size() > 0);
  for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
    if (shard == get_parent()->whoami_shard())
      continue;
    if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
      dout(20) << __func__ << " will remove " << oid << " " << v << " from "
               << shard << dendl;
      h->deletes[shard].push_back(make_pair(oid, v));
      get_parent()->begin_peer_recover(shard, oid);
    }
  }
}

void PGBackend::send_recovery_deletes(int prio,
                                      const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
{
  epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
  for (const auto& p : deletes) {
    const auto& shard = p.first;
    const auto& objects = p.second;
    ConnectionRef con = get_parent()->get_con_osd_cluster(
      shard.osd,
      get_osdmap()->get_epoch());
    if (!con)
      continue;
    auto it = objects.begin();
    while (it != objects.end()) {
      uint64_t cost = 0;
      uint64_t deletes = 0;
      spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
      MOSDPGRecoveryDelete *msg =
        new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
                                 target_pg,
                                 get_osdmap()->get_epoch(),
                                 min_epoch);
      msg->set_priority(prio);

      while (it != objects.end() &&
             cost < cct->_conf->osd_max_push_cost &&
             deletes < cct->_conf->osd_max_push_objects) {
        dout(20) << __func__ << ": sending recovery delete " << it->first
                 << " " << it->second << " to osd." << shard << dendl;
        msg->objects.push_back(*it);
        cost += cct->_conf->osd_push_per_object_cost;
        ++deletes;
        ++it;
      }

      msg->set_cost(cost);
      get_parent()->send_message_osd_cluster(msg, con);
    }
  }
}
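
// Editorial sketch (not part of the build): how the loop above batches
// deletes per target shard. Objects are admitted into a single
// MOSDPGRecoveryDelete until either the cost cap (osd_max_push_cost) or the
// object-count cap (osd_max_push_objects) trips, then a new message is
// started. For example, with an object cap of 64 and a cost cap that does
// not bind, 1000 queued deletes would go out in ceil(1000/64) = 16 messages.
// The function and numbers below are illustrative assumptions, not the
// shipped defaults.
#if 0
static size_t estimate_delete_batches(size_t num_objects,
                                      uint64_t per_object_cost,   // e.g. osd_push_per_object_cost
                                      uint64_t max_push_cost,     // e.g. osd_max_push_cost
                                      uint64_t max_push_objects)  // e.g. osd_max_push_objects
{
  // Per message, objects are admitted until one of the two caps is reached.
  uint64_t by_cost = (max_push_cost + per_object_cost - 1) / per_object_cost;
  uint64_t per_msg = by_cost < max_push_objects ? by_cost : max_push_objects;
  if (per_msg == 0)
    per_msg = 1;
  return (num_objects + per_msg - 1) / per_msg;
}
#endif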

bool PGBackend::handle_message(OpRequestRef op)
{
  switch (op->get_req()->get_type()) {
  case MSG_OSD_PG_RECOVERY_DELETE:
    handle_recovery_delete(op);
    return true;

  case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
    handle_recovery_delete_reply(op);
    return true;

  default:
    break;
  }

  return _handle_message(op);
}

void PGBackend::handle_recovery_delete(OpRequestRef op)
{
  const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
  dout(20) << __func__ << " " << op << dendl;

  op->mark_started();

  C_GatherBuilder gather(cct);
  for (const auto &p : m->objects) {
    get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
  }

  MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
  reply->from = get_parent()->whoami_shard();
  reply->set_priority(m->get_priority());
  reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
  reply->map_epoch = m->map_epoch;
  reply->min_epoch = m->min_epoch;
  reply->objects = m->objects;
  ConnectionRef conn = m->get_connection();

  gather.set_finisher(new FunctionContext(
    [=](int r) {
      if (r != -EAGAIN) {
        get_parent()->send_message_osd_cluster(reply, conn.get());
      } else {
        reply->put();
      }
    }));
  gather.activate();
}

void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
{
  const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req());
  assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
  dout(20) << __func__ << " " << op << dendl;

  for (const auto &p : m->objects) {
    ObjectRecoveryInfo recovery_info;
    hobject_t oid = p.first;
    recovery_info.version = p.second;
    get_parent()->on_peer_recover(m->from, oid, recovery_info);
    bool peers_recovered = true;
    for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
      if (shard == get_parent()->whoami_shard())
        continue;
      if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
        dout(20) << __func__ << " " << oid << " still missing on at least "
                 << shard << dendl;
        peers_recovered = false;
        break;
      }
    }
    if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
      dout(20) << __func__ << " completed recovery, local_missing = "
               << get_parent()->get_local_missing() << dendl;
      object_stat_sum_t stat_diff;
      stat_diff.num_objects_recovered = 1;
      get_parent()->on_global_recover(p.first, stat_diff, true);
    }
  }
}

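// Editorial note on the recovery-delete round trip implemented above: the
// primary queues per-shard (oid, version) pairs on the RecoveryHandle in
// recover_delete_object(), send_recovery_deletes() flushes them as
// MOSDPGRecoveryDelete messages, each replica removes the objects in
// handle_recovery_delete() and answers with MOSDPGRecoveryDeleteReply, and
// handle_recovery_delete_reply() declares the object globally recovered once
// neither the remaining acting/backfill shards nor the local missing set
// still list it.
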
void PGBackend::rollback(
  const pg_log_entry_t &entry,
  ObjectStore::Transaction *t)
{

  struct RollbackVisitor : public ObjectModDesc::Visitor {
    const hobject_t &hoid;
    PGBackend *pg;
    ObjectStore::Transaction t;
    RollbackVisitor(
      const hobject_t &hoid,
      PGBackend *pg) : hoid(hoid), pg(pg) {}
    void append(uint64_t old_size) override {
      ObjectStore::Transaction temp;
      pg->rollback_append(hoid, old_size, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
      ObjectStore::Transaction temp;
      pg->rollback_setattrs(hoid, attrs, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void try_rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_try_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void create() override {
      ObjectStore::Transaction temp;
      pg->rollback_create(hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void update_snaps(const set<snapid_t> &snaps) override {
      ObjectStore::Transaction temp;
      pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) override {
      ObjectStore::Transaction temp;
      pg->rollback_extents(gen, extents, hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
  };

  assert(entry.mod_desc.can_rollback());
  RollbackVisitor vis(entry.soid, this);
  entry.mod_desc.visit(&vis);
  t->append(vis.t);
}
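
// Editorial note on the append/swap idiom used by every visitor callback
// above: each step builds its ops in a fresh `temp` transaction, appends the
// ops accumulated so far, then swaps, which effectively *prepends* the new
// ops. Visiting the ObjectModDesc in recorded order therefore produces a
// transaction that undoes the mods in reverse order of how they were applied.
// A minimal illustration of the idiom (sketch only, not part of the build):
#if 0
void prepend_step_example(ObjectStore::Transaction &accum) {
  ObjectStore::Transaction temp;
  // ... encode the newest rollback step into temp ...
  temp.append(accum);  // temp = [new step][everything accumulated so far]
  temp.swap(accum);    // accum now starts with the new step
}
#endif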

struct Trimmer : public ObjectModDesc::Visitor {
  const hobject_t &soid;
  PGBackend *pg;
  ObjectStore::Transaction *t;
  Trimmer(
    const hobject_t &soid,
    PGBackend *pg,
    ObjectStore::Transaction *t)
    : soid(soid), pg(pg), t(t) {}
  void rmobject(version_t old_version) override {
    pg->trim_rollback_object(
      soid,
      old_version,
      t);
  }
  // try_rmobject defaults to rmobject
  void rollback_extents(
    version_t gen,
    const vector<pair<uint64_t, uint64_t> > &extents) override {
    pg->trim_rollback_object(
      soid,
      gen,
      t);
  }
};

void PGBackend::rollforward(
  const pg_log_entry_t &entry,
  ObjectStore::Transaction *t)
{
  auto dpp = get_parent()->get_dpp();
  ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
  if (!entry.can_rollback())
    return;
  Trimmer trimmer(entry.soid, this, t);
  entry.mod_desc.visit(&trimmer);
}

void PGBackend::trim(
  const pg_log_entry_t &entry,
  ObjectStore::Transaction *t)
{
  if (!entry.can_rollback())
    return;
  Trimmer trimmer(entry.soid, this, t);
  entry.mod_desc.visit(&trimmer);
}

void PGBackend::try_stash(
  const hobject_t &hoid,
  version_t v,
  ObjectStore::Transaction *t)
{
  t->try_rename(
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
}

void PGBackend::remove(
  const hobject_t &hoid,
  ObjectStore::Transaction *t) {
  assert(!hoid.is_temp());
  t->remove(
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
  get_parent()->pgb_clear_object_snap_mapping(hoid, t);
}

void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
{
  dout(10) << __func__ << dendl;
  // clear temp
  for (set<hobject_t>::iterator i = temp_contents.begin();
       i != temp_contents.end();
       ++i) {
    dout(10) << __func__ << ": Removing oid "
             << *i << " from the temp collection" << dendl;
    t->remove(
      coll,
      ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
  }
  temp_contents.clear();
}

int PGBackend::objects_list_partial(
  const hobject_t &begin,
  int min,
  int max,
  vector<hobject_t> *ls,
  hobject_t *next)
{
  assert(ls);
  // Starts with the smallest generation to make sure the result list
  // has the marker object (it might have multiple generations
  // though, which would be filtered).
  ghobject_t _next;
  if (!begin.is_min())
    _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
  ls->reserve(max);
  int r = 0;

  if (min > max)
    min = max;

  while (!_next.is_max() && ls->size() < (unsigned)min) {
    vector<ghobject_t> objects;
    r = store->collection_list(
      ch,
      _next,
      ghobject_t::get_max(),
      max - ls->size(),
      &objects,
      &_next);
    if (r != 0) {
      derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
      break;
    }
    for (vector<ghobject_t>::iterator i = objects.begin();
         i != objects.end();
         ++i) {
      if (i->is_pgmeta() || i->hobj.is_temp()) {
        continue;
      }
      if (i->is_no_gen()) {
        ls->push_back(i->hobj);
      }
    }
  }
  if (r == 0)
    *next = _next.hobj;
  return r;
}
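
// Illustrative usage sketch (not part of the build): paging through a PG's
// objects with objects_list_partial(). `min` is the low-water mark the loop
// above tries to satisfy per call, `max` bounds each underlying
// collection_list() batch, and `next` is the resume cursor, which comes back
// as hobject_t::get_max() once the PG is exhausted. The function and variable
// names below are hypothetical.
#if 0
void list_all_objects_example(PGBackend *backend) {
  hobject_t cursor;  // default-constructed: start from the beginning
  while (!cursor.is_max()) {
    vector<hobject_t> batch;
    hobject_t next;
    int r = backend->objects_list_partial(cursor, 64, 128, &batch, &next);
    if (r != 0)
      break;           // listing error, already logged by the callee
    // ... consume batch ...
    cursor = next;
  }
}
#endif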

int PGBackend::objects_list_range(
  const hobject_t &start,
  const hobject_t &end,
  snapid_t seq,
  vector<hobject_t> *ls,
  vector<ghobject_t> *gen_obs)
{
  assert(ls);
  vector<ghobject_t> objects;
  int r = store->collection_list(
    ch,
    ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    INT_MAX,
    &objects,
    NULL);
  ls->reserve(objects.size());
  for (vector<ghobject_t>::iterator i = objects.begin();
       i != objects.end();
       ++i) {
    if (i->is_pgmeta() || i->hobj.is_temp()) {
      continue;
    }
    if (i->is_no_gen()) {
      ls->push_back(i->hobj);
    } else if (gen_obs) {
      gen_obs->push_back(*i);
    }
  }
  return r;
}

int PGBackend::objects_get_attr(
  const hobject_t &hoid,
  const string &attr,
  bufferlist *out)
{
  bufferptr bp;
  int r = store->getattr(
    ch,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    attr.c_str(),
    bp);
  if (r >= 0 && out) {
    out->clear();
    out->push_back(std::move(bp));
  }
  return r;
}

int PGBackend::objects_get_attrs(
  const hobject_t &hoid,
  map<string, bufferlist> *out)
{
  return store->getattrs(
    ch,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    *out);
}

void PGBackend::rollback_setattrs(
  const hobject_t &hoid,
  map<string, boost::optional<bufferlist> > &old_attrs,
  ObjectStore::Transaction *t) {
  map<string, bufferlist> to_set;
  assert(!hoid.is_temp());
  for (map<string, boost::optional<bufferlist> >::iterator i = old_attrs.begin();
       i != old_attrs.end();
       ++i) {
    if (i->second) {
      to_set[i->first] = i->second.get();
    } else {
      t->rmattr(
        coll,
        ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
        i->first);
    }
  }
  t->setattrs(
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    to_set);
}
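
// Editorial note: each old_attrs entry above is a boost::optional<bufferlist>.
// An engaged optional means the attr existed before the logged op and is
// restored to its old value via setattrs(); an empty optional means the attr
// was created by the op being rolled back, so it is removed with rmattr().
// Minimal illustration of building such a map (sketch only, values invented):
#if 0
map<string, boost::optional<bufferlist> > old_attrs;
bufferlist old_val;
old_val.append("previous value");
old_attrs["_existing_key"] = old_val;         // restore to the old value
old_attrs["_key_added_by_op"] = boost::none;  // did not exist before: remove it
#endif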

void PGBackend::rollback_append(
  const hobject_t &hoid,
  uint64_t old_size,
  ObjectStore::Transaction *t) {
  assert(!hoid.is_temp());
  t->truncate(
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    old_size);
}

void PGBackend::rollback_stash(
  const hobject_t &hoid,
  version_t old_version,
  ObjectStore::Transaction *t) {
  assert(!hoid.is_temp());
  t->remove(
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
  t->collection_move_rename(
    coll,
    ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
}

void PGBackend::rollback_try_stash(
  const hobject_t &hoid,
  version_t old_version,
  ObjectStore::Transaction *t) {
  assert(!hoid.is_temp());
  t->remove(
    coll,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
  t->try_rename(
    coll,
    ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
}

void PGBackend::rollback_extents(
  version_t gen,
  const vector<pair<uint64_t, uint64_t> > &extents,
  const hobject_t &hoid,
  ObjectStore::Transaction *t) {
  auto shard = get_parent()->whoami_shard().shard;
  for (auto &&extent: extents) {
    t->clone_range(
      coll,
      ghobject_t(hoid, gen, shard),
      ghobject_t(hoid, ghobject_t::NO_GEN, shard),
      extent.first,
      extent.second,
      extent.first);
  }
  t->remove(
    coll,
    ghobject_t(hoid, gen, shard));
}
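
// Editorial note: rollback_extents() assumes the overwritten ranges were
// preserved in a generation clone of the object (a ghobject_t with `gen`
// set). Each recorded (offset, length) extent is copied back over the live
// object with clone_range(), and the generation clone is removed once all
// extents are restored. trim_rollback_object() below is the counterpart that
// simply drops such a clone once the log entry can no longer be rolled back.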

void PGBackend::trim_rollback_object(
  const hobject_t &hoid,
  version_t old_version,
  ObjectStore::Transaction *t) {
  assert(!hoid.is_temp());
  t->remove(
    coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
}

PGBackend *PGBackend::build_pg_backend(
  const pg_pool_t &pool,
  const OSDMapRef curmap,
  Listener *l,
  coll_t coll,
  ObjectStore::CollectionHandle &ch,
  ObjectStore *store,
  CephContext *cct)
{
  switch (pool.type) {
  case pg_pool_t::TYPE_REPLICATED: {
    return new ReplicatedBackend(l, coll, ch, store, cct);
  }
  case pg_pool_t::TYPE_ERASURE: {
    ErasureCodeInterfaceRef ec_impl;
    ErasureCodeProfile profile = curmap->get_erasure_code_profile(pool.erasure_code_profile);
    assert(profile.count("plugin"));
    stringstream ss;
    ceph::ErasureCodePluginRegistry::instance().factory(
      profile.find("plugin")->second,
      cct->_conf->get_val<std::string>("erasure_code_dir"),
      profile,
      &ec_impl,
      &ss);
    assert(ec_impl);
    return new ECBackend(
      l,
      coll,
      ch,
      store,
      cct,
      ec_impl,
      pool.stripe_width);
  }
  default:
    ceph_abort();
    return NULL;
  }
}
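
// Illustrative sketch (not part of the build): what the erasure-code profile
// consulted by build_pg_backend() might contain. The factory only requires
// the "plugin" key; the remaining keys shown are typical jerasure parameters
// given purely as an example, not the values of any particular cluster.
#if 0
ErasureCodeProfile example_profile = {
  {"plugin", "jerasure"},
  {"technique", "reed_sol_van"},
  {"k", "2"},
  {"m", "1"},
};
#endif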

int PGBackend::be_scan_list(
  ScrubMap &map,
  ScrubMapBuilder &pos)
{
  dout(10) << __func__ << " " << pos << dendl;
  assert(!pos.done());
  assert(pos.pos < pos.ls.size());
  hobject_t& poid = pos.ls[pos.pos];

  struct stat st;
  int r = store->stat(
    ch,
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    &st,
    true);
  if (r == 0) {
    ScrubMap::object &o = map.objects[poid];
    o.size = st.st_size;
    assert(!o.negative);
    store->getattrs(
      ch,
      ghobject_t(
        poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
      o.attrs);

    if (pos.deep) {
      r = be_deep_scrub(poid, map, pos, o);
    }
    dout(25) << __func__ << " " << poid << dendl;
  } else if (r == -ENOENT) {
    dout(25) << __func__ << " " << poid << " got " << r
             << ", skipping" << dendl;
  } else if (r == -EIO) {
    dout(25) << __func__ << " " << poid << " got " << r
             << ", stat_error" << dendl;
    ScrubMap::object &o = map.objects[poid];
    o.stat_error = true;
  } else {
    derr << __func__ << " got: " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  if (r == -EINPROGRESS) {
    return -EINPROGRESS;
  }
  pos.next_object();
  return 0;
}

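// Editorial note: be_scan_list() handles one object per call, driven by the
// ScrubMapBuilder cursor. A return of 0 means the object was staged (or
// skipped on ENOENT, or marked stat_error on EIO) and the position advanced;
// a return of -EINPROGRESS propagates from be_deep_scrub() and tells the
// caller to invoke this function again without advancing, so a large object
// can be deep-scrubbed across several passes.
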
bool PGBackend::be_compare_scrub_objects(
  pg_shard_t auth_shard,
  const ScrubMap::object &auth,
  const object_info_t& auth_oi,
  const ScrubMap::object &candidate,
  shard_info_wrapper &shard_result,
  inconsistent_obj_wrapper &obj_result,
  ostream &errorstream)
{
  enum { CLEAN, FOUND_ERROR } error = CLEAN;
  if (candidate.stat_error) {
    assert(shard_result.has_stat_error());
    error = FOUND_ERROR;
    errorstream << "candidate had a stat error";
  }
  if (candidate.read_error || candidate.ec_hash_mismatch || candidate.ec_size_mismatch) {
    error = FOUND_ERROR;
    errorstream << "candidate had a read error";
  }
  if (auth.digest_present && candidate.digest_present) {
    if (auth.digest != candidate.digest) {
      if (error != CLEAN)
        errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "data_digest 0x" << std::hex << candidate.digest
                  << " != data_digest 0x" << auth.digest << std::dec
                  << " from shard " << auth_shard;
      obj_result.set_data_digest_mismatch();
    }
  }
  if (auth.omap_digest_present && candidate.omap_digest_present) {
    if (auth.omap_digest != candidate.omap_digest) {
      if (error != CLEAN)
        errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
                  << " != omap_digest 0x" << auth.omap_digest << std::dec
                  << " from shard " << auth_shard;
      obj_result.set_omap_digest_mismatch();
    }
  }
  if (parent->get_pool().is_replicated()) {
    if (auth_oi.is_data_digest() && candidate.digest_present) {
      if (auth_oi.data_digest != candidate.digest) {
        if (error != CLEAN)
          errorstream << ", ";
        error = FOUND_ERROR;
        errorstream << "data_digest 0x" << std::hex << candidate.digest
                    << " != data_digest 0x" << auth_oi.data_digest << std::dec
                    << " from auth oi " << auth_oi;
        shard_result.set_data_digest_mismatch_info();
      }
    }
    if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
      if (auth_oi.omap_digest != candidate.omap_digest) {
        if (error != CLEAN)
          errorstream << ", ";
        error = FOUND_ERROR;
        errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
                    << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
                    << " from auth oi " << auth_oi;
        shard_result.set_omap_digest_mismatch_info();
      }
    }
  }
  if (candidate.stat_error)
    return error == FOUND_ERROR;
  uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
  if (oi_size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
                << " != size " << oi_size
                << " from auth oi " << auth_oi;
    shard_result.set_size_mismatch_info();
  }
  if (auth.size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
                << " != size " << auth.size
                << " from shard " << auth_shard;
    obj_result.set_size_mismatch();
  }
  for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
       i != auth.attrs.end();
       ++i) {
    // We check system keys separately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!candidate.attrs.count(i->first)) {
      if (error != CLEAN)
        errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
      if (error != CLEAN)
        errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr value mismatch '" << i->first << "'";
      obj_result.set_attr_value_mismatch();
    }
  }
  for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
       i != candidate.attrs.end();
       ++i) {
    // We check system keys separately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!auth.attrs.count(i->first)) {
      if (error != CLEAN)
        errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    }
  }
  return error == FOUND_ERROR;
}

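// Editorial note: be_compare_scrub_objects() records two kinds of findings.
// Shard-local problems (stat/read errors, mismatches against the selected
// object_info_t) go to shard_result, while disagreements between the
// candidate and the authoritative shard (data/omap digest, size, user attrs)
// go to obj_result. Digests are only compared when both sides actually
// computed one, so a shallow scrub cannot raise digest errors. A hypothetical
// error string for a data digest mismatch would look like:
//   data_digest 0x2d4a11c2 != data_digest 0x7bd3f014 from shard 1(2)
// (values invented for illustration).
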
static int dcount(const object_info_t &oi, bool prioritize)
{
  int count = 0;
  // Prioritize bluestore objects when osd_distrust_data_digest is set
  if (prioritize)
    count += 1000;
  if (oi.is_data_digest())
    count++;
  if (oi.is_omap_digest())
    count++;
  return count;
}

map<pg_shard_t, ScrubMap *>::const_iterator
  PGBackend::be_select_auth_object(
  const hobject_t &obj,
  const map<pg_shard_t,ScrubMap*> &maps,
  object_info_t *auth_oi,
  map<pg_shard_t, shard_info_wrapper> &shard_map,
  inconsistent_obj_wrapper &object_error,
  bool &digest_match)
{
  eversion_t auth_version;
  bool auth_prio = false;
  bufferlist first_oi_bl, first_ss_bl, first_hk_bl;

  // Create list of shards with primary first so it will be auth copy all
  // other things being equal.
  list<pg_shard_t> shards;
  for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
       j != maps.end();
       ++j) {
    if (j->first == get_parent()->whoami_shard())
      continue;
    shards.push_back(j->first);
  }
  shards.push_front(get_parent()->whoami_shard());

  map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
  digest_match = true;
  for (auto &l : shards) {
    bool oi_prio = false;
    map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
    map<hobject_t, ScrubMap::object>::iterator i =
      j->second->objects.find(obj);
    if (i == j->second->objects.end()) {
      continue;
    }
    string error_string;
    auto& shard_info = shard_map[j->first];
    if (j->first == get_parent()->whoami_shard())
      shard_info.primary = true;
    if (i->second.read_error) {
      shard_info.set_read_error();
      error_string += " read_error";
    }
    if (i->second.ec_hash_mismatch) {
      shard_info.set_ec_hash_mismatch();
      error_string += " ec_hash_mismatch";
    }
    if (i->second.ec_size_mismatch) {
      shard_info.set_ec_size_mismatch();
      error_string += " ec_size_mismatch";
    }

    object_info_t oi;
    bufferlist bl;
    map<string, bufferptr>::iterator k;
    SnapSet ss;
    bufferlist ss_bl, hk_bl;

    if (i->second.stat_error) {
      shard_info.set_stat_error();
      error_string += " stat_error";
      // With stat_error no further checking
      // We don't need to also see a missing_object_info_attr
      goto out;
    }

    // We won't pick an auth copy if the snapset is missing or won't decode.
    if (obj.is_head() || obj.is_snapdir()) {
      k = i->second.attrs.find(SS_ATTR);
      if (k == i->second.attrs.end()) {
        shard_info.set_snapset_missing();
        error_string += " snapset_missing";
      } else {
        ss_bl.push_back(k->second);
        try {
          bufferlist::iterator bliter = ss_bl.begin();
          ::decode(ss, bliter);
          if (first_ss_bl.length() == 0) {
            first_ss_bl.append(ss_bl);
          } else if (!object_error.has_snapset_inconsistency() && !ss_bl.contents_equal(first_ss_bl)) {
            object_error.set_snapset_inconsistency();
            error_string += " snapset_inconsistency";
          }
        } catch (...) {
          // invalid snapset, probably corrupt
          shard_info.set_snapset_corrupted();
          error_string += " snapset_corrupted";
        }
      }
    }

    if (parent->get_pool().is_erasure()) {
      ECUtil::HashInfo hi;
      k = i->second.attrs.find(ECUtil::get_hinfo_key());
      if (k == i->second.attrs.end()) {
        shard_info.set_hinfo_missing();
        error_string += " hinfo_key_missing";
      } else {
        hk_bl.push_back(k->second);
        try {
          bufferlist::iterator bliter = hk_bl.begin();
          decode(hi, bliter);
          if (first_hk_bl.length() == 0) {
            first_hk_bl.append(hk_bl);
          } else if (!object_error.has_hinfo_inconsistency() && !hk_bl.contents_equal(first_hk_bl)) {
            object_error.set_hinfo_inconsistency();
            error_string += " hinfo_inconsistency";
          }
        } catch (...) {
          // invalid hinfo, probably corrupt
          shard_info.set_hinfo_corrupted();
          error_string += " hinfo_corrupted";
        }
      }
    }

    k = i->second.attrs.find(OI_ATTR);
    if (k == i->second.attrs.end()) {
      // no object info on object, probably corrupt
      shard_info.set_info_missing();
      error_string += " info_missing";
      goto out;
    }
    bl.push_back(k->second);
    try {
      bufferlist::iterator bliter = bl.begin();
      ::decode(oi, bliter);
    } catch (...) {
      // invalid object info, probably corrupt
      shard_info.set_info_corrupted();
      error_string += " info_corrupted";
      goto out;
    }

    // This is automatically corrected in PG::_repair_oinfo_oid()
    assert(oi.soid == obj);

    if (first_oi_bl.length() == 0) {
      first_oi_bl.append(bl);
    } else if (!object_error.has_object_info_inconsistency() && !bl.contents_equal(first_oi_bl)) {
      object_error.set_object_info_inconsistency();
      error_string += " object_info_inconsistency";
    }

    if (i->second.size != be_get_ondisk_size(oi.size)) {
      dout(5) << __func__ << " size " << i->second.size << " oi size " << oi.size << dendl;
      shard_info.set_obj_size_info_mismatch();
      error_string += " obj_size_info_mismatch";
    }

    // digest_match will only be true if computed digests are the same
    if (auth_version != eversion_t()
        && auth->second->objects[obj].digest_present
        && i->second.digest_present
        && auth->second->objects[obj].digest != i->second.digest) {
      digest_match = false;
      dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
               << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
               << dendl;
    }

    // Don't use this particular shard due to previous errors
    // XXX: For now we can't pick one shard for repair and another's object info or snapset
    if (shard_info.errors)
      goto out;

    // XXX: Do I want replicated only?
    if (parent->get_pool().is_replicated() && cct->_conf->osd_distrust_data_digest) {
      // This is a boost::optional<bool> so see if option set AND it has the value true
      // We give priority to a replica where the ObjectStore like BlueStore has builtin checksum
      if (j->second->has_builtin_csum && j->second->has_builtin_csum == true) {
        oi_prio = true;
      }
    }

    if (auth_version == eversion_t() || oi.version > auth_version ||
        (oi.version == auth_version && dcount(oi, oi_prio) > dcount(*auth_oi, auth_prio))) {
      auth = j;
      *auth_oi = oi;
      auth_version = oi.version;
      auth_prio = oi_prio;
    }

out:
    // Check error_string because some errors already generated messages
    if (error_string != "") {
      dout(10) << __func__ << ": error(s) osd " << j->first
               << " for obj " << obj
               << "," << error_string
               << dendl;
    }
    // Keep scanning other shards
  }
  dout(10) << __func__ << ": selecting osd " << auth->first
           << " for obj " << obj
           << " with oi " << *auth_oi
           << dendl;
  return auth;
}

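// Editorial note on the auth selection above: shards are visited primary
// first, any shard with recorded errors (read/stat errors, missing or corrupt
// object_info, snapset or hinfo problems) is skipped outright, and among the
// remainder the winner is the shard with the highest object_info version,
// with dcount() as the tie-breaker so copies carrying data/omap digests (and,
// when osd_distrust_data_digest is set, copies on stores with built-in
// checksums) are preferred at equal versions.
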
void PGBackend::be_compare_scrubmaps(
  const map<pg_shard_t,ScrubMap*> &maps,
  const set<hobject_t> &master_set,
  bool repair,
  map<hobject_t, set<pg_shard_t>> &missing,
  map<hobject_t, set<pg_shard_t>> &inconsistent,
  map<hobject_t, list<pg_shard_t>> &authoritative,
  map<hobject_t, pair<boost::optional<uint32_t>,
                      boost::optional<uint32_t>>> &missing_digest,
  int &shallow_errors, int &deep_errors,
  Scrub::Store *store,
  const spg_t& pgid,
  const vector<int> &acting,
  ostream &errorstream)
{
  utime_t now = ceph_clock_now();

  // Check maps against master set and each other
  for (set<hobject_t>::const_iterator k = master_set.begin();
       k != master_set.end();
       ++k) {
    object_info_t auth_oi;
    map<pg_shard_t, shard_info_wrapper> shard_map;

    inconsistent_obj_wrapper object_error{*k};

    bool digest_match;
    map<pg_shard_t, ScrubMap *>::const_iterator auth =
      be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error,
                            digest_match);

    list<pg_shard_t> auth_list;
    set<pg_shard_t> object_errors;
    if (auth == maps.end()) {
      object_error.set_version(0);
      object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
                                    deep_errors, get_parent()->whoami_shard());
      if (object_error.has_deep_errors())
        ++deep_errors;
      else if (object_error.has_shallow_errors())
        ++shallow_errors;
      store->add_object_error(k->pool, object_error);
      errorstream << pgid.pgid << " soid " << *k
                  << ": failed to pick suitable object info\n";
      continue;
    }
    object_error.set_version(auth_oi.user_version);
    ScrubMap::object& auth_object = auth->second->objects[*k];
    set<pg_shard_t> cur_missing;
    set<pg_shard_t> cur_inconsistent;
    bool fix_digest = false;

    for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
      if (j == auth)
        shard_map[auth->first].selected_oi = true;
      if (j->second->objects.count(*k)) {
        shard_map[j->first].set_object(j->second->objects[*k]);
        // Compare
        stringstream ss;
        bool found = be_compare_scrub_objects(auth->first,
                                              auth_object,
                                              auth_oi,
                                              j->second->objects[*k],
                                              shard_map[j->first],
                                              object_error,
                                              ss);

        dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
                 << (j == auth ? "auth " : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
                 << (shard_map[j->first].has_data_digest_mismatch_info() ? "info_mismatch " : "")
                 << (shard_map[j->first].only_data_digest_mismatch_info() ? "only" : "")
                 << dendl;

        if (cct->_conf->osd_distrust_data_digest) {
          if (digest_match && parent->get_pool().is_replicated()
              && shard_map[j->first].has_data_digest_mismatch_info()) {
            fix_digest = true;
          }
          shard_map[j->first].clear_data_digest_mismatch_info();
          // If all replicas match, but they don't match object_info we can
          // repair it by using missing_digest mechanism
        } else if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
            && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
            && auth_object.digest_present) {
          // Set in missing_digests
          fix_digest = true;
          // Clear the error
          shard_map[j->first].clear_data_digest_mismatch_info();
          errorstream << pgid << " : soid " << *k << " repairing object info data_digest" << "\n";
        }
        // Some errors might have already been set in be_select_auth_object()
        if (shard_map[j->first].errors != 0) {
          cur_inconsistent.insert(j->first);
          if (shard_map[j->first].has_deep_errors())
            ++deep_errors;
          else
            ++shallow_errors;
          // Only true if be_compare_scrub_objects() found errors and put something
          // in ss.
          if (found)
            errorstream << pgid << " shard " << j->first << ": soid " << *k
                        << " " << ss.str() << "\n";
        } else if (object_error.errors != 0) {
          // Track possible shard to use as authoritative, if needed
          // There are errors, without identifying the shard
          object_errors.insert(j->first);
          if (found)
            errorstream << pgid << " : soid " << *k << " " << ss.str() << "\n";
        } else {
          // XXX: The auth shard might get here that we don't know
          // that it has the "correct" data.
          auth_list.push_back(j->first);
        }
      } else {
        cur_missing.insert(j->first);
        shard_map[j->first].set_missing();
        shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
        // Can't have any other errors if there is no information available
        ++shallow_errors;
        errorstream << pgid << " shard " << j->first << " missing " << *k
                    << "\n";
      }
      object_error.add_shard(j->first, shard_map[j->first]);
    }

    if (auth_list.empty()) {
      if (object_errors.empty()) {
        errorstream << pgid.pgid << " soid " << *k
                    << ": failed to pick suitable auth object\n";
        goto out;
      }
      // Object errors exist and nothing in auth_list
      // Prefer the auth shard otherwise take first from list.
      pg_shard_t shard;
      if (object_errors.count(auth->first)) {
        shard = auth->first;
      } else {
        shard = *(object_errors.begin());
      }
      auth_list.push_back(shard);
      object_errors.erase(shard);
    }
    // At this point auth_list is populated, so we add the object errors shards
    // as inconsistent.
    cur_inconsistent.insert(object_errors.begin(), object_errors.end());
    if (!cur_missing.empty()) {
      missing[*k] = cur_missing;
    }
    if (!cur_inconsistent.empty()) {
      inconsistent[*k] = cur_inconsistent;
    }

    if (fix_digest) {
      boost::optional<uint32_t> data_digest, omap_digest;
      assert(auth_object.digest_present);
      data_digest = auth_object.digest;
      if (auth_object.omap_digest_present) {
        omap_digest = auth_object.omap_digest;
      }
      missing_digest[*k] = make_pair(data_digest, omap_digest);
    }
    // Special handling of this particular type of inconsistency
    // This can over-ride a data_digest or set an omap_digest
    // when all replicas match but the object info is wrong.
    if (!cur_inconsistent.empty() || !cur_missing.empty()) {
      authoritative[*k] = auth_list;
    } else if (!fix_digest && parent->get_pool().is_replicated()) {
      enum {
        NO = 0,
        MAYBE = 1,
        FORCE = 2,
      } update = NO;

      if (auth_object.digest_present && !auth_oi.is_data_digest()) {
        dout(20) << __func__ << " missing data digest on " << *k << dendl;
        update = MAYBE;
      }
      if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
        dout(20) << __func__ << " missing omap digest on " << *k << dendl;
        update = MAYBE;
      }

      // recorded digest != actual digest?
      if (auth_oi.is_data_digest() && auth_object.digest_present &&
          auth_oi.data_digest != auth_object.digest) {
        assert(cct->_conf->osd_distrust_data_digest
               || shard_map[auth->first].has_data_digest_mismatch_info());
        errorstream << pgid << " recorded data digest 0x"
                    << std::hex << auth_oi.data_digest << " != on disk 0x"
                    << auth_object.digest << std::dec << " on " << auth_oi.soid
                    << "\n";
        if (repair)
          update = FORCE;
      }
      if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
          auth_oi.omap_digest != auth_object.omap_digest) {
        assert(shard_map[auth->first].has_omap_digest_mismatch_info());
        errorstream << pgid << " recorded omap digest 0x"
                    << std::hex << auth_oi.omap_digest << " != on disk 0x"
                    << auth_object.omap_digest << std::dec
                    << " on " << auth_oi.soid << "\n";
        if (repair)
          update = FORCE;
      }

      if (update != NO) {
        utime_t age = now - auth_oi.local_mtime;
        if (update == FORCE ||
            age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
          boost::optional<uint32_t> data_digest, omap_digest;
          if (auth_object.digest_present) {
            data_digest = auth_object.digest;
            dout(20) << __func__ << " will update data digest on " << *k << dendl;
          }
          if (auth_object.omap_digest_present) {
            omap_digest = auth_object.omap_digest;
            dout(20) << __func__ << " will update omap digest on " << *k << dendl;
          }
          missing_digest[*k] = make_pair(data_digest, omap_digest);
        } else {
          dout(20) << __func__ << " missing digest but age " << age
                   << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
                   << " on " << *k << dendl;
        }
      }
    }
out:
    if (object_error.has_deep_errors())
      ++deep_errors;
    else if (object_error.has_shallow_errors())
      ++shallow_errors;
    if (object_error.errors || object_error.union_shards.errors) {
      store->add_object_error(k->pool, object_error);
    }
  }
}
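
// Editorial note: be_compare_scrubmaps() distills the per-shard ScrubMaps
// into the repair inputs used by the caller: `missing` and `inconsistent`
// name the shards to fix per object, `authoritative` lists the shards whose
// copy should serve as the source, and `missing_digest` carries digests to
// (re)record in the object_info when the on-disk copies agree but the
// recorded digest is absent or stale (subject to
// osd_deep_scrub_update_digest_min_age unless repair forces the update).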

void PGBackend::be_large_omap_check(const map<pg_shard_t,ScrubMap*> &maps,
                                    const set<hobject_t> &master_set,
                                    int& large_omap_objects,
                                    ostream &warnstream) const
{
  bool needs_check = false;
  for (const auto& map : maps) {
    if (map.second->has_large_omap_object_errors) {
      needs_check = true;
      break;
    }
  }

  if (!needs_check) {
    return;
  }

  // Iterate through objects and check large omap object flag
  for (const auto& k : master_set) {
    for (const auto& map : maps) {
      ScrubMap::object& obj = map.second->objects[k];
      if (obj.large_omap_object_found) {
        large_omap_objects++;
        warnstream << "Large omap object found. Object: " << k << " Key count: "
                   << obj.large_omap_object_key_count << " Size (bytes): "
                   << obj.large_omap_object_value_size << '\n';
        break;
      }
    }
  }
}