]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.cc
9ae6b2c14e5c2670dcffbf70984cc9fbdda801c6
[ceph.git] / ceph / src / osd / PGBackend.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "osd/scrubber/ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
25 #include "OSD.h"
26 #include "erasure-code/ErasureCodePlugin.h"
27 #include "OSDMap.h"
28 #include "PGLog.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
32
33 using std::less;
34 using std::list;
35 using std::make_pair;
36 using std::map;
37 using std::ostream;
38 using std::ostringstream;
39 using std::pair;
40 using std::set;
41 using std::string;
42 using std::stringstream;
43 using std::vector;
44
45 using ceph::bufferlist;
46 using ceph::bufferptr;
47 using ceph::ErasureCodeProfile;
48 using ceph::ErasureCodeInterfaceRef;
49
50 #define dout_context cct
51 #define dout_subsys ceph_subsys_osd
52 #define DOUT_PREFIX_ARGS this
53 #undef dout_prefix
54 #define dout_prefix _prefix(_dout, this)
// dout prefix helper: delegates to the owning PG's debug-prefix generator
// so every PGBackend log line carries the standard PG context string.
static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
  return pgb->get_parent()->gen_dbg_prefix(*_dout);
}
58
59 void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
60 RecoveryHandle *h)
61 {
62 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
63 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
64 if (shard == get_parent()->whoami_shard())
65 continue;
66 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
67 dout(20) << __func__ << " will remove " << oid << " " << v << " from "
68 << shard << dendl;
69 h->deletes[shard].push_back(make_pair(oid, v));
70 get_parent()->begin_peer_recover(shard, oid);
71 }
72 }
73 }
74
75 void PGBackend::send_recovery_deletes(int prio,
76 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
77 {
78 epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
79 for (const auto& p : deletes) {
80 const auto& shard = p.first;
81 const auto& objects = p.second;
82 ConnectionRef con = get_parent()->get_con_osd_cluster(
83 shard.osd,
84 get_osdmap_epoch());
85 if (!con)
86 continue;
87 auto it = objects.begin();
88 while (it != objects.end()) {
89 uint64_t cost = 0;
90 uint64_t deletes = 0;
91 spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
92 MOSDPGRecoveryDelete *msg =
93 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
94 target_pg,
95 get_osdmap_epoch(),
96 min_epoch);
97 msg->set_priority(prio);
98
99 while (it != objects.end() &&
100 cost < cct->_conf->osd_max_push_cost &&
101 deletes < cct->_conf->osd_max_push_objects) {
102 dout(20) << __func__ << ": sending recovery delete << " << it->first
103 << " " << it->second << " to osd." << shard << dendl;
104 msg->objects.push_back(*it);
105 cost += cct->_conf->osd_push_per_object_cost;
106 ++deletes;
107 ++it;
108 }
109
110 msg->set_cost(cost);
111 get_parent()->send_message_osd_cluster(msg, con);
112 }
113 }
114 }
115
116 bool PGBackend::handle_message(OpRequestRef op)
117 {
118 switch (op->get_req()->get_type()) {
119 case MSG_OSD_PG_RECOVERY_DELETE:
120 handle_recovery_delete(op);
121 return true;
122
123 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
124 handle_recovery_delete_reply(op);
125 return true;
126
127 default:
128 break;
129 }
130
131 return _handle_message(op);
132 }
133
// Handle MSG_OSD_PG_RECOVERY_DELETE from the primary: locally remove each
// listed object, then acknowledge with a MOSDPGRecoveryDeleteReply once all
// removals have completed.
void PGBackend::handle_recovery_delete(OpRequestRef op)
{
  auto m = op->get_req<MOSDPGRecoveryDelete>();
  ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
  dout(20) << __func__ << " " << op << dendl;

  op->mark_started();

  // One gather sub-context per object; the finisher below fires only after
  // every removal has completed.
  C_GatherBuilder gather(cct);
  for (const auto &p : m->objects) {
    get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
  }

  // Build the reply up front; it echoes the request's object list and epochs.
  auto reply = make_message<MOSDPGRecoveryDeleteReply>();
  reply->from = get_parent()->whoami_shard();
  reply->set_priority(m->get_priority());
  reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
  reply->map_epoch = m->map_epoch;
  reply->min_epoch = m->min_epoch;
  reply->objects = m->objects;
  ConnectionRef conn = m->get_connection();

  // NOTE(review): the [=] capture copies the ref-counted 'reply' and 'conn'
  // into the closure so they outlive this stack frame.  On -EAGAIN the reply
  // is dropped (put) instead of sent -- presumably the interval changed and
  // the primary will retry; confirm ownership semantics against
  // send_message_osd_cluster before modifying this.
  gather.set_finisher(new LambdaContext(
    [=](int r) {
      if (r != -EAGAIN) {
	get_parent()->send_message_osd_cluster(reply, conn.get());
      } else {
	reply->put();
      }
    }));
  gather.activate();
}
166
167 void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
168 {
169 auto m = op->get_req<MOSDPGRecoveryDeleteReply>();
170 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
171 dout(20) << __func__ << " " << op << dendl;
172
173 for (const auto &p : m->objects) {
174 ObjectRecoveryInfo recovery_info;
175 hobject_t oid = p.first;
176 recovery_info.version = p.second;
177 get_parent()->on_peer_recover(m->from, oid, recovery_info);
178 bool peers_recovered = true;
179 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
180 if (shard == get_parent()->whoami_shard())
181 continue;
182 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
183 dout(20) << __func__ << " " << oid << " still missing on at least "
184 << shard << dendl;
185 peers_recovered = false;
186 break;
187 }
188 }
189 if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
190 dout(20) << __func__ << " completed recovery, local_missing = "
191 << get_parent()->get_local_missing() << dendl;
192 object_stat_sum_t stat_diff;
193 stat_diff.num_objects_recovered = 1;
194 get_parent()->on_global_recover(p.first, stat_diff, true);
195 }
196 }
197 }
198
// Undo the effects of a (rollbackable) log entry by translating its
// ObjectModDesc into compensating ObjectStore ops, which are appended to *t.
void PGBackend::rollback(
  const pg_log_entry_t &entry,
  ObjectStore::Transaction *t)
{

  // Visitor translating each recorded mod-desc op into the ObjectStore ops
  // that undo it.  Note the temp.append(t); temp.swap(t); sequence in every
  // method: each newly generated op is PREPENDED to the accumulated
  // transaction, so the final transaction applies the undo steps in reverse
  // of visit order.  Do not reorder these statements.
  struct RollbackVisitor : public ObjectModDesc::Visitor {
    const hobject_t &hoid;       // object being rolled back
    PGBackend *pg;               // backend providing the rollback primitives
    ObjectStore::Transaction t;  // accumulated rollback ops
    RollbackVisitor(
      const hobject_t &hoid,
      PGBackend *pg) : hoid(hoid), pg(pg) {}
    // Undo an append: truncate back to the pre-append size.
    void append(uint64_t old_size) override {
      ObjectStore::Transaction temp;
      pg->rollback_append(hoid, old_size, &temp);
      temp.append(t);
      temp.swap(t);
    }
    // Undo attr changes: restore saved values, remove attrs that were absent.
    void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
      ObjectStore::Transaction temp;
      pg->rollback_setattrs(hoid, attrs, &temp);
      temp.append(t);
      temp.swap(t);
    }
    // Undo a delete: move the stashed old version back into place.
    void rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    // Like rmobject, but tolerates the stashed version being absent.
    void try_rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_try_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    // Undo a create via rollback_create.
    void create() override {
      ObjectStore::Transaction temp;
      pg->rollback_create(hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
    // Restore the object's snap mapping entries.
    void update_snaps(const set<snapid_t> &snaps) override {
      ObjectStore::Transaction temp;
      pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
      temp.append(t);
      temp.swap(t);
    }
    // Undo partial overwrites: clone the saved extents back over the head.
    void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) override {
      ObjectStore::Transaction temp;
      pg->rollback_extents(gen, extents, hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
  };

  ceph_assert(entry.mod_desc.can_rollback());
  RollbackVisitor vis(entry.soid, this);
  entry.mod_desc.visit(&vis);
  t->append(vis.t);
}
262
263 struct Trimmer : public ObjectModDesc::Visitor {
264 const hobject_t &soid;
265 PGBackend *pg;
266 ObjectStore::Transaction *t;
267 Trimmer(
268 const hobject_t &soid,
269 PGBackend *pg,
270 ObjectStore::Transaction *t)
271 : soid(soid), pg(pg), t(t) {}
272 void rmobject(version_t old_version) override {
273 pg->trim_rollback_object(
274 soid,
275 old_version,
276 t);
277 }
278 // try_rmobject defaults to rmobject
279 void rollback_extents(
280 version_t gen,
281 const vector<pair<uint64_t, uint64_t> > &extents) override {
282 pg->trim_rollback_object(
283 soid,
284 gen,
285 t);
286 }
287 };
288
289 void PGBackend::rollforward(
290 const pg_log_entry_t &entry,
291 ObjectStore::Transaction *t)
292 {
293 auto dpp = get_parent()->get_dpp();
294 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
295 if (!entry.can_rollback())
296 return;
297 Trimmer trimmer(entry.soid, this, t);
298 entry.mod_desc.visit(&trimmer);
299 }
300
301 void PGBackend::trim(
302 const pg_log_entry_t &entry,
303 ObjectStore::Transaction *t)
304 {
305 if (!entry.can_rollback())
306 return;
307 Trimmer trimmer(entry.soid, this, t);
308 entry.mod_desc.visit(&trimmer);
309 }
310
311 void PGBackend::try_stash(
312 const hobject_t &hoid,
313 version_t v,
314 ObjectStore::Transaction *t)
315 {
316 t->try_rename(
317 coll,
318 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
319 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
320 }
321
322 void PGBackend::remove(
323 const hobject_t &hoid,
324 ObjectStore::Transaction *t) {
325 ceph_assert(!hoid.is_temp());
326 t->remove(
327 coll,
328 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
329 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
330 }
331
332 void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
333 {
334 dout(10) << __func__ << dendl;
335 // clear temp
336 for (set<hobject_t>::iterator i = temp_contents.begin();
337 i != temp_contents.end();
338 ++i) {
339 dout(10) << __func__ << ": Removing oid "
340 << *i << " from the temp collection" << dendl;
341 t->remove(
342 coll,
343 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
344 }
345 temp_contents.clear();
346 }
347
348 int PGBackend::objects_list_partial(
349 const hobject_t &begin,
350 int min,
351 int max,
352 vector<hobject_t> *ls,
353 hobject_t *next)
354 {
355 ceph_assert(ls);
356 // Starts with the smallest generation to make sure the result list
357 // has the marker object (it might have multiple generations
358 // though, which would be filtered).
359 ghobject_t _next;
360 if (!begin.is_min())
361 _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
362 ls->reserve(max);
363 int r = 0;
364
365 if (min > max)
366 min = max;
367
368 while (!_next.is_max() && ls->size() < (unsigned)min) {
369 vector<ghobject_t> objects;
370 if (HAVE_FEATURE(parent->min_upacting_features(),
371 OSD_FIXED_COLLECTION_LIST)) {
372 r = store->collection_list(
373 ch,
374 _next,
375 ghobject_t::get_max(),
376 max - ls->size(),
377 &objects,
378 &_next);
379 } else {
380 r = store->collection_list_legacy(
381 ch,
382 _next,
383 ghobject_t::get_max(),
384 max - ls->size(),
385 &objects,
386 &_next);
387 }
388 if (r != 0) {
389 derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
390 break;
391 }
392 for (vector<ghobject_t>::iterator i = objects.begin();
393 i != objects.end();
394 ++i) {
395 if (i->is_pgmeta() || i->hobj.is_temp()) {
396 continue;
397 }
398 if (i->is_no_gen()) {
399 ls->push_back(i->hobj);
400 }
401 }
402 }
403 if (r == 0)
404 *next = _next.hobj;
405 return r;
406 }
407
408 int PGBackend::objects_list_range(
409 const hobject_t &start,
410 const hobject_t &end,
411 vector<hobject_t> *ls,
412 vector<ghobject_t> *gen_obs)
413 {
414 ceph_assert(ls);
415 vector<ghobject_t> objects;
416 int r;
417 if (HAVE_FEATURE(parent->min_upacting_features(),
418 OSD_FIXED_COLLECTION_LIST)) {
419 r = store->collection_list(
420 ch,
421 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
422 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
423 INT_MAX,
424 &objects,
425 NULL);
426 } else {
427 r = store->collection_list_legacy(
428 ch,
429 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
430 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
431 INT_MAX,
432 &objects,
433 NULL);
434 }
435 ls->reserve(objects.size());
436 for (vector<ghobject_t>::iterator i = objects.begin();
437 i != objects.end();
438 ++i) {
439 if (i->is_pgmeta() || i->hobj.is_temp()) {
440 continue;
441 }
442 if (i->is_no_gen()) {
443 ls->push_back(i->hobj);
444 } else if (gen_obs) {
445 gen_obs->push_back(*i);
446 }
447 }
448 return r;
449 }
450
451 int PGBackend::objects_get_attr(
452 const hobject_t &hoid,
453 const string &attr,
454 bufferlist *out)
455 {
456 bufferptr bp;
457 int r = store->getattr(
458 ch,
459 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
460 attr.c_str(),
461 bp);
462 if (r >= 0 && out) {
463 out->clear();
464 out->push_back(std::move(bp));
465 }
466 return r;
467 }
468
469 int PGBackend::objects_get_attrs(
470 const hobject_t &hoid,
471 map<string, bufferlist, less<>> *out)
472 {
473 return store->getattrs(
474 ch,
475 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
476 *out);
477 }
478
479 void PGBackend::rollback_setattrs(
480 const hobject_t &hoid,
481 map<string, std::optional<bufferlist> > &old_attrs,
482 ObjectStore::Transaction *t) {
483 map<string, bufferlist, less<>> to_set;
484 ceph_assert(!hoid.is_temp());
485 for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
486 i != old_attrs.end();
487 ++i) {
488 if (i->second) {
489 to_set[i->first] = *(i->second);
490 } else {
491 t->rmattr(
492 coll,
493 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
494 i->first);
495 }
496 }
497 t->setattrs(
498 coll,
499 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
500 to_set);
501 }
502
503 void PGBackend::rollback_append(
504 const hobject_t &hoid,
505 uint64_t old_size,
506 ObjectStore::Transaction *t) {
507 ceph_assert(!hoid.is_temp());
508 t->truncate(
509 coll,
510 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
511 old_size);
512 }
513
514 void PGBackend::rollback_stash(
515 const hobject_t &hoid,
516 version_t old_version,
517 ObjectStore::Transaction *t) {
518 ceph_assert(!hoid.is_temp());
519 t->remove(
520 coll,
521 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
522 t->collection_move_rename(
523 coll,
524 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
525 coll,
526 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
527 }
528
529 void PGBackend::rollback_try_stash(
530 const hobject_t &hoid,
531 version_t old_version,
532 ObjectStore::Transaction *t) {
533 ceph_assert(!hoid.is_temp());
534 t->remove(
535 coll,
536 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
537 t->try_rename(
538 coll,
539 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
540 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
541 }
542
543 void PGBackend::rollback_extents(
544 version_t gen,
545 const vector<pair<uint64_t, uint64_t> > &extents,
546 const hobject_t &hoid,
547 ObjectStore::Transaction *t) {
548 auto shard = get_parent()->whoami_shard().shard;
549 for (auto &&extent: extents) {
550 t->clone_range(
551 coll,
552 ghobject_t(hoid, gen, shard),
553 ghobject_t(hoid, ghobject_t::NO_GEN, shard),
554 extent.first,
555 extent.second,
556 extent.first);
557 }
558 t->remove(
559 coll,
560 ghobject_t(hoid, gen, shard));
561 }
562
563 void PGBackend::trim_rollback_object(
564 const hobject_t &hoid,
565 version_t old_version,
566 ObjectStore::Transaction *t) {
567 ceph_assert(!hoid.is_temp());
568 t->remove(
569 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
570 }
571
572 PGBackend *PGBackend::build_pg_backend(
573 const pg_pool_t &pool,
574 const map<string,string>& profile,
575 Listener *l,
576 coll_t coll,
577 ObjectStore::CollectionHandle &ch,
578 ObjectStore *store,
579 CephContext *cct)
580 {
581 ErasureCodeProfile ec_profile = profile;
582 switch (pool.type) {
583 case pg_pool_t::TYPE_REPLICATED: {
584 return new ReplicatedBackend(l, coll, ch, store, cct);
585 }
586 case pg_pool_t::TYPE_ERASURE: {
587 ErasureCodeInterfaceRef ec_impl;
588 stringstream ss;
589 ceph::ErasureCodePluginRegistry::instance().factory(
590 profile.find("plugin")->second,
591 cct->_conf.get_val<std::string>("erasure_code_dir"),
592 ec_profile,
593 &ec_impl,
594 &ss);
595 ceph_assert(ec_impl);
596 return new ECBackend(
597 l,
598 coll,
599 ch,
600 store,
601 cct,
602 ec_impl,
603 pool.stripe_width);
604 }
605 default:
606 ceph_abort();
607 return NULL;
608 }
609 }
610
// Scrub the current object (pos.ls[pos.pos]): stat it, collect its size and
// attrs into 'map', and deep-scrub it if requested.  Returns -EINPROGRESS
// when the deep scrub has more work left for this object (pos is NOT
// advanced), otherwise 0 after advancing pos to the next object.
int PGBackend::be_scan_list(
  ScrubMap &map,
  ScrubMapBuilder &pos)
{
  dout(10) << __func__ << " " << pos << dendl;
  ceph_assert(!pos.done());
  ceph_assert(pos.pos < pos.ls.size());
  hobject_t& poid = pos.ls[pos.pos];

  struct stat st;
  int r = store->stat(
    ch,
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    &st,
    true);
  if (r == 0) {
    ScrubMap::object &o = map.objects[poid];
    o.size = st.st_size;
    ceph_assert(!o.negative);
    store->getattrs(
      ch,
      ghobject_t(
	poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
      o.attrs);

    if (pos.deep) {
      // may set r to -EINPROGRESS; handled after the if/else chain so that
      // pos is not advanced while the deep scrub of this object continues
      r = be_deep_scrub(poid, map, pos, o);
    }
    dout(25) << __func__ << " " << poid << dendl;
  } else if (r == -ENOENT) {
    // object disappeared between listing and scrubbing; not an error
    dout(25) << __func__ << " " << poid << " got " << r
	     << ", skipping" << dendl;
  } else if (r == -EIO) {
    // record the read failure so it surfaces as a scrub stat error
    dout(25) << __func__ << " " << poid << " got " << r
	     << ", stat_error" << dendl;
    ScrubMap::object &o = map.objects[poid];
    o.stat_error = true;
  } else {
    // any other stat failure is unexpected and fatal
    derr << __func__ << " got: " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  if (r == -EINPROGRESS) {
    return -EINPROGRESS;
  }
  pos.next_object();
  return 0;
}
659
// Compare a candidate shard's scrubbed object against the selected
// authoritative copy (and, for replicated pools, against the authoritative
// object_info digests).  Per-shard problems are recorded in shard_result,
// object-wide inconsistencies in obj_result, and a comma-separated
// human-readable description is appended to errorstream.  Returns true iff
// at least one error was found.
bool PGBackend::be_compare_scrub_objects(
  pg_shard_t auth_shard,
  const ScrubMap::object &auth,
  const object_info_t& auth_oi,
  const ScrubMap::object &candidate,
  shard_info_wrapper &shard_result,
  inconsistent_obj_wrapper &obj_result,
  ostream &errorstream,
  bool has_snapset)
{
  enum { CLEAN, FOUND_ERROR } error = CLEAN;
  // data digest vs the authoritative shard (only when both were computed)
  if (auth.digest_present && candidate.digest_present) {
    if (auth.digest != candidate.digest) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "data_digest 0x" << std::hex << candidate.digest
		  << " != data_digest 0x" << auth.digest << std::dec
		  << " from shard " << auth_shard;
      obj_result.set_data_digest_mismatch();
    }
  }
  // omap digest vs the authoritative shard
  if (auth.omap_digest_present && candidate.omap_digest_present) {
    if (auth.omap_digest != candidate.omap_digest) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
		  << " != omap_digest 0x" << auth.omap_digest << std::dec
		  << " from shard " << auth_shard;
      obj_result.set_omap_digest_mismatch();
    }
  }
  // for replicated pools also compare against the digests recorded in the
  // authoritative object_info
  if (parent->get_pool().is_replicated()) {
    if (auth_oi.is_data_digest() && candidate.digest_present) {
      if (auth_oi.data_digest != candidate.digest) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	errorstream << "data_digest 0x" << std::hex << candidate.digest
		    << " != data_digest 0x" << auth_oi.data_digest << std::dec
		    << " from auth oi " << auth_oi;
	shard_result.set_data_digest_mismatch_info();
      }
    }
    if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
      if (auth_oi.omap_digest != candidate.omap_digest) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
		    << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
		    << " from auth oi " << auth_oi;
	shard_result.set_omap_digest_mismatch_info();
      }
    }
  }
  // a stat error makes all further attr/size checks meaningless
  if (candidate.stat_error)
    return error == FOUND_ERROR;
  // byte-compare the object_info attr (only when it was readable)
  if (!shard_result.has_info_missing()
      && !shard_result.has_info_corrupted()) {
    bufferlist can_bl, auth_bl;
    auto can_attr = candidate.attrs.find(OI_ATTR);
    auto auth_attr = auth.attrs.find(OI_ATTR);

    ceph_assert(auth_attr != auth.attrs.end());
    ceph_assert(can_attr != candidate.attrs.end());

    can_bl.push_back(can_attr->second);
    auth_bl.push_back(auth_attr->second);
    if (!can_bl.contents_equal(auth_bl)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      obj_result.set_object_info_inconsistency();
      errorstream << "object info inconsistent ";
    }
  }
  // byte-compare the snapset attr (head objects only)
  if (has_snapset) {
    if (!shard_result.has_snapset_missing()
	&& !shard_result.has_snapset_corrupted()) {
      bufferlist can_bl, auth_bl;
      auto can_attr = candidate.attrs.find(SS_ATTR);
      auto auth_attr = auth.attrs.find(SS_ATTR);

      ceph_assert(auth_attr != auth.attrs.end());
      ceph_assert(can_attr != candidate.attrs.end());

      can_bl.push_back(can_attr->second);
      auth_bl.push_back(auth_attr->second);
      if (!can_bl.contents_equal(auth_bl)) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	obj_result.set_snapset_inconsistency();
	errorstream << "snapset inconsistent ";
      }
    }
  }
  // byte-compare the EC hash-info attr (erasure pools only)
  if (parent->get_pool().is_erasure()) {
    if (!shard_result.has_hinfo_missing()
	&& !shard_result.has_hinfo_corrupted()) {
      bufferlist can_bl, auth_bl;
      auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
      auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());

      ceph_assert(auth_hi != auth.attrs.end());
      ceph_assert(can_hi != candidate.attrs.end());

      can_bl.push_back(can_hi->second);
      auth_bl.push_back(auth_hi->second);
      if (!can_bl.contents_equal(auth_bl)) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	obj_result.set_hinfo_inconsistency();
	errorstream << "hinfo inconsistent ";
      }
    }
  }
  // size vs authoritative object_info (converted to on-disk size)
  uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
  if (oi_size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " != size " << oi_size
		<< " from auth oi " << auth_oi;
    shard_result.set_size_mismatch_info();
  }
  // size vs authoritative shard
  if (auth.size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " != size " << auth.size
		<< " from shard " << auth_shard;
    obj_result.set_size_mismatch();
  }
  // If the replica is too large and we didn't already count it for this object
  //
  if (candidate.size > cct->_conf->osd_max_object_size
      && !obj_result.has_size_too_large()) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " > " << cct->_conf->osd_max_object_size
		<< " is too large";
    obj_result.set_size_too_large();
  }
  // user xattrs present on auth: flag names missing or values differing on
  // the candidate
  for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
       i != auth.attrs.end();
       ++i) {
    // We check system keys separately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!candidate.attrs.count(i->first)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr value mismatch '" << i->first << "'";
      obj_result.set_attr_value_mismatch();
    }
  }
  // user xattrs present on the candidate but absent on auth
  for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
       i != candidate.attrs.end();
       ++i) {
    // We check system keys separately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!auth.attrs.count(i->first)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    }
  }
  return error == FOUND_ERROR;
}
847
848 static int dcount(const object_info_t &oi)
849 {
850 int count = 0;
851 if (oi.is_data_digest())
852 count++;
853 if (oi.is_omap_digest())
854 count++;
855 return count;
856 }
857
858 map<pg_shard_t, ScrubMap *>::const_iterator
859 PGBackend::be_select_auth_object(
860 const hobject_t &obj,
861 const map<pg_shard_t,ScrubMap*> &maps,
862 object_info_t *auth_oi,
863 map<pg_shard_t, shard_info_wrapper> &shard_map,
864 bool &digest_match,
865 spg_t pgid,
866 ostream &errorstream)
867 {
868 eversion_t auth_version;
869
870 // Create list of shards with primary first so it will be auth copy all
871 // other things being equal.
872 list<pg_shard_t> shards;
873 for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
874 j != maps.end();
875 ++j) {
876 if (j->first == get_parent()->whoami_shard())
877 continue;
878 shards.push_back(j->first);
879 }
880 shards.push_front(get_parent()->whoami_shard());
881
882 map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
883 digest_match = true;
884 for (auto &l : shards) {
885 ostringstream shard_errorstream;
886 bool error = false;
887 map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
888 map<hobject_t, ScrubMap::object>::iterator i =
889 j->second->objects.find(obj);
890 if (i == j->second->objects.end()) {
891 continue;
892 }
893 auto& shard_info = shard_map[j->first];
894 if (j->first == get_parent()->whoami_shard())
895 shard_info.primary = true;
896 if (i->second.read_error) {
897 shard_info.set_read_error();
898 if (error)
899 shard_errorstream << ", ";
900 error = true;
901 shard_errorstream << "candidate had a read error";
902 }
903 if (i->second.ec_hash_mismatch) {
904 shard_info.set_ec_hash_mismatch();
905 if (error)
906 shard_errorstream << ", ";
907 error = true;
908 shard_errorstream << "candidate had an ec hash mismatch";
909 }
910 if (i->second.ec_size_mismatch) {
911 shard_info.set_ec_size_mismatch();
912 if (error)
913 shard_errorstream << ", ";
914 error = true;
915 shard_errorstream << "candidate had an ec size mismatch";
916 }
917
918 object_info_t oi;
919 bufferlist bl;
920 map<string, bufferptr>::iterator k;
921 SnapSet ss;
922 bufferlist ss_bl, hk_bl;
923
924 if (i->second.stat_error) {
925 shard_info.set_stat_error();
926 if (error)
927 shard_errorstream << ", ";
928 error = true;
929 shard_errorstream << "candidate had a stat error";
930 // With stat_error no further checking
931 // We don't need to also see a missing_object_info_attr
932 goto out;
933 }
934
935 // We won't pick an auth copy if the snapset is missing or won't decode.
936 ceph_assert(!obj.is_snapdir());
937 if (obj.is_head()) {
938 k = i->second.attrs.find(SS_ATTR);
939 if (k == i->second.attrs.end()) {
940 shard_info.set_snapset_missing();
941 if (error)
942 shard_errorstream << ", ";
943 error = true;
944 shard_errorstream << "candidate had a missing snapset key";
945 } else {
946 ss_bl.push_back(k->second);
947 try {
948 auto bliter = ss_bl.cbegin();
949 decode(ss, bliter);
950 } catch (...) {
951 // invalid snapset, probably corrupt
952 shard_info.set_snapset_corrupted();
953 if (error)
954 shard_errorstream << ", ";
955 error = true;
956 shard_errorstream << "candidate had a corrupt snapset";
957 }
958 }
959 }
960
961 if (parent->get_pool().is_erasure()) {
962 ECUtil::HashInfo hi;
963 k = i->second.attrs.find(ECUtil::get_hinfo_key());
964 if (k == i->second.attrs.end()) {
965 shard_info.set_hinfo_missing();
966 if (error)
967 shard_errorstream << ", ";
968 error = true;
969 shard_errorstream << "candidate had a missing hinfo key";
970 } else {
971 hk_bl.push_back(k->second);
972 try {
973 auto bliter = hk_bl.cbegin();
974 decode(hi, bliter);
975 } catch (...) {
976 // invalid snapset, probably corrupt
977 shard_info.set_hinfo_corrupted();
978 if (error)
979 shard_errorstream << ", ";
980 error = true;
981 shard_errorstream << "candidate had a corrupt hinfo";
982 }
983 }
984 }
985
986 k = i->second.attrs.find(OI_ATTR);
987 if (k == i->second.attrs.end()) {
988 // no object info on object, probably corrupt
989 shard_info.set_info_missing();
990 if (error)
991 shard_errorstream << ", ";
992 error = true;
993 shard_errorstream << "candidate had a missing info key";
994 goto out;
995 }
996 bl.push_back(k->second);
997 try {
998 auto bliter = bl.cbegin();
999 decode(oi, bliter);
1000 } catch (...) {
1001 // invalid object info, probably corrupt
1002 shard_info.set_info_corrupted();
1003 if (error)
1004 shard_errorstream << ", ";
1005 error = true;
1006 shard_errorstream << "candidate had a corrupt info";
1007 goto out;
1008 }
1009
1010 // This is automatically corrected in PG::_repair_oinfo_oid()
1011 ceph_assert(oi.soid == obj);
1012
1013 if (i->second.size != be_get_ondisk_size(oi.size)) {
1014 shard_info.set_obj_size_info_mismatch();
1015 if (error)
1016 shard_errorstream << ", ";
1017 error = true;
1018 shard_errorstream << "candidate size " << i->second.size << " info size "
1019 << oi.size << " mismatch";
1020 }
1021
1022 // digest_match will only be true if computed digests are the same
1023 if (auth_version != eversion_t()
1024 && auth->second->objects[obj].digest_present
1025 && i->second.digest_present
1026 && auth->second->objects[obj].digest != i->second.digest) {
1027 digest_match = false;
1028 dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
1029 << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
1030 << dendl;
1031 }
1032
1033 // Don't use this particular shard due to previous errors
1034 // XXX: For now we can't pick one shard for repair and another's object info or snapset
1035 if (shard_info.errors)
1036 goto out;
1037
1038 if (auth_version == eversion_t() || oi.version > auth_version ||
1039 (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
1040 auth = j;
1041 *auth_oi = oi;
1042 auth_version = oi.version;
1043 }
1044
1045 out:
1046 if (error)
1047 errorstream << pgid.pgid << " shard " << l << " soid " << obj
1048 << " : " << shard_errorstream.str() << "\n";
1049 // Keep scanning other shards
1050 }
1051 dout(10) << __func__ << ": selecting osd " << auth->first
1052 << " for obj " << obj
1053 << " with oi " << *auth_oi
1054 << dendl;
1055 return auth;
1056 }
1057
// Compare the per-shard scrub maps for every object in master_set against an
// authoritative copy and record all inconsistencies found.
//
// For each object, be_select_auth_object() picks the shard whose object_info
// is most trustworthy; every other shard's copy of the object is then compared
// against it with be_compare_scrub_objects().  Results are accumulated into
// the output parameters:
//   missing        - per object, the shards the object is absent from
//   inconsistent   - per object, the shards whose copy disagrees
//   authoritative  - per object, shards usable as a repair source
//   missing_digest - (data digest, omap digest) pairs that should be
//                    (re)recorded in the object_info, either to repair a
//                    mismatch or to fill in a digest the oi lacks
//   shallow_errors / deep_errors - running scrub error counters
// Human-readable details are appended to errorstream, and per-object error
// records are persisted through the Scrub::Store.
// NOTE: the 'acting' parameter is not referenced in this function body.
void PGBackend::be_compare_scrubmaps(
  const map<pg_shard_t,ScrubMap*> &maps,
  const set<hobject_t> &master_set,
  bool repair,
  map<hobject_t, set<pg_shard_t>> &missing,
  map<hobject_t, set<pg_shard_t>> &inconsistent,
  map<hobject_t, list<pg_shard_t>> &authoritative,
  map<hobject_t, pair<std::optional<uint32_t>,
                      std::optional<uint32_t>>> &missing_digest,
  int &shallow_errors, int &deep_errors,
  Scrub::Store *store,
  const spg_t& pgid,
  const vector<int> &acting,
  ostream &errorstream)
{
  // Captured once; used below to age-gate opportunistic digest updates.
  utime_t now = ceph_clock_now();

  // Check maps against master set and each other
  for (set<hobject_t>::const_iterator k = master_set.begin();
       k != master_set.end();
       ++k) {
    object_info_t auth_oi;
    map<pg_shard_t, shard_info_wrapper> shard_map;

    inconsistent_obj_wrapper object_error{*k};

    // Pick the shard with the most trustworthy object_info for this object.
    // digest_match and shard_map (per-shard error flags) are filled in as a
    // side effect of the selection.
    bool digest_match;
    map<pg_shard_t, ScrubMap *>::const_iterator auth =
      be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match,
                            pgid, errorstream);

    list<pg_shard_t> auth_list;      // shards with a clean, matching copy
    set<pg_shard_t> object_errors;   // shards with errors not pinned to a shard
    if (auth == maps.end()) {
      // No usable object_info anywhere: record the failure for every shard
      // and move on to the next object.
      object_error.set_version(0);
      object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
                                    deep_errors, get_parent()->whoami_shard());
      if (object_error.has_deep_errors())
        ++deep_errors;
      else if (object_error.has_shallow_errors())
        ++shallow_errors;
      store->add_object_error(k->pool, object_error);
      errorstream << pgid.pgid << " soid " << *k
                  << " : failed to pick suitable object info\n";
      continue;
    }
    object_error.set_version(auth_oi.user_version);
    ScrubMap::object& auth_object = auth->second->objects[*k];
    set<pg_shard_t> cur_missing;
    set<pg_shard_t> cur_inconsistent;
    // Set when the recorded oi digest (not the object data) is what needs
    // repair; handled via missing_digest further down.
    bool fix_digest = false;

    // Compare every shard's copy (including the auth shard itself) against
    // the authoritative object.
    for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
      if (j == auth)
        shard_map[auth->first].selected_oi = true;
      if (j->second->objects.count(*k)) {
        shard_map[j->first].set_object(j->second->objects[*k]);
        // Compare
        stringstream ss;
        bool found = be_compare_scrub_objects(auth->first,
                                              auth_object,
                                              auth_oi,
                                              j->second->objects[*k],
                                              shard_map[j->first],
                                              object_error,
                                              ss,
                                              k->has_snapset());

        dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
                 << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
                 << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
                 << dendl;
        // If all replicas match, but they don't match object_info we can
        // repair it by using missing_digest mechanism
        if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
            && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
            && auth_object.digest_present) {
          // Set in missing_digests
          fix_digest = true;
          // Clear the error
          shard_map[j->first].clear_data_digest_mismatch_info();
          errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n";
        }
        // Some errors might have already been set in be_select_auth_object()
        if (shard_map[j->first].errors != 0) {
          cur_inconsistent.insert(j->first);
          if (shard_map[j->first].has_deep_errors())
            ++deep_errors;
          else
            ++shallow_errors;
          // Only true if be_compare_scrub_objects() found errors and put something
          // in ss.
          if (found)
            errorstream << pgid << " shard " << j->first << " soid " << *k
                        << " : " << ss.str() << "\n";
        } else if (found) {
          // Track possible shard to use as authoritative, if needed
          // There are errors, without identifying the shard
          object_errors.insert(j->first);
          errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n";
        } else {
          // XXX: The auth shard might get here that we don't know
          // that it has the "correct" data.
          auth_list.push_back(j->first);
        }
      } else {
        // Shard j has no copy of this object at all.
        cur_missing.insert(j->first);
        shard_map[j->first].set_missing();
        shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
        // Can't have any other errors if there is no information available
        ++shallow_errors;
        errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n";
      }
      object_error.add_shard(j->first, shard_map[j->first]);
    }

    if (auth_list.empty()) {
      if (object_errors.empty()) {
        errorstream << pgid.pgid << " soid " << *k
                    << " : failed to pick suitable auth object\n";
        goto out;
      }
      // Object errors exist and nothing in auth_list
      // Prefer the auth shard otherwise take first from list.
      pg_shard_t shard;
      if (object_errors.count(auth->first)) {
        shard = auth->first;
      } else {
        shard = *(object_errors.begin());
      }
      auth_list.push_back(shard);
      object_errors.erase(shard);
    }
    // At this point auth_list is populated, so we add the object errors shards
    // as inconsistent.
    cur_inconsistent.insert(object_errors.begin(), object_errors.end());
    if (!cur_missing.empty()) {
      missing[*k] = cur_missing;
    }
    if (!cur_inconsistent.empty()) {
      inconsistent[*k] = cur_inconsistent;
    }

    if (fix_digest) {
      // Recorded oi digest was wrong but the data itself is consistent:
      // queue the on-disk digest(s) so the oi can be corrected.
      std::optional<uint32_t> data_digest, omap_digest;
      ceph_assert(auth_object.digest_present);
      data_digest = auth_object.digest;
      if (auth_object.omap_digest_present) {
        omap_digest = auth_object.omap_digest;
      }
      missing_digest[*k] = make_pair(data_digest, omap_digest);
    }
    if (!cur_inconsistent.empty() || !cur_missing.empty()) {
      authoritative[*k] = auth_list;
    } else if (!fix_digest && parent->get_pool().is_replicated()) {
      // No inconsistencies; decide whether to opportunistically record
      // digests in the object_info.
      enum {
        NO = 0,
        MAYBE = 1,  // digest absent from oi; update only if object is old enough
        FORCE = 2,  // recorded digest wrong and repair requested; always update
      } update = NO;

      if (auth_object.digest_present && !auth_oi.is_data_digest()) {
        dout(20) << __func__ << " missing data digest on " << *k << dendl;
        update = MAYBE;
      }
      if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
        dout(20) << __func__ << " missing omap digest on " << *k << dendl;
        update = MAYBE;
      }

      // recorded digest != actual digest?
      if (auth_oi.is_data_digest() && auth_object.digest_present &&
          auth_oi.data_digest != auth_object.digest) {
        ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info());
        errorstream << pgid << " recorded data digest 0x"
                    << std::hex << auth_oi.data_digest << " != on disk 0x"
                    << auth_object.digest << std::dec << " on " << auth_oi.soid
                    << "\n";
        if (repair)
          update = FORCE;
      }
      if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
          auth_oi.omap_digest != auth_object.omap_digest) {
        ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info());
        errorstream << pgid << " recorded omap digest 0x"
                    << std::hex << auth_oi.omap_digest << " != on disk 0x"
                    << auth_object.omap_digest << std::dec
                    << " on " << auth_oi.soid << "\n";
        if (repair)
          update = FORCE;
      }

      if (update != NO) {
        utime_t age = now - auth_oi.local_mtime;
        // MAYBE updates are age-gated so recently written objects (whose
        // digest may still be settling) are left alone.
        if (update == FORCE ||
            age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
          std::optional<uint32_t> data_digest, omap_digest;
          if (auth_object.digest_present) {
            data_digest = auth_object.digest;
            dout(20) << __func__ << " will update data digest on " << *k << dendl;
          }
          if (auth_object.omap_digest_present) {
            omap_digest = auth_object.omap_digest;
            dout(20) << __func__ << " will update omap digest on " << *k << dendl;
          }
          missing_digest[*k] = make_pair(data_digest, omap_digest);
        } else {
          dout(20) << __func__ << " missing digest but age " << age
                   << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
                   << " on " << *k << dendl;
        }
      }
    }
out:
    // Roll this object's aggregated errors into the scrub counters and
    // persist the record if anything was flagged.
    if (object_error.has_deep_errors())
      ++deep_errors;
    else if (object_error.has_shallow_errors())
      ++shallow_errors;
    if (object_error.errors || object_error.union_shards.errors) {
      store->add_object_error(k->pool, object_error);
    }
  }
}
1281
1282 void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps,
1283 const set<hobject_t> &master_set,
1284 omap_stat_t& omap_stats,
1285 ostream &warnstream) const
1286 {
1287 bool needs_omap_check = false;
1288 for (const auto& map : maps) {
1289 if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) {
1290 needs_omap_check = true;
1291 break;
1292 }
1293 }
1294
1295 if (!needs_omap_check) {
1296 return; // Nothing to do
1297 }
1298
1299 // Iterate through objects and update omap stats
1300 for (const auto& k : master_set) {
1301 for (const auto& map : maps) {
1302 if (map.first != get_parent()->primary_shard()) {
1303 // Only set omap stats for the primary
1304 continue;
1305 }
1306 auto it = map.second->objects.find(k);
1307 if (it == map.second->objects.end())
1308 continue;
1309 ScrubMap::object& obj = it->second;
1310 omap_stats.omap_bytes += obj.object_omap_bytes;
1311 omap_stats.omap_keys += obj.object_omap_keys;
1312 if (obj.large_omap_object_found) {
1313 pg_t pg;
1314 auto osdmap = get_osdmap();
1315 osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg);
1316 pg_t mpg = osdmap->raw_pg_to_pg(pg);
1317 omap_stats.large_omap_objects++;
1318 warnstream << "Large omap object found. Object: " << k
1319 << " PG: " << pg << " (" << mpg << ")"
1320 << " Key count: " << obj.large_omap_object_key_count
1321 << " Size (bytes): " << obj.large_omap_object_value_size
1322 << '\n';
1323 break;
1324 }
1325 }
1326 }
1327 }