ceph/src/osd/PGBackend.cc (Ceph Pacific 16.2.2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
25 #include "OSD.h"
26 #include "erasure-code/ErasureCodePlugin.h"
27 #include "OSDMap.h"
28 #include "PGLog.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
32
33 using std::list;
34 using std::make_pair;
35 using std::map;
36 using std::ostream;
37 using std::ostringstream;
38 using std::pair;
39 using std::set;
40 using std::string;
41 using std::stringstream;
42 using std::vector;
43
44 using ceph::bufferlist;
45 using ceph::bufferptr;
46 using ceph::ErasureCodeProfile;
47 using ceph::ErasureCodeInterfaceRef;
48
49 #define dout_context cct
50 #define dout_subsys ceph_subsys_osd
51 #define DOUT_PREFIX_ARGS this
52 #undef dout_prefix
53 #define dout_prefix _prefix(_dout, this)
54 static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
55 return pgb->get_parent()->gen_dbg_prefix(*_dout);
56 }
57
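// Queue deletes for every acting/backfill shard that is still missing this
// object; the actual MOSDPGRecoveryDelete messages are built and sent later
// by send_recovery_deletes().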
58 void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
59 RecoveryHandle *h)
60 {
61 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
62 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
63 if (shard == get_parent()->whoami_shard())
64 continue;
65 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
66 dout(20) << __func__ << " will remove " << oid << " " << v << " from "
67 << shard << dendl;
68 h->deletes[shard].push_back(make_pair(oid, v));
69 get_parent()->begin_peer_recover(shard, oid);
70 }
71 }
72 }
73
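// Flush the per-shard delete lists gathered above. Deletes are batched into
// MOSDPGRecoveryDelete messages, each capped by osd_max_push_cost and
// osd_max_push_objects so that no single message grows unbounded.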
74 void PGBackend::send_recovery_deletes(int prio,
75 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
76 {
77 epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
78 for (const auto& p : deletes) {
79 const auto& shard = p.first;
80 const auto& objects = p.second;
81 ConnectionRef con = get_parent()->get_con_osd_cluster(
82 shard.osd,
83 get_osdmap_epoch());
84 if (!con)
85 continue;
86 auto it = objects.begin();
87 while (it != objects.end()) {
88 uint64_t cost = 0;
89 uint64_t deletes = 0;
90 spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
91 MOSDPGRecoveryDelete *msg =
92 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
93 target_pg,
94 get_osdmap_epoch(),
95 min_epoch);
96 msg->set_priority(prio);
97
98 while (it != objects.end() &&
99 cost < cct->_conf->osd_max_push_cost &&
100 deletes < cct->_conf->osd_max_push_objects) {
101 dout(20) << __func__ << ": sending recovery delete " << it->first
102 << " " << it->second << " to osd." << shard << dendl;
103 msg->objects.push_back(*it);
104 cost += cct->_conf->osd_push_per_object_cost;
105 ++deletes;
106 ++it;
107 }
108
109 msg->set_cost(cost);
110 get_parent()->send_message_osd_cluster(msg, con);
111 }
112 }
113 }
114
115 bool PGBackend::handle_message(OpRequestRef op)
116 {
117 switch (op->get_req()->get_type()) {
118 case MSG_OSD_PG_RECOVERY_DELETE:
119 handle_recovery_delete(op);
120 return true;
121
122 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
123 handle_recovery_delete_reply(op);
124 return true;
125
126 default:
127 break;
128 }
129
130 return _handle_message(op);
131 }
132
133 void PGBackend::handle_recovery_delete(OpRequestRef op)
134 {
135 auto m = op->get_req<MOSDPGRecoveryDelete>();
136 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
137 dout(20) << __func__ << " " << op << dendl;
138
139 op->mark_started();
140
141 C_GatherBuilder gather(cct);
142 for (const auto &p : m->objects) {
143 get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
144 }
145
146 auto reply = make_message<MOSDPGRecoveryDeleteReply>();
147 reply->from = get_parent()->whoami_shard();
148 reply->set_priority(m->get_priority());
149 reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
150 reply->map_epoch = m->map_epoch;
151 reply->min_epoch = m->min_epoch;
152 reply->objects = m->objects;
153 ConnectionRef conn = m->get_connection();
154
155 gather.set_finisher(new LambdaContext(
156 [=](int r) {
157 if (r != -EAGAIN) {
158 get_parent()->send_message_osd_cluster(reply, conn.get());
159 } else {
160 reply->put();
161 }
162 }));
163 gather.activate();
164 }
165
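// A peer has finished removing the listed objects. Record the per-peer
// recovery, and once no acting/backfill shard (including this OSD) is still
// missing an object, declare global recovery complete for it.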
166 void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
167 {
168 auto m = op->get_req<MOSDPGRecoveryDeleteReply>();
169 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
170 dout(20) << __func__ << " " << op << dendl;
171
172 for (const auto &p : m->objects) {
173 ObjectRecoveryInfo recovery_info;
174 hobject_t oid = p.first;
175 recovery_info.version = p.second;
176 get_parent()->on_peer_recover(m->from, oid, recovery_info);
177 bool peers_recovered = true;
178 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
179 if (shard == get_parent()->whoami_shard())
180 continue;
181 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
182 dout(20) << __func__ << " " << oid << " still missing on at least "
183 << shard << dendl;
184 peers_recovered = false;
185 break;
186 }
187 }
188 if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
189 dout(20) << __func__ << " completed recovery, local_missing = "
190 << get_parent()->get_local_missing() << dendl;
191 object_stat_sum_t stat_diff;
192 stat_diff.num_objects_recovered = 1;
193 get_parent()->on_global_recover(p.first, stat_diff, true);
194 }
195 }
196 }
197
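// Translate a log entry's ObjectModDesc into a transaction that undoes it.
// Each visitor callback builds its piece in a temporary transaction and then
// prepends it to what has been accumulated so far via the append()/swap()
// pair, so ops are replayed in reverse of the order they were recorded.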
198 void PGBackend::rollback(
199 const pg_log_entry_t &entry,
200 ObjectStore::Transaction *t)
201 {
202
203 struct RollbackVisitor : public ObjectModDesc::Visitor {
204 const hobject_t &hoid;
205 PGBackend *pg;
206 ObjectStore::Transaction t;
207 RollbackVisitor(
208 const hobject_t &hoid,
209 PGBackend *pg) : hoid(hoid), pg(pg) {}
210 void append(uint64_t old_size) override {
211 ObjectStore::Transaction temp;
212 pg->rollback_append(hoid, old_size, &temp);
213 temp.append(t);
214 temp.swap(t);
215 }
216 void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
217 ObjectStore::Transaction temp;
218 pg->rollback_setattrs(hoid, attrs, &temp);
219 temp.append(t);
220 temp.swap(t);
221 }
222 void rmobject(version_t old_version) override {
223 ObjectStore::Transaction temp;
224 pg->rollback_stash(hoid, old_version, &temp);
225 temp.append(t);
226 temp.swap(t);
227 }
228 void try_rmobject(version_t old_version) override {
229 ObjectStore::Transaction temp;
230 pg->rollback_try_stash(hoid, old_version, &temp);
231 temp.append(t);
232 temp.swap(t);
233 }
234 void create() override {
235 ObjectStore::Transaction temp;
236 pg->rollback_create(hoid, &temp);
237 temp.append(t);
238 temp.swap(t);
239 }
240 void update_snaps(const set<snapid_t> &snaps) override {
241 ObjectStore::Transaction temp;
242 pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
243 temp.append(t);
244 temp.swap(t);
245 }
246 void rollback_extents(
247 version_t gen,
248 const vector<pair<uint64_t, uint64_t> > &extents) override {
249 ObjectStore::Transaction temp;
250 pg->rollback_extents(gen, extents, hoid, &temp);
251 temp.append(t);
252 temp.swap(t);
253 }
254 };
255
256 ceph_assert(entry.mod_desc.can_rollback());
257 RollbackVisitor vis(entry.soid, this);
258 entry.mod_desc.visit(&vis);
259 t->append(vis.t);
260 }
261
262 struct Trimmer : public ObjectModDesc::Visitor {
263 const hobject_t &soid;
264 PGBackend *pg;
265 ObjectStore::Transaction *t;
266 Trimmer(
267 const hobject_t &soid,
268 PGBackend *pg,
269 ObjectStore::Transaction *t)
270 : soid(soid), pg(pg), t(t) {}
271 void rmobject(version_t old_version) override {
272 pg->trim_rollback_object(
273 soid,
274 old_version,
275 t);
276 }
277 // try_rmobject defaults to rmobject
278 void rollback_extents(
279 version_t gen,
280 const vector<pair<uint64_t, uint64_t> > &extents) override {
281 pg->trim_rollback_object(
282 soid,
283 gen,
284 t);
285 }
286 };
287
288 void PGBackend::rollforward(
289 const pg_log_entry_t &entry,
290 ObjectStore::Transaction *t)
291 {
292 auto dpp = get_parent()->get_dpp();
293 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
294 if (!entry.can_rollback())
295 return;
296 Trimmer trimmer(entry.soid, this, t);
297 entry.mod_desc.visit(&trimmer);
298 }
299
300 void PGBackend::trim(
301 const pg_log_entry_t &entry,
302 ObjectStore::Transaction *t)
303 {
304 if (!entry.can_rollback())
305 return;
306 Trimmer trimmer(entry.soid, this, t);
307 entry.mod_desc.visit(&trimmer);
308 }
309
310 void PGBackend::try_stash(
311 const hobject_t &hoid,
312 version_t v,
313 ObjectStore::Transaction *t)
314 {
315 t->try_rename(
316 coll,
317 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
318 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
319 }
320
321 void PGBackend::remove(
322 const hobject_t &hoid,
323 ObjectStore::Transaction *t) {
324 ceph_assert(!hoid.is_temp());
325 t->remove(
326 coll,
327 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
328 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
329 }
330
331 void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
332 {
333 dout(10) << __func__ << dendl;
334 // clear temp
335 for (set<hobject_t>::iterator i = temp_contents.begin();
336 i != temp_contents.end();
337 ++i) {
338 dout(10) << __func__ << ": Removing oid "
339 << *i << " from the temp collection" << dendl;
340 t->remove(
341 coll,
342 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
343 }
344 temp_contents.clear();
345 }
346
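// List between min and max objects starting at begin, skipping pgmeta, temp,
// and rollback-generation entries; on success *next is set to the point at
// which the following call should resume.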
347 int PGBackend::objects_list_partial(
348 const hobject_t &begin,
349 int min,
350 int max,
351 vector<hobject_t> *ls,
352 hobject_t *next)
353 {
354 ceph_assert(ls);
355 // Starts with the smallest generation to make sure the result list
356 // has the marker object (it might have multiple generations
357 // though, which would be filtered).
358 ghobject_t _next;
359 if (!begin.is_min())
360 _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
361 ls->reserve(max);
362 int r = 0;
363
364 if (min > max)
365 min = max;
366
367 while (!_next.is_max() && ls->size() < (unsigned)min) {
368 vector<ghobject_t> objects;
369 if (HAVE_FEATURE(parent->min_upacting_features(),
370 OSD_FIXED_COLLECTION_LIST)) {
371 r = store->collection_list(
372 ch,
373 _next,
374 ghobject_t::get_max(),
375 max - ls->size(),
376 &objects,
377 &_next);
378 } else {
379 r = store->collection_list_legacy(
380 ch,
381 _next,
382 ghobject_t::get_max(),
383 max - ls->size(),
384 &objects,
385 &_next);
386 }
387 if (r != 0) {
388 derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
389 break;
390 }
391 for (vector<ghobject_t>::iterator i = objects.begin();
392 i != objects.end();
393 ++i) {
394 if (i->is_pgmeta() || i->hobj.is_temp()) {
395 continue;
396 }
397 if (i->is_no_gen()) {
398 ls->push_back(i->hobj);
399 }
400 }
401 }
402 if (r == 0)
403 *next = _next.hobj;
404 return r;
405 }
406
407 int PGBackend::objects_list_range(
408 const hobject_t &start,
409 const hobject_t &end,
410 vector<hobject_t> *ls,
411 vector<ghobject_t> *gen_obs)
412 {
413 ceph_assert(ls);
414 vector<ghobject_t> objects;
415 int r;
416 if (HAVE_FEATURE(parent->min_upacting_features(),
417 OSD_FIXED_COLLECTION_LIST)) {
418 r = store->collection_list(
419 ch,
420 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
421 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
422 INT_MAX,
423 &objects,
424 NULL);
425 } else {
426 r = store->collection_list_legacy(
427 ch,
428 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
429 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
430 INT_MAX,
431 &objects,
432 NULL);
433 }
434 ls->reserve(objects.size());
435 for (vector<ghobject_t>::iterator i = objects.begin();
436 i != objects.end();
437 ++i) {
438 if (i->is_pgmeta() || i->hobj.is_temp()) {
439 continue;
440 }
441 if (i->is_no_gen()) {
442 ls->push_back(i->hobj);
443 } else if (gen_obs) {
444 gen_obs->push_back(*i);
445 }
446 }
447 return r;
448 }
449
450 int PGBackend::objects_get_attr(
451 const hobject_t &hoid,
452 const string &attr,
453 bufferlist *out)
454 {
455 bufferptr bp;
456 int r = store->getattr(
457 ch,
458 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
459 attr.c_str(),
460 bp);
461 if (r >= 0 && out) {
462 out->clear();
463 out->push_back(std::move(bp));
464 }
465 return r;
466 }
467
468 int PGBackend::objects_get_attrs(
469 const hobject_t &hoid,
470 map<string, bufferlist> *out)
471 {
472 return store->getattrs(
473 ch,
474 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
475 *out);
476 }
477
478 void PGBackend::rollback_setattrs(
479 const hobject_t &hoid,
480 map<string, std::optional<bufferlist> > &old_attrs,
481 ObjectStore::Transaction *t) {
482 map<string, bufferlist> to_set;
483 ceph_assert(!hoid.is_temp());
484 for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
485 i != old_attrs.end();
486 ++i) {
487 if (i->second) {
488 to_set[i->first] = *(i->second);
489 } else {
490 t->rmattr(
491 coll,
492 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
493 i->first);
494 }
495 }
496 t->setattrs(
497 coll,
498 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
499 to_set);
500 }
501
502 void PGBackend::rollback_append(
503 const hobject_t &hoid,
504 uint64_t old_size,
505 ObjectStore::Transaction *t) {
506 ceph_assert(!hoid.is_temp());
507 t->truncate(
508 coll,
509 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
510 old_size);
511 }
512
513 void PGBackend::rollback_stash(
514 const hobject_t &hoid,
515 version_t old_version,
516 ObjectStore::Transaction *t) {
517 ceph_assert(!hoid.is_temp());
518 t->remove(
519 coll,
520 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
521 t->collection_move_rename(
522 coll,
523 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
524 coll,
525 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
526 }
527
528 void PGBackend::rollback_try_stash(
529 const hobject_t &hoid,
530 version_t old_version,
531 ObjectStore::Transaction *t) {
532 ceph_assert(!hoid.is_temp());
533 t->remove(
534 coll,
535 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
536 t->try_rename(
537 coll,
538 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
539 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
540 }
541
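// Undo partial overwrites: copy each saved extent back from the rollback
// generation object onto the non-generation object, then remove the
// generation object.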
542 void PGBackend::rollback_extents(
543 version_t gen,
544 const vector<pair<uint64_t, uint64_t> > &extents,
545 const hobject_t &hoid,
546 ObjectStore::Transaction *t) {
547 auto shard = get_parent()->whoami_shard().shard;
548 for (auto &&extent: extents) {
549 t->clone_range(
550 coll,
551 ghobject_t(hoid, gen, shard),
552 ghobject_t(hoid, ghobject_t::NO_GEN, shard),
553 extent.first,
554 extent.second,
555 extent.first);
556 }
557 t->remove(
558 coll,
559 ghobject_t(hoid, gen, shard));
560 }
561
562 void PGBackend::trim_rollback_object(
563 const hobject_t &hoid,
564 version_t old_version,
565 ObjectStore::Transaction *t) {
566 ceph_assert(!hoid.is_temp());
567 t->remove(
568 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
569 }
570
571 PGBackend *PGBackend::build_pg_backend(
572 const pg_pool_t &pool,
573 const map<string,string>& profile,
574 Listener *l,
575 coll_t coll,
576 ObjectStore::CollectionHandle &ch,
577 ObjectStore *store,
578 CephContext *cct)
579 {
580 ErasureCodeProfile ec_profile = profile;
581 switch (pool.type) {
582 case pg_pool_t::TYPE_REPLICATED: {
583 return new ReplicatedBackend(l, coll, ch, store, cct);
584 }
585 case pg_pool_t::TYPE_ERASURE: {
586 ErasureCodeInterfaceRef ec_impl;
587 stringstream ss;
588 ceph::ErasureCodePluginRegistry::instance().factory(
589 profile.find("plugin")->second,
590 cct->_conf.get_val<std::string>("erasure_code_dir"),
591 ec_profile,
592 &ec_impl,
593 &ss);
594 ceph_assert(ec_impl);
595 return new ECBackend(
596 l,
597 coll,
598 ch,
599 store,
600 cct,
601 ec_impl,
602 pool.stripe_width);
603 }
604 default:
605 ceph_abort();
606 return NULL;
607 }
608 }
609
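// Scrub one object at the current ScrubMapBuilder position: stat it, read
// its attrs, and (for deep scrub) hash its data/omap via be_deep_scrub().
// -ENOENT objects are skipped, -EIO is recorded as a stat_error, and
// -EINPROGRESS means the deep scrub must be resumed later.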
610 int PGBackend::be_scan_list(
611 ScrubMap &map,
612 ScrubMapBuilder &pos)
613 {
614 dout(10) << __func__ << " " << pos << dendl;
615 ceph_assert(!pos.done());
616 ceph_assert(pos.pos < pos.ls.size());
617 hobject_t& poid = pos.ls[pos.pos];
618
619 struct stat st;
620 int r = store->stat(
621 ch,
622 ghobject_t(
623 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
624 &st,
625 true);
626 if (r == 0) {
627 ScrubMap::object &o = map.objects[poid];
628 o.size = st.st_size;
629 ceph_assert(!o.negative);
630 store->getattrs(
631 ch,
632 ghobject_t(
633 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
634 o.attrs);
635
636 if (pos.deep) {
637 r = be_deep_scrub(poid, map, pos, o);
638 }
639 dout(25) << __func__ << " " << poid << dendl;
640 } else if (r == -ENOENT) {
641 dout(25) << __func__ << " " << poid << " got " << r
642 << ", skipping" << dendl;
643 } else if (r == -EIO) {
644 dout(25) << __func__ << " " << poid << " got " << r
645 << ", stat_error" << dendl;
646 ScrubMap::object &o = map.objects[poid];
647 o.stat_error = true;
648 } else {
649 derr << __func__ << " got: " << cpp_strerror(r) << dendl;
650 ceph_abort();
651 }
652 if (r == -EINPROGRESS) {
653 return -EINPROGRESS;
654 }
655 pos.next_object();
656 return 0;
657 }
658
659 bool PGBackend::be_compare_scrub_objects(
660 pg_shard_t auth_shard,
661 const ScrubMap::object &auth,
662 const object_info_t& auth_oi,
663 const ScrubMap::object &candidate,
664 shard_info_wrapper &shard_result,
665 inconsistent_obj_wrapper &obj_result,
666 ostream &errorstream,
667 bool has_snapset)
668 {
669 enum { CLEAN, FOUND_ERROR } error = CLEAN;
670 if (auth.digest_present && candidate.digest_present) {
671 if (auth.digest != candidate.digest) {
672 if (error != CLEAN)
673 errorstream << ", ";
674 error = FOUND_ERROR;
675 errorstream << "data_digest 0x" << std::hex << candidate.digest
676 << " != data_digest 0x" << auth.digest << std::dec
677 << " from shard " << auth_shard;
678 obj_result.set_data_digest_mismatch();
679 }
680 }
681 if (auth.omap_digest_present && candidate.omap_digest_present) {
682 if (auth.omap_digest != candidate.omap_digest) {
683 if (error != CLEAN)
684 errorstream << ", ";
685 error = FOUND_ERROR;
686 errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
687 << " != omap_digest 0x" << auth.omap_digest << std::dec
688 << " from shard " << auth_shard;
689 obj_result.set_omap_digest_mismatch();
690 }
691 }
692 if (parent->get_pool().is_replicated()) {
693 if (auth_oi.is_data_digest() && candidate.digest_present) {
694 if (auth_oi.data_digest != candidate.digest) {
695 if (error != CLEAN)
696 errorstream << ", ";
697 error = FOUND_ERROR;
698 errorstream << "data_digest 0x" << std::hex << candidate.digest
699 << " != data_digest 0x" << auth_oi.data_digest << std::dec
700 << " from auth oi " << auth_oi;
701 shard_result.set_data_digest_mismatch_info();
702 }
703 }
704 if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
705 if (auth_oi.omap_digest != candidate.omap_digest) {
706 if (error != CLEAN)
707 errorstream << ", ";
708 error = FOUND_ERROR;
709 errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
710 << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
711 << " from auth oi " << auth_oi;
712 shard_result.set_omap_digest_mismatch_info();
713 }
714 }
715 }
716 if (candidate.stat_error)
717 return error == FOUND_ERROR;
718 if (!shard_result.has_info_missing()
719 && !shard_result.has_info_corrupted()) {
720 bufferlist can_bl, auth_bl;
721 auto can_attr = candidate.attrs.find(OI_ATTR);
722 auto auth_attr = auth.attrs.find(OI_ATTR);
723
724 ceph_assert(auth_attr != auth.attrs.end());
725 ceph_assert(can_attr != candidate.attrs.end());
726
727 can_bl.push_back(can_attr->second);
728 auth_bl.push_back(auth_attr->second);
729 if (!can_bl.contents_equal(auth_bl)) {
730 if (error != CLEAN)
731 errorstream << ", ";
732 error = FOUND_ERROR;
733 obj_result.set_object_info_inconsistency();
734 errorstream << "object info inconsistent ";
735 }
736 }
737 if (has_snapset) {
738 if (!shard_result.has_snapset_missing()
739 && !shard_result.has_snapset_corrupted()) {
740 bufferlist can_bl, auth_bl;
741 auto can_attr = candidate.attrs.find(SS_ATTR);
742 auto auth_attr = auth.attrs.find(SS_ATTR);
743
744 ceph_assert(auth_attr != auth.attrs.end());
745 ceph_assert(can_attr != candidate.attrs.end());
746
747 can_bl.push_back(can_attr->second);
748 auth_bl.push_back(auth_attr->second);
749 if (!can_bl.contents_equal(auth_bl)) {
750 if (error != CLEAN)
751 errorstream << ", ";
752 error = FOUND_ERROR;
753 obj_result.set_snapset_inconsistency();
754 errorstream << "snapset inconsistent ";
755 }
756 }
757 }
758 if (parent->get_pool().is_erasure()) {
759 if (!shard_result.has_hinfo_missing()
760 && !shard_result.has_hinfo_corrupted()) {
761 bufferlist can_bl, auth_bl;
762 auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
763 auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());
764
765 ceph_assert(auth_hi != auth.attrs.end());
766 ceph_assert(can_hi != candidate.attrs.end());
767
768 can_bl.push_back(can_hi->second);
769 auth_bl.push_back(auth_hi->second);
770 if (!can_bl.contents_equal(auth_bl)) {
771 if (error != CLEAN)
772 errorstream << ", ";
773 error = FOUND_ERROR;
774 obj_result.set_hinfo_inconsistency();
775 errorstream << "hinfo inconsistent ";
776 }
777 }
778 }
779 uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
780 if (oi_size != candidate.size) {
781 if (error != CLEAN)
782 errorstream << ", ";
783 error = FOUND_ERROR;
784 errorstream << "size " << candidate.size
785 << " != size " << oi_size
786 << " from auth oi " << auth_oi;
787 shard_result.set_size_mismatch_info();
788 }
789 if (auth.size != candidate.size) {
790 if (error != CLEAN)
791 errorstream << ", ";
792 error = FOUND_ERROR;
793 errorstream << "size " << candidate.size
794 << " != size " << auth.size
795 << " from shard " << auth_shard;
796 obj_result.set_size_mismatch();
797 }
798 // If the replica is too large and we didn't already count it for this
799 // object, flag a size_too_large error.
800 if (candidate.size > cct->_conf->osd_max_object_size
801 && !obj_result.has_size_too_large()) {
802 if (error != CLEAN)
803 errorstream << ", ";
804 error = FOUND_ERROR;
805 errorstream << "size " << candidate.size
806 << " > " << cct->_conf->osd_max_object_size
807 << " is too large";
808 obj_result.set_size_too_large();
809 }
810 for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
811 i != auth.attrs.end();
812 ++i) {
813 // We check system keys separately
814 if (i->first == OI_ATTR || i->first[0] != '_')
815 continue;
816 if (!candidate.attrs.count(i->first)) {
817 if (error != CLEAN)
818 errorstream << ", ";
819 error = FOUND_ERROR;
820 errorstream << "attr name mismatch '" << i->first << "'";
821 obj_result.set_attr_name_mismatch();
822 } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
823 if (error != CLEAN)
824 errorstream << ", ";
825 error = FOUND_ERROR;
826 errorstream << "attr value mismatch '" << i->first << "'";
827 obj_result.set_attr_value_mismatch();
828 }
829 }
830 for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
831 i != candidate.attrs.end();
832 ++i) {
833 // We check system keys separately
834 if (i->first == OI_ATTR || i->first[0] != '_')
835 continue;
836 if (!auth.attrs.count(i->first)) {
837 if (error != CLEAN)
838 errorstream << ", ";
839 error = FOUND_ERROR;
840 errorstream << "attr name mismatch '" << i->first << "'";
841 obj_result.set_attr_name_mismatch();
842 }
843 }
844 return error == FOUND_ERROR;
845 }
846
847 static int dcount(const object_info_t &oi)
848 {
849 int count = 0;
850 if (oi.is_data_digest())
851 count++;
852 if (oi.is_omap_digest())
853 count++;
854 return count;
855 }
856
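// Pick the authoritative copy of obj among the supplied scrub maps: prefer
// the newest object_info version (with the primary first as a tie-break via
// the scan order, and more recorded digests winning at equal versions), and
// skip any shard whose copy already has recorded errors. Also computes
// digest_match, which stays true only if no computed data digests differ
// across the shards examined.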
857 map<pg_shard_t, ScrubMap *>::const_iterator
858 PGBackend::be_select_auth_object(
859 const hobject_t &obj,
860 const map<pg_shard_t,ScrubMap*> &maps,
861 object_info_t *auth_oi,
862 map<pg_shard_t, shard_info_wrapper> &shard_map,
863 bool &digest_match,
864 spg_t pgid,
865 ostream &errorstream)
866 {
867 eversion_t auth_version;
868
869 // Create the list of shards with the primary first so it will be the
870 // auth copy, all other things being equal.
871 list<pg_shard_t> shards;
872 for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
873 j != maps.end();
874 ++j) {
875 if (j->first == get_parent()->whoami_shard())
876 continue;
877 shards.push_back(j->first);
878 }
879 shards.push_front(get_parent()->whoami_shard());
880
881 map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
882 digest_match = true;
883 for (auto &l : shards) {
884 ostringstream shard_errorstream;
885 bool error = false;
886 map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
887 map<hobject_t, ScrubMap::object>::iterator i =
888 j->second->objects.find(obj);
889 if (i == j->second->objects.end()) {
890 continue;
891 }
892 auto& shard_info = shard_map[j->first];
893 if (j->first == get_parent()->whoami_shard())
894 shard_info.primary = true;
895 if (i->second.read_error) {
896 shard_info.set_read_error();
897 if (error)
898 shard_errorstream << ", ";
899 error = true;
900 shard_errorstream << "candidate had a read error";
901 }
902 if (i->second.ec_hash_mismatch) {
903 shard_info.set_ec_hash_mismatch();
904 if (error)
905 shard_errorstream << ", ";
906 error = true;
907 shard_errorstream << "candidate had an ec hash mismatch";
908 }
909 if (i->second.ec_size_mismatch) {
910 shard_info.set_ec_size_mismatch();
911 if (error)
912 shard_errorstream << ", ";
913 error = true;
914 shard_errorstream << "candidate had an ec size mismatch";
915 }
916
917 object_info_t oi;
918 bufferlist bl;
919 map<string, bufferptr>::iterator k;
920 SnapSet ss;
921 bufferlist ss_bl, hk_bl;
922
923 if (i->second.stat_error) {
924 shard_info.set_stat_error();
925 if (error)
926 shard_errorstream << ", ";
927 error = true;
928 shard_errorstream << "candidate had a stat error";
929 // With stat_error there is no further checking;
930 // we don't need to also report a missing_object_info_attr
931 goto out;
932 }
933
934 // We won't pick an auth copy if the snapset is missing or won't decode.
935 ceph_assert(!obj.is_snapdir());
936 if (obj.is_head()) {
937 k = i->second.attrs.find(SS_ATTR);
938 if (k == i->second.attrs.end()) {
939 shard_info.set_snapset_missing();
940 if (error)
941 shard_errorstream << ", ";
942 error = true;
943 shard_errorstream << "candidate had a missing snapset key";
944 } else {
945 ss_bl.push_back(k->second);
946 try {
947 auto bliter = ss_bl.cbegin();
948 decode(ss, bliter);
949 } catch (...) {
950 // invalid snapset, probably corrupt
951 shard_info.set_snapset_corrupted();
952 if (error)
953 shard_errorstream << ", ";
954 error = true;
955 shard_errorstream << "candidate had a corrupt snapset";
956 }
957 }
958 }
959
960 if (parent->get_pool().is_erasure()) {
961 ECUtil::HashInfo hi;
962 k = i->second.attrs.find(ECUtil::get_hinfo_key());
963 if (k == i->second.attrs.end()) {
964 shard_info.set_hinfo_missing();
965 if (error)
966 shard_errorstream << ", ";
967 error = true;
968 shard_errorstream << "candidate had a missing hinfo key";
969 } else {
970 hk_bl.push_back(k->second);
971 try {
972 auto bliter = hk_bl.cbegin();
973 decode(hi, bliter);
974 } catch (...) {
975 // invalid hinfo, probably corrupt
976 shard_info.set_hinfo_corrupted();
977 if (error)
978 shard_errorstream << ", ";
979 error = true;
980 shard_errorstream << "candidate had a corrupt hinfo";
981 }
982 }
983 }
984
985 k = i->second.attrs.find(OI_ATTR);
986 if (k == i->second.attrs.end()) {
987 // no object info on object, probably corrupt
988 shard_info.set_info_missing();
989 if (error)
990 shard_errorstream << ", ";
991 error = true;
992 shard_errorstream << "candidate had a missing info key";
993 goto out;
994 }
995 bl.push_back(k->second);
996 try {
997 auto bliter = bl.cbegin();
998 decode(oi, bliter);
999 } catch (...) {
1000 // invalid object info, probably corrupt
1001 shard_info.set_info_corrupted();
1002 if (error)
1003 shard_errorstream << ", ";
1004 error = true;
1005 shard_errorstream << "candidate had a corrupt info";
1006 goto out;
1007 }
1008
1009 // This is automatically corrected in PG::_repair_oinfo_oid()
1010 ceph_assert(oi.soid == obj);
1011
1012 if (i->second.size != be_get_ondisk_size(oi.size)) {
1013 shard_info.set_obj_size_info_mismatch();
1014 if (error)
1015 shard_errorstream << ", ";
1016 error = true;
1017 shard_errorstream << "candidate size " << i->second.size << " info size "
1018 << oi.size << " mismatch";
1019 }
1020
1021 // digest_match will only be true if computed digests are the same
1022 if (auth_version != eversion_t()
1023 && auth->second->objects[obj].digest_present
1024 && i->second.digest_present
1025 && auth->second->objects[obj].digest != i->second.digest) {
1026 digest_match = false;
1027 dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
1028 << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
1029 << dendl;
1030 }
1031
1032 // Don't use this particular shard due to previous errors
1033 // XXX: For now we can't pick one shard for repair and another's object info or snapset
1034 if (shard_info.errors)
1035 goto out;
1036
1037 if (auth_version == eversion_t() || oi.version > auth_version ||
1038 (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
1039 auth = j;
1040 *auth_oi = oi;
1041 auth_version = oi.version;
1042 }
1043
1044 out:
1045 if (error)
1046 errorstream << pgid.pgid << " shard " << l << " soid " << obj
1047 << " : " << shard_errorstream.str() << "\n";
1048 // Keep scanning other shards
1049 }
1050 dout(10) << __func__ << ": selecting osd " << auth->first
1051 << " for obj " << obj
1052 << " with oi " << *auth_oi
1053 << dendl;
1054 return auth;
1055 }
1056
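// Cross-check every object in master_set across all shard scrub maps:
// choose an authoritative copy, diff each replica against it, accumulate
// the missing/inconsistent/authoritative sets used for repair, and queue
// digest updates in missing_digest where the recorded object_info digests
// are stale or absent.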
1057 void PGBackend::be_compare_scrubmaps(
1058 const map<pg_shard_t,ScrubMap*> &maps,
1059 const set<hobject_t> &master_set,
1060 bool repair,
1061 map<hobject_t, set<pg_shard_t>> &missing,
1062 map<hobject_t, set<pg_shard_t>> &inconsistent,
1063 map<hobject_t, list<pg_shard_t>> &authoritative,
1064 map<hobject_t, pair<std::optional<uint32_t>,
1065 std::optional<uint32_t>>> &missing_digest,
1066 int &shallow_errors, int &deep_errors,
1067 Scrub::Store *store,
1068 const spg_t& pgid,
1069 const vector<int> &acting,
1070 ostream &errorstream)
1071 {
1072 utime_t now = ceph_clock_now();
1073
1074 // Check maps against master set and each other
1075 for (set<hobject_t>::const_iterator k = master_set.begin();
1076 k != master_set.end();
1077 ++k) {
1078 object_info_t auth_oi;
1079 map<pg_shard_t, shard_info_wrapper> shard_map;
1080
1081 inconsistent_obj_wrapper object_error{*k};
1082
1083 bool digest_match;
1084 map<pg_shard_t, ScrubMap *>::const_iterator auth =
1085 be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match,
1086 pgid, errorstream);
1087
1088 list<pg_shard_t> auth_list;
1089 set<pg_shard_t> object_errors;
1090 if (auth == maps.end()) {
1091 object_error.set_version(0);
1092 object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
1093 deep_errors, get_parent()->whoami_shard());
1094 if (object_error.has_deep_errors())
1095 ++deep_errors;
1096 else if (object_error.has_shallow_errors())
1097 ++shallow_errors;
1098 store->add_object_error(k->pool, object_error);
1099 errorstream << pgid.pgid << " soid " << *k
1100 << " : failed to pick suitable object info\n";
1101 continue;
1102 }
1103 object_error.set_version(auth_oi.user_version);
1104 ScrubMap::object& auth_object = auth->second->objects[*k];
1105 set<pg_shard_t> cur_missing;
1106 set<pg_shard_t> cur_inconsistent;
1107 bool fix_digest = false;
1108
1109 for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
1110 if (j == auth)
1111 shard_map[auth->first].selected_oi = true;
1112 if (j->second->objects.count(*k)) {
1113 shard_map[j->first].set_object(j->second->objects[*k]);
1114 // Compare
1115 stringstream ss;
1116 bool found = be_compare_scrub_objects(auth->first,
1117 auth_object,
1118 auth_oi,
1119 j->second->objects[*k],
1120 shard_map[j->first],
1121 object_error,
1122 ss,
1123 k->has_snapset());
1124
1125 dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
1126 << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
1127 << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
1128 << dendl;
1129 // If all replicas match but they don't match the object_info, we can
1130 // repair it via the missing_digest mechanism.
1131 if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
1132 && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
1133 && auth_object.digest_present) {
1134 // Set in missing_digests
1135 fix_digest = true;
1136 // Clear the error
1137 shard_map[j->first].clear_data_digest_mismatch_info();
1138 errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n";
1139 }
1140 // Some errors might have already been set in be_select_auth_object()
1141 if (shard_map[j->first].errors != 0) {
1142 cur_inconsistent.insert(j->first);
1143 if (shard_map[j->first].has_deep_errors())
1144 ++deep_errors;
1145 else
1146 ++shallow_errors;
1147 // Only true if be_compare_scrub_objects() found errors and put something
1148 // in ss.
1149 if (found)
1150 errorstream << pgid << " shard " << j->first << " soid " << *k
1151 << " : " << ss.str() << "\n";
1152 } else if (found) {
1153 // Track a possible shard to use as authoritative, if needed.
1154 // Errors were found, but they do not identify a specific shard.
1155 object_errors.insert(j->first);
1156 errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n";
1157 } else {
1158 // XXX: The auth shard can end up here even though we don't
1159 // know that it has the "correct" data.
1160 auth_list.push_back(j->first);
1161 }
1162 } else {
1163 cur_missing.insert(j->first);
1164 shard_map[j->first].set_missing();
1165 shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
1166 // Can't have any other errors if there is no information available
1167 ++shallow_errors;
1168 errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n";
1169 }
1170 object_error.add_shard(j->first, shard_map[j->first]);
1171 }
1172
1173 if (auth_list.empty()) {
1174 if (object_errors.empty()) {
1175 errorstream << pgid.pgid << " soid " << *k
1176 << " : failed to pick suitable auth object\n";
1177 goto out;
1178 }
1179 // Object errors exist and nothing in auth_list
1180 // Prefer the auth shard, otherwise take the first from the list.
1181 pg_shard_t shard;
1182 if (object_errors.count(auth->first)) {
1183 shard = auth->first;
1184 } else {
1185 shard = *(object_errors.begin());
1186 }
1187 auth_list.push_back(shard);
1188 object_errors.erase(shard);
1189 }
1190 // At this point auth_list is populated, so we add the object-error shards
1191 // as inconsistent.
1192 cur_inconsistent.insert(object_errors.begin(), object_errors.end());
1193 if (!cur_missing.empty()) {
1194 missing[*k] = cur_missing;
1195 }
1196 if (!cur_inconsistent.empty()) {
1197 inconsistent[*k] = cur_inconsistent;
1198 }
1199
1200 if (fix_digest) {
1201 std::optional<uint32_t> data_digest, omap_digest;
1202 ceph_assert(auth_object.digest_present);
1203 data_digest = auth_object.digest;
1204 if (auth_object.omap_digest_present) {
1205 omap_digest = auth_object.omap_digest;
1206 }
1207 missing_digest[*k] = make_pair(data_digest, omap_digest);
1208 }
1209 if (!cur_inconsistent.empty() || !cur_missing.empty()) {
1210 authoritative[*k] = auth_list;
1211 } else if (!fix_digest && parent->get_pool().is_replicated()) {
1212 enum {
1213 NO = 0,
1214 MAYBE = 1,
1215 FORCE = 2,
1216 } update = NO;
1217
1218 if (auth_object.digest_present && !auth_oi.is_data_digest()) {
1219 dout(20) << __func__ << " missing data digest on " << *k << dendl;
1220 update = MAYBE;
1221 }
1222 if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
1223 dout(20) << __func__ << " missing omap digest on " << *k << dendl;
1224 update = MAYBE;
1225 }
1226
1227 // recorded digest != actual digest?
1228 if (auth_oi.is_data_digest() && auth_object.digest_present &&
1229 auth_oi.data_digest != auth_object.digest) {
1230 ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info());
1231 errorstream << pgid << " recorded data digest 0x"
1232 << std::hex << auth_oi.data_digest << " != on disk 0x"
1233 << auth_object.digest << std::dec << " on " << auth_oi.soid
1234 << "\n";
1235 if (repair)
1236 update = FORCE;
1237 }
1238 if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
1239 auth_oi.omap_digest != auth_object.omap_digest) {
1240 ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info());
1241 errorstream << pgid << " recorded omap digest 0x"
1242 << std::hex << auth_oi.omap_digest << " != on disk 0x"
1243 << auth_object.omap_digest << std::dec
1244 << " on " << auth_oi.soid << "\n";
1245 if (repair)
1246 update = FORCE;
1247 }
1248
1249 if (update != NO) {
1250 utime_t age = now - auth_oi.local_mtime;
1251 if (update == FORCE ||
1252 age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
1253 std::optional<uint32_t> data_digest, omap_digest;
1254 if (auth_object.digest_present) {
1255 data_digest = auth_object.digest;
1256 dout(20) << __func__ << " will update data digest on " << *k << dendl;
1257 }
1258 if (auth_object.omap_digest_present) {
1259 omap_digest = auth_object.omap_digest;
1260 dout(20) << __func__ << " will update omap digest on " << *k << dendl;
1261 }
1262 missing_digest[*k] = make_pair(data_digest, omap_digest);
1263 } else {
1264 dout(20) << __func__ << " missing digest but age " << age
1265 << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
1266 << " on " << *k << dendl;
1267 }
1268 }
1269 }
1270 out:
1271 if (object_error.has_deep_errors())
1272 ++deep_errors;
1273 else if (object_error.has_shallow_errors())
1274 ++shallow_errors;
1275 if (object_error.errors || object_error.union_shards.errors) {
1276 store->add_object_error(k->pool, object_error);
1277 }
1278 }
1279 }
1280
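// Aggregate omap statistics from the primary's scrub map and emit a warning
// for every large omap object that was flagged during the scan.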
1281 void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps,
1282 const set<hobject_t> &master_set,
1283 omap_stat_t& omap_stats,
1284 ostream &warnstream) const
1285 {
1286 bool needs_omap_check = false;
1287 for (const auto& map : maps) {
1288 if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) {
1289 needs_omap_check = true;
1290 break;
1291 }
1292 }
1293
1294 if (!needs_omap_check) {
1295 return; // Nothing to do
1296 }
1297
1298 // Iterate through objects and update omap stats
1299 for (const auto& k : master_set) {
1300 for (const auto& map : maps) {
1301 if (map.first != get_parent()->primary_shard()) {
1302 // Only set omap stats for the primary
1303 continue;
1304 }
1305 auto it = map.second->objects.find(k);
1306 if (it == map.second->objects.end())
1307 continue;
1308 ScrubMap::object& obj = it->second;
1309 omap_stats.omap_bytes += obj.object_omap_bytes;
1310 omap_stats.omap_keys += obj.object_omap_keys;
1311 if (obj.large_omap_object_found) {
1312 pg_t pg;
1313 auto osdmap = get_osdmap();
1314 osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg);
1315 pg_t mpg = osdmap->raw_pg_to_pg(pg);
1316 omap_stats.large_omap_objects++;
1317 warnstream << "Large omap object found. Object: " << k
1318 << " PG: " << pg << " (" << mpg << ")"
1319 << " Key count: " << obj.large_omap_object_key_count
1320 << " Size (bytes): " << obj.large_omap_object_value_size
1321 << '\n';
1322 break;
1323 }
1324 }
1325 }
1326 }