]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / osd / PGBackend.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
25 #include "OSD.h"
26 #include "erasure-code/ErasureCodePlugin.h"
27 #include "OSDMap.h"
28 #include "PGLog.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
32
33 #define dout_context cct
34 #define dout_subsys ceph_subsys_osd
35 #define DOUT_PREFIX_ARGS this
36 #undef dout_prefix
37 #define dout_prefix _prefix(_dout, this)
static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
  // Prepend the parent PG's debug prefix to every dout line from this file.
  return pgb->get_parent()->gen_dbg_prefix(*_dout);
}
41
42 void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
43 RecoveryHandle *h)
44 {
45 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
46 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
47 if (shard == get_parent()->whoami_shard())
48 continue;
49 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
50 dout(20) << __func__ << " will remove " << oid << " " << v << " from "
51 << shard << dendl;
52 h->deletes[shard].push_back(make_pair(oid, v));
53 get_parent()->begin_peer_recover(shard, oid);
54 }
55 }
56 }
57
58 void PGBackend::send_recovery_deletes(int prio,
59 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
60 {
61 epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
62 for (const auto& p : deletes) {
63 const auto& shard = p.first;
64 const auto& objects = p.second;
65 ConnectionRef con = get_parent()->get_con_osd_cluster(
66 shard.osd,
67 get_osdmap_epoch());
68 if (!con)
69 continue;
70 auto it = objects.begin();
71 while (it != objects.end()) {
72 uint64_t cost = 0;
73 uint64_t deletes = 0;
74 spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
75 MOSDPGRecoveryDelete *msg =
76 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
77 target_pg,
78 get_osdmap_epoch(),
79 min_epoch);
80 msg->set_priority(prio);
81
82 while (it != objects.end() &&
83 cost < cct->_conf->osd_max_push_cost &&
84 deletes < cct->_conf->osd_max_push_objects) {
85 dout(20) << __func__ << ": sending recovery delete << " << it->first
86 << " " << it->second << " to osd." << shard << dendl;
87 msg->objects.push_back(*it);
88 cost += cct->_conf->osd_push_per_object_cost;
89 ++deletes;
90 ++it;
91 }
92
93 msg->set_cost(cost);
94 get_parent()->send_message_osd_cluster(msg, con);
95 }
96 }
97 }
98
99 bool PGBackend::handle_message(OpRequestRef op)
100 {
101 switch (op->get_req()->get_type()) {
102 case MSG_OSD_PG_RECOVERY_DELETE:
103 handle_recovery_delete(op);
104 return true;
105
106 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
107 handle_recovery_delete_reply(op);
108 return true;
109
110 default:
111 break;
112 }
113
114 return _handle_message(op);
115 }
116
void PGBackend::handle_recovery_delete(OpRequestRef op)
{
  // Handle a primary's request to delete objects this shard is missing.
  // The removals run asynchronously via a gather; once all sub-contexts
  // complete, the finisher acks with an MOSDPGRecoveryDeleteReply.
  auto m = op->get_req<MOSDPGRecoveryDelete>();
  ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
  dout(20) << __func__ << " " << op << dendl;

  op->mark_started();

  C_GatherBuilder gather(cct);
  for (const auto &p : m->objects) {
    // p.first is the object, p.second its version.
    get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
  }

  // Build the reply up front; ownership passes to the finisher lambda below,
  // which either sends it or drops the ref on -EAGAIN.
  MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
  reply->from = get_parent()->whoami_shard();
  reply->set_priority(m->get_priority());
  reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
  reply->map_epoch = m->map_epoch;
  reply->min_epoch = m->min_epoch;
  reply->objects = m->objects;
  // Keep a ref on the connection so it stays valid until the gather fires.
  ConnectionRef conn = m->get_connection();

  gather.set_finisher(new LambdaContext(
    [=](int r) {
      if (r != -EAGAIN) {
	get_parent()->send_message_osd_cluster(reply, conn.get());
      } else {
	// Not sending: release our ref so the message is not leaked.
	reply->put();
      }
    }));
  gather.activate();
}
149
150 void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
151 {
152 auto m = op->get_req<MOSDPGRecoveryDeleteReply>();
153 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
154 dout(20) << __func__ << " " << op << dendl;
155
156 for (const auto &p : m->objects) {
157 ObjectRecoveryInfo recovery_info;
158 hobject_t oid = p.first;
159 recovery_info.version = p.second;
160 get_parent()->on_peer_recover(m->from, oid, recovery_info);
161 bool peers_recovered = true;
162 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
163 if (shard == get_parent()->whoami_shard())
164 continue;
165 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
166 dout(20) << __func__ << " " << oid << " still missing on at least "
167 << shard << dendl;
168 peers_recovered = false;
169 break;
170 }
171 }
172 if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
173 dout(20) << __func__ << " completed recovery, local_missing = "
174 << get_parent()->get_local_missing() << dendl;
175 object_stat_sum_t stat_diff;
176 stat_diff.num_objects_recovered = 1;
177 get_parent()->on_global_recover(p.first, stat_diff, true);
178 }
179 }
180 }
181
void PGBackend::rollback(
  const pg_log_entry_t &entry,
  ObjectStore::Transaction *t)
{
  // Undo the mutations recorded in entry.mod_desc by visiting each recorded
  // op and emitting the inverse store operation.

  struct RollbackVisitor : public ObjectModDesc::Visitor {
    const hobject_t &hoid;
    PGBackend *pg;
    ObjectStore::Transaction t;
    RollbackVisitor(
      const hobject_t &hoid,
      PGBackend *pg) : hoid(hoid), pg(pg) {}
    // Every handler below builds its inverse op in a scratch transaction and
    // then does temp.append(t); temp.swap(t) — i.e. it PREPENDS the new op,
    // so ops visited later end up first in the accumulated transaction.
    void append(uint64_t old_size) override {
      ObjectStore::Transaction temp;
      pg->rollback_append(hoid, old_size, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
      ObjectStore::Transaction temp;
      pg->rollback_setattrs(hoid, attrs, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void try_rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_try_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void create() override {
      ObjectStore::Transaction temp;
      pg->rollback_create(hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void update_snaps(const set<snapid_t> &snaps) override {
      ObjectStore::Transaction temp;
      pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) override {
      ObjectStore::Transaction temp;
      pg->rollback_extents(gen, extents, hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
  };

  // Callers must only roll back entries that recorded rollback state.
  ceph_assert(entry.mod_desc.can_rollback());
  RollbackVisitor vis(entry.soid, this);
  entry.mod_desc.visit(&vis);
  t->append(vis.t);
}
245
// Visitor that deletes the stashed rollback state (old object generations)
// for a log entry that no longer needs to be rollbackable.  Used by both
// rollforward() and trim().
struct Trimmer : public ObjectModDesc::Visitor {
  const hobject_t &soid;
  PGBackend *pg;
  ObjectStore::Transaction *t;
  Trimmer(
    const hobject_t &soid,
    PGBackend *pg,
    ObjectStore::Transaction *t)
    : soid(soid), pg(pg), t(t) {}
  // Drop the object copy stashed under old_version.
  void rmobject(version_t old_version) override {
    pg->trim_rollback_object(
      soid,
      old_version,
      t);
  }
  // try_rmobject defaults to rmobject
  // Drop the generation object that held the saved extents.
  void rollback_extents(
    version_t gen,
    const vector<pair<uint64_t, uint64_t> > &extents) override {
    pg->trim_rollback_object(
      soid,
      gen,
      t);
  }
};
271
272 void PGBackend::rollforward(
273 const pg_log_entry_t &entry,
274 ObjectStore::Transaction *t)
275 {
276 auto dpp = get_parent()->get_dpp();
277 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
278 if (!entry.can_rollback())
279 return;
280 Trimmer trimmer(entry.soid, this, t);
281 entry.mod_desc.visit(&trimmer);
282 }
283
284 void PGBackend::trim(
285 const pg_log_entry_t &entry,
286 ObjectStore::Transaction *t)
287 {
288 if (!entry.can_rollback())
289 return;
290 Trimmer trimmer(entry.soid, this, t);
291 entry.mod_desc.visit(&trimmer);
292 }
293
294 void PGBackend::try_stash(
295 const hobject_t &hoid,
296 version_t v,
297 ObjectStore::Transaction *t)
298 {
299 t->try_rename(
300 coll,
301 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
302 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
303 }
304
305 void PGBackend::remove(
306 const hobject_t &hoid,
307 ObjectStore::Transaction *t) {
308 ceph_assert(!hoid.is_temp());
309 t->remove(
310 coll,
311 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
312 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
313 }
314
315 void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
316 {
317 dout(10) << __func__ << dendl;
318 // clear temp
319 for (set<hobject_t>::iterator i = temp_contents.begin();
320 i != temp_contents.end();
321 ++i) {
322 dout(10) << __func__ << ": Removing oid "
323 << *i << " from the temp collection" << dendl;
324 t->remove(
325 coll,
326 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
327 }
328 temp_contents.clear();
329 }
330
int PGBackend::objects_list_partial(
  const hobject_t &begin,
  int min,
  int max,
  vector<hobject_t> *ls,
  hobject_t *next)
{
  // List at least min (at most max) current objects starting at begin,
  // appending to *ls and publishing the continuation cursor in *next.
  // pgmeta, temp, and generation (rollback) objects are filtered out.
  // Returns 0 on success or the store's error code.
  ceph_assert(ls);
  // Starts with the smallest generation to make sure the result list
  // has the marker object (it might have multiple generations
  // though, which would be filtered).
  ghobject_t _next;
  if (!begin.is_min())
    _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
  ls->reserve(max);
  int r = 0;

  if (min > max)
    min = max;

  // Filtering can shrink each batch below what collection_list returned,
  // so keep listing until we have min results or hit the end marker.
  while (!_next.is_max() && ls->size() < (unsigned)min) {
    vector<ghobject_t> objects;
    r = store->collection_list(
      ch,
      _next,
      ghobject_t::get_max(),
      max - ls->size(),
      &objects,
      &_next);
    if (r != 0) {
      derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
      break;
    }
    for (vector<ghobject_t>::iterator i = objects.begin();
	 i != objects.end();
	 ++i) {
      if (i->is_pgmeta() || i->hobj.is_temp()) {
	continue;
      }
      if (i->is_no_gen()) {
	ls->push_back(i->hobj);
      }
    }
  }
  // Only publish the cursor on success; on error *next is left untouched.
  if (r == 0)
    *next = _next.hobj;
  return r;
}
379
380 int PGBackend::objects_list_range(
381 const hobject_t &start,
382 const hobject_t &end,
383 vector<hobject_t> *ls,
384 vector<ghobject_t> *gen_obs)
385 {
386 ceph_assert(ls);
387 vector<ghobject_t> objects;
388 int r = store->collection_list(
389 ch,
390 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
391 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
392 INT_MAX,
393 &objects,
394 NULL);
395 ls->reserve(objects.size());
396 for (vector<ghobject_t>::iterator i = objects.begin();
397 i != objects.end();
398 ++i) {
399 if (i->is_pgmeta() || i->hobj.is_temp()) {
400 continue;
401 }
402 if (i->is_no_gen()) {
403 ls->push_back(i->hobj);
404 } else if (gen_obs) {
405 gen_obs->push_back(*i);
406 }
407 }
408 return r;
409 }
410
411 int PGBackend::objects_get_attr(
412 const hobject_t &hoid,
413 const string &attr,
414 bufferlist *out)
415 {
416 bufferptr bp;
417 int r = store->getattr(
418 ch,
419 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
420 attr.c_str(),
421 bp);
422 if (r >= 0 && out) {
423 out->clear();
424 out->push_back(std::move(bp));
425 }
426 return r;
427 }
428
int PGBackend::objects_get_attrs(
  const hobject_t &hoid,
  map<string, bufferlist> *out)
{
  // Fetch all xattrs for hoid into *out; returns the store's error code.
  return store->getattrs(
    ch,
    ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    *out);
}
438
439 void PGBackend::rollback_setattrs(
440 const hobject_t &hoid,
441 map<string, std::optional<bufferlist> > &old_attrs,
442 ObjectStore::Transaction *t) {
443 map<string, bufferlist> to_set;
444 ceph_assert(!hoid.is_temp());
445 for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
446 i != old_attrs.end();
447 ++i) {
448 if (i->second) {
449 to_set[i->first] = *(i->second);
450 } else {
451 t->rmattr(
452 coll,
453 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
454 i->first);
455 }
456 }
457 t->setattrs(
458 coll,
459 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
460 to_set);
461 }
462
463 void PGBackend::rollback_append(
464 const hobject_t &hoid,
465 uint64_t old_size,
466 ObjectStore::Transaction *t) {
467 ceph_assert(!hoid.is_temp());
468 t->truncate(
469 coll,
470 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
471 old_size);
472 }
473
474 void PGBackend::rollback_stash(
475 const hobject_t &hoid,
476 version_t old_version,
477 ObjectStore::Transaction *t) {
478 ceph_assert(!hoid.is_temp());
479 t->remove(
480 coll,
481 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
482 t->collection_move_rename(
483 coll,
484 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
485 coll,
486 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
487 }
488
489 void PGBackend::rollback_try_stash(
490 const hobject_t &hoid,
491 version_t old_version,
492 ObjectStore::Transaction *t) {
493 ceph_assert(!hoid.is_temp());
494 t->remove(
495 coll,
496 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
497 t->try_rename(
498 coll,
499 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
500 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
501 }
502
void PGBackend::rollback_extents(
  version_t gen,
  const vector<pair<uint64_t, uint64_t> > &extents,
  const hobject_t &hoid,
  ObjectStore::Transaction *t) {
  // Restore the saved (offset, length) extents from the gen object back onto
  // the head object, then drop the gen object.
  auto shard = get_parent()->whoami_shard().shard;
  for (auto &&extent: extents) {
    // extent.first is the offset, extent.second the length; the data is
    // cloned back to the same offset it came from.
    t->clone_range(
      coll,
      ghobject_t(hoid, gen, shard),
      ghobject_t(hoid, ghobject_t::NO_GEN, shard),
      extent.first,
      extent.second,
      extent.first);
  }
  t->remove(
    coll,
    ghobject_t(hoid, gen, shard));
}
522
523 void PGBackend::trim_rollback_object(
524 const hobject_t &hoid,
525 version_t old_version,
526 ObjectStore::Transaction *t) {
527 ceph_assert(!hoid.is_temp());
528 t->remove(
529 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
530 }
531
532 PGBackend *PGBackend::build_pg_backend(
533 const pg_pool_t &pool,
534 const map<string,string>& profile,
535 Listener *l,
536 coll_t coll,
537 ObjectStore::CollectionHandle &ch,
538 ObjectStore *store,
539 CephContext *cct)
540 {
541 ErasureCodeProfile ec_profile = profile;
542 switch (pool.type) {
543 case pg_pool_t::TYPE_REPLICATED: {
544 return new ReplicatedBackend(l, coll, ch, store, cct);
545 }
546 case pg_pool_t::TYPE_ERASURE: {
547 ErasureCodeInterfaceRef ec_impl;
548 stringstream ss;
549 ceph::ErasureCodePluginRegistry::instance().factory(
550 profile.find("plugin")->second,
551 cct->_conf.get_val<std::string>("erasure_code_dir"),
552 ec_profile,
553 &ec_impl,
554 &ss);
555 ceph_assert(ec_impl);
556 return new ECBackend(
557 l,
558 coll,
559 ch,
560 store,
561 cct,
562 ec_impl,
563 pool.stripe_width);
564 }
565 default:
566 ceph_abort();
567 return NULL;
568 }
569 }
570
int PGBackend::be_scan_list(
  ScrubMap &map,
  ScrubMapBuilder &pos)
{
  // Scan the object at the builder's current position into the scrub map:
  // stat it, collect its attrs, and (for deep scrub) run be_deep_scrub().
  // Returns -EINPROGRESS when the deep scrub of this object is not done yet
  // (position is NOT advanced), else 0.
  dout(10) << __func__ << " " << pos << dendl;
  ceph_assert(!pos.done());
  ceph_assert(pos.pos < pos.ls.size());
  hobject_t& poid = pos.ls[pos.pos];

  struct stat st;
  int r = store->stat(
    ch,
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    &st,
    true);
  if (r == 0) {
    ScrubMap::object &o = map.objects[poid];
    o.size = st.st_size;
    ceph_assert(!o.negative);
    store->getattrs(
      ch,
      ghobject_t(
	poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
      o.attrs);

    if (pos.deep) {
      // May return -EINPROGRESS, handled below.
      r = be_deep_scrub(poid, map, pos, o);
    }
    dout(25) << __func__ << " " << poid << dendl;
  } else if (r == -ENOENT) {
    // Object vanished between listing and scan; simply skip it.
    dout(25) << __func__ << " " << poid << " got " << r
	     << ", skipping" << dendl;
  } else if (r == -EIO) {
    // Record the stat failure in the map so it is reported as an error.
    dout(25) << __func__ << " " << poid << " got " << r
	     << ", stat_error" << dendl;
    ScrubMap::object &o = map.objects[poid];
    o.stat_error = true;
  } else {
    derr << __func__ << " got: " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  if (r == -EINPROGRESS) {
    return -EINPROGRESS;
  }
  pos.next_object();
  return 0;
}
619
bool PGBackend::be_compare_scrub_objects(
  pg_shard_t auth_shard,
  const ScrubMap::object &auth,
  const object_info_t& auth_oi,
  const ScrubMap::object &candidate,
  shard_info_wrapper &shard_result,
  inconsistent_obj_wrapper &obj_result,
  ostream &errorstream,
  bool has_snapset)
{
  // Compare a candidate shard's scrub data against the authoritative copy.
  // Object-level mismatches (both shards disagree) are recorded in
  // obj_result; candidate-vs-object_info mismatches go in shard_result.
  // Each detected error appends a ", "-separated description to
  // errorstream.  Returns true iff at least one error was found.
  enum { CLEAN, FOUND_ERROR } error = CLEAN;
  // Data/omap digest comparisons against the auth shard (only meaningful
  // when both sides actually have a digest).
  if (auth.digest_present && candidate.digest_present) {
    if (auth.digest != candidate.digest) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "data_digest 0x" << std::hex << candidate.digest
		  << " != data_digest 0x" << auth.digest << std::dec
		  << " from shard " << auth_shard;
      obj_result.set_data_digest_mismatch();
    }
  }
  if (auth.omap_digest_present && candidate.omap_digest_present) {
    if (auth.omap_digest != candidate.omap_digest) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
		  << " != omap_digest 0x" << auth.omap_digest << std::dec
		  << " from shard " << auth_shard;
      obj_result.set_omap_digest_mismatch();
    }
  }
  // For replicated pools also compare against the digests recorded in the
  // authoritative object_info.
  if (parent->get_pool().is_replicated()) {
    if (auth_oi.is_data_digest() && candidate.digest_present) {
      if (auth_oi.data_digest != candidate.digest) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	errorstream << "data_digest 0x" << std::hex << candidate.digest
		    << " != data_digest 0x" << auth_oi.data_digest << std::dec
		    << " from auth oi " << auth_oi;
	shard_result.set_data_digest_mismatch_info();
      }
    }
    if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
      if (auth_oi.omap_digest != candidate.omap_digest) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
		    << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
		    << " from auth oi " << auth_oi;
	shard_result.set_omap_digest_mismatch_info();
      }
    }
  }
  // A stat error makes the remaining comparisons meaningless; stop here.
  if (candidate.stat_error)
    return error == FOUND_ERROR;
  // Compare the raw object-info attr bytes (only when the candidate's info
  // was readable at all).
  if (!shard_result.has_info_missing()
      && !shard_result.has_info_corrupted()) {
    bufferlist can_bl, auth_bl;
    auto can_attr = candidate.attrs.find(OI_ATTR);
    auto auth_attr = auth.attrs.find(OI_ATTR);

    ceph_assert(auth_attr != auth.attrs.end());
    ceph_assert(can_attr != candidate.attrs.end());

    can_bl.push_back(can_attr->second);
    auth_bl.push_back(auth_attr->second);
    if (!can_bl.contents_equal(auth_bl)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      obj_result.set_object_info_inconsistency();
      errorstream << "object info inconsistent ";
    }
  }
  // Compare the raw snapset attr bytes (head objects only).
  if (has_snapset) {
    if (!shard_result.has_snapset_missing()
	&& !shard_result.has_snapset_corrupted()) {
      bufferlist can_bl, auth_bl;
      auto can_attr = candidate.attrs.find(SS_ATTR);
      auto auth_attr = auth.attrs.find(SS_ATTR);

      ceph_assert(auth_attr != auth.attrs.end());
      ceph_assert(can_attr != candidate.attrs.end());

      can_bl.push_back(can_attr->second);
      auth_bl.push_back(auth_attr->second);
      if (!can_bl.contents_equal(auth_bl)) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	obj_result.set_snapset_inconsistency();
	errorstream << "snapset inconsistent ";
      }
    }
  }
  // EC pools additionally carry a hash-info attr; compare its raw bytes.
  if (parent->get_pool().is_erasure()) {
    if (!shard_result.has_hinfo_missing()
	&& !shard_result.has_hinfo_corrupted()) {
      bufferlist can_bl, auth_bl;
      auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
      auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());

      ceph_assert(auth_hi != auth.attrs.end());
      ceph_assert(can_hi != candidate.attrs.end());

      can_bl.push_back(can_hi->second);
      auth_bl.push_back(auth_hi->second);
      if (!can_bl.contents_equal(auth_bl)) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	obj_result.set_hinfo_inconsistency();
	errorstream << "hinfo inconsistent ";
      }
    }
  }
  // Size checks: candidate vs object_info, candidate vs auth shard, and an
  // absolute cap against osd_max_object_size.
  uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
  if (oi_size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " != size " << oi_size
		<< " from auth oi " << auth_oi;
    shard_result.set_size_mismatch_info();
  }
  if (auth.size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " != size " << auth.size
		<< " from shard " << auth_shard;
    obj_result.set_size_mismatch();
  }
  // If the replica is too large and we didn't already count it for this object
  //
  if (candidate.size > cct->_conf->osd_max_object_size
      && !obj_result.has_size_too_large()) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " > " << cct->_conf->osd_max_object_size
		<< " is too large";
    obj_result.set_size_too_large();
  }
  // User xattr comparison, both directions: attrs on auth but missing or
  // different on candidate, then attrs on candidate but missing on auth.
  for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
       i != auth.attrs.end();
       ++i) {
    // We check system keys seperately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!candidate.attrs.count(i->first)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr value mismatch '" << i->first << "'";
      obj_result.set_attr_value_mismatch();
    }
  }
  for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
       i != candidate.attrs.end();
       ++i) {
    // We check system keys seperately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!auth.attrs.count(i->first)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    }
  }
  return error == FOUND_ERROR;
}
807
808 static int dcount(const object_info_t &oi)
809 {
810 int count = 0;
811 if (oi.is_data_digest())
812 count++;
813 if (oi.is_omap_digest())
814 count++;
815 return count;
816 }
817
818 map<pg_shard_t, ScrubMap *>::const_iterator
819 PGBackend::be_select_auth_object(
820 const hobject_t &obj,
821 const map<pg_shard_t,ScrubMap*> &maps,
822 object_info_t *auth_oi,
823 map<pg_shard_t, shard_info_wrapper> &shard_map,
824 bool &digest_match,
825 spg_t pgid,
826 ostream &errorstream)
827 {
828 eversion_t auth_version;
829
830 // Create list of shards with primary first so it will be auth copy all
831 // other things being equal.
832 list<pg_shard_t> shards;
833 for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
834 j != maps.end();
835 ++j) {
836 if (j->first == get_parent()->whoami_shard())
837 continue;
838 shards.push_back(j->first);
839 }
840 shards.push_front(get_parent()->whoami_shard());
841
842 map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
843 digest_match = true;
844 for (auto &l : shards) {
845 ostringstream shard_errorstream;
846 bool error = false;
847 map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
848 map<hobject_t, ScrubMap::object>::iterator i =
849 j->second->objects.find(obj);
850 if (i == j->second->objects.end()) {
851 continue;
852 }
853 auto& shard_info = shard_map[j->first];
854 if (j->first == get_parent()->whoami_shard())
855 shard_info.primary = true;
856 if (i->second.read_error) {
857 shard_info.set_read_error();
858 if (error)
859 shard_errorstream << ", ";
860 error = true;
861 shard_errorstream << "candidate had a read error";
862 }
863 if (i->second.ec_hash_mismatch) {
864 shard_info.set_ec_hash_mismatch();
865 if (error)
866 shard_errorstream << ", ";
867 error = true;
868 shard_errorstream << "candidate had an ec hash mismatch";
869 }
870 if (i->second.ec_size_mismatch) {
871 shard_info.set_ec_size_mismatch();
872 if (error)
873 shard_errorstream << ", ";
874 error = true;
875 shard_errorstream << "candidate had an ec size mismatch";
876 }
877
878 object_info_t oi;
879 bufferlist bl;
880 map<string, bufferptr>::iterator k;
881 SnapSet ss;
882 bufferlist ss_bl, hk_bl;
883
884 if (i->second.stat_error) {
885 shard_info.set_stat_error();
886 if (error)
887 shard_errorstream << ", ";
888 error = true;
889 shard_errorstream << "candidate had a stat error";
890 // With stat_error no further checking
891 // We don't need to also see a missing_object_info_attr
892 goto out;
893 }
894
895 // We won't pick an auth copy if the snapset is missing or won't decode.
896 ceph_assert(!obj.is_snapdir());
897 if (obj.is_head()) {
898 k = i->second.attrs.find(SS_ATTR);
899 if (k == i->second.attrs.end()) {
900 shard_info.set_snapset_missing();
901 if (error)
902 shard_errorstream << ", ";
903 error = true;
904 shard_errorstream << "candidate had a missing snapset key";
905 } else {
906 ss_bl.push_back(k->second);
907 try {
908 auto bliter = ss_bl.cbegin();
909 decode(ss, bliter);
910 } catch (...) {
911 // invalid snapset, probably corrupt
912 shard_info.set_snapset_corrupted();
913 if (error)
914 shard_errorstream << ", ";
915 error = true;
916 shard_errorstream << "candidate had a corrupt snapset";
917 }
918 }
919 }
920
921 if (parent->get_pool().is_erasure()) {
922 ECUtil::HashInfo hi;
923 k = i->second.attrs.find(ECUtil::get_hinfo_key());
924 if (k == i->second.attrs.end()) {
925 shard_info.set_hinfo_missing();
926 if (error)
927 shard_errorstream << ", ";
928 error = true;
929 shard_errorstream << "candidate had a missing hinfo key";
930 } else {
931 hk_bl.push_back(k->second);
932 try {
933 auto bliter = hk_bl.cbegin();
934 decode(hi, bliter);
935 } catch (...) {
936 // invalid snapset, probably corrupt
937 shard_info.set_hinfo_corrupted();
938 if (error)
939 shard_errorstream << ", ";
940 error = true;
941 shard_errorstream << "candidate had a corrupt hinfo";
942 }
943 }
944 }
945
946 k = i->second.attrs.find(OI_ATTR);
947 if (k == i->second.attrs.end()) {
948 // no object info on object, probably corrupt
949 shard_info.set_info_missing();
950 if (error)
951 shard_errorstream << ", ";
952 error = true;
953 shard_errorstream << "candidate had a missing info key";
954 goto out;
955 }
956 bl.push_back(k->second);
957 try {
958 auto bliter = bl.cbegin();
959 decode(oi, bliter);
960 } catch (...) {
961 // invalid object info, probably corrupt
962 shard_info.set_info_corrupted();
963 if (error)
964 shard_errorstream << ", ";
965 error = true;
966 shard_errorstream << "candidate had a corrupt info";
967 goto out;
968 }
969
970 // This is automatically corrected in PG::_repair_oinfo_oid()
971 ceph_assert(oi.soid == obj);
972
973 if (i->second.size != be_get_ondisk_size(oi.size)) {
974 shard_info.set_obj_size_info_mismatch();
975 if (error)
976 shard_errorstream << ", ";
977 error = true;
978 shard_errorstream << "candidate size " << i->second.size << " info size "
979 << oi.size << " mismatch";
980 }
981
982 // digest_match will only be true if computed digests are the same
983 if (auth_version != eversion_t()
984 && auth->second->objects[obj].digest_present
985 && i->second.digest_present
986 && auth->second->objects[obj].digest != i->second.digest) {
987 digest_match = false;
988 dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
989 << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
990 << dendl;
991 }
992
993 // Don't use this particular shard due to previous errors
994 // XXX: For now we can't pick one shard for repair and another's object info or snapset
995 if (shard_info.errors)
996 goto out;
997
998 if (auth_version == eversion_t() || oi.version > auth_version ||
999 (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
1000 auth = j;
1001 *auth_oi = oi;
1002 auth_version = oi.version;
1003 }
1004
1005 out:
1006 if (error)
1007 errorstream << pgid.pgid << " shard " << l << " soid " << obj
1008 << " : " << shard_errorstream.str() << "\n";
1009 // Keep scanning other shards
1010 }
1011 dout(10) << __func__ << ": selecting osd " << auth->first
1012 << " for obj " << obj
1013 << " with oi " << *auth_oi
1014 << dendl;
1015 return auth;
1016 }
1017
1018 void PGBackend::be_compare_scrubmaps(
1019 const map<pg_shard_t,ScrubMap*> &maps,
1020 const set<hobject_t> &master_set,
1021 bool repair,
1022 map<hobject_t, set<pg_shard_t>> &missing,
1023 map<hobject_t, set<pg_shard_t>> &inconsistent,
1024 map<hobject_t, list<pg_shard_t>> &authoritative,
1025 map<hobject_t, pair<std::optional<uint32_t>,
1026 std::optional<uint32_t>>> &missing_digest,
1027 int &shallow_errors, int &deep_errors,
1028 Scrub::Store *store,
1029 const spg_t& pgid,
1030 const vector<int> &acting,
1031 ostream &errorstream)
1032 {
1033 utime_t now = ceph_clock_now();
1034
1035 // Check maps against master set and each other
1036 for (set<hobject_t>::const_iterator k = master_set.begin();
1037 k != master_set.end();
1038 ++k) {
1039 object_info_t auth_oi;
1040 map<pg_shard_t, shard_info_wrapper> shard_map;
1041
1042 inconsistent_obj_wrapper object_error{*k};
1043
1044 bool digest_match;
1045 map<pg_shard_t, ScrubMap *>::const_iterator auth =
1046 be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match,
1047 pgid, errorstream);
1048
1049 list<pg_shard_t> auth_list;
1050 set<pg_shard_t> object_errors;
1051 if (auth == maps.end()) {
1052 object_error.set_version(0);
1053 object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
1054 deep_errors, get_parent()->whoami_shard());
1055 if (object_error.has_deep_errors())
1056 ++deep_errors;
1057 else if (object_error.has_shallow_errors())
1058 ++shallow_errors;
1059 store->add_object_error(k->pool, object_error);
1060 errorstream << pgid.pgid << " soid " << *k
1061 << " : failed to pick suitable object info\n";
1062 continue;
1063 }
1064 object_error.set_version(auth_oi.user_version);
1065 ScrubMap::object& auth_object = auth->second->objects[*k];
1066 set<pg_shard_t> cur_missing;
1067 set<pg_shard_t> cur_inconsistent;
1068 bool fix_digest = false;
1069
1070 for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
1071 if (j == auth)
1072 shard_map[auth->first].selected_oi = true;
1073 if (j->second->objects.count(*k)) {
1074 shard_map[j->first].set_object(j->second->objects[*k]);
1075 // Compare
1076 stringstream ss;
1077 bool found = be_compare_scrub_objects(auth->first,
1078 auth_object,
1079 auth_oi,
1080 j->second->objects[*k],
1081 shard_map[j->first],
1082 object_error,
1083 ss,
1084 k->has_snapset());
1085
1086 dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
1087 << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
1088 << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
1089 << dendl;
1090 // If all replicas match, but they don't match object_info we can
1091 // repair it by using missing_digest mechanism
1092 if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
1093 && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
1094 && auth_object.digest_present) {
1095 // Set in missing_digests
1096 fix_digest = true;
1097 // Clear the error
1098 shard_map[j->first].clear_data_digest_mismatch_info();
1099 errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n";
1100 }
1101 // Some errors might have already been set in be_select_auth_object()
1102 if (shard_map[j->first].errors != 0) {
1103 cur_inconsistent.insert(j->first);
1104 if (shard_map[j->first].has_deep_errors())
1105 ++deep_errors;
1106 else
1107 ++shallow_errors;
1108 // Only true if be_compare_scrub_objects() found errors and put something
1109 // in ss.
1110 if (found)
1111 errorstream << pgid << " shard " << j->first << " soid " << *k
1112 << " : " << ss.str() << "\n";
1113 } else if (found) {
1114 // Track possible shard to use as authoritative, if needed
1115 // There are errors, without identifying the shard
1116 object_errors.insert(j->first);
1117 errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n";
1118 } else {
1119 // XXX: The auth shard might get here that we don't know
1120 // that it has the "correct" data.
1121 auth_list.push_back(j->first);
1122 }
1123 } else {
1124 cur_missing.insert(j->first);
1125 shard_map[j->first].set_missing();
1126 shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
1127 // Can't have any other errors if there is no information available
1128 ++shallow_errors;
1129 errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n";
1130 }
1131 object_error.add_shard(j->first, shard_map[j->first]);
1132 }
1133
1134 if (auth_list.empty()) {
1135 if (object_errors.empty()) {
1136 errorstream << pgid.pgid << " soid " << *k
1137 << " : failed to pick suitable auth object\n";
1138 goto out;
1139 }
1140 // Object errors exist and nothing in auth_list
1141 // Prefer the auth shard otherwise take first from list.
1142 pg_shard_t shard;
1143 if (object_errors.count(auth->first)) {
1144 shard = auth->first;
1145 } else {
1146 shard = *(object_errors.begin());
1147 }
1148 auth_list.push_back(shard);
1149 object_errors.erase(shard);
1150 }
1151 // At this point auth_list is populated, so we add the object errors shards
1152 // as inconsistent.
1153 cur_inconsistent.insert(object_errors.begin(), object_errors.end());
1154 if (!cur_missing.empty()) {
1155 missing[*k] = cur_missing;
1156 }
1157 if (!cur_inconsistent.empty()) {
1158 inconsistent[*k] = cur_inconsistent;
1159 }
1160
1161 if (fix_digest) {
1162 std::optional<uint32_t> data_digest, omap_digest;
1163 ceph_assert(auth_object.digest_present);
1164 data_digest = auth_object.digest;
1165 if (auth_object.omap_digest_present) {
1166 omap_digest = auth_object.omap_digest;
1167 }
1168 missing_digest[*k] = make_pair(data_digest, omap_digest);
1169 }
1170 if (!cur_inconsistent.empty() || !cur_missing.empty()) {
1171 authoritative[*k] = auth_list;
1172 } else if (!fix_digest && parent->get_pool().is_replicated()) {
1173 enum {
1174 NO = 0,
1175 MAYBE = 1,
1176 FORCE = 2,
1177 } update = NO;
1178
1179 if (auth_object.digest_present && !auth_oi.is_data_digest()) {
1180 dout(20) << __func__ << " missing data digest on " << *k << dendl;
1181 update = MAYBE;
1182 }
1183 if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
1184 dout(20) << __func__ << " missing omap digest on " << *k << dendl;
1185 update = MAYBE;
1186 }
1187
1188 // recorded digest != actual digest?
1189 if (auth_oi.is_data_digest() && auth_object.digest_present &&
1190 auth_oi.data_digest != auth_object.digest) {
1191 ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info());
1192 errorstream << pgid << " recorded data digest 0x"
1193 << std::hex << auth_oi.data_digest << " != on disk 0x"
1194 << auth_object.digest << std::dec << " on " << auth_oi.soid
1195 << "\n";
1196 if (repair)
1197 update = FORCE;
1198 }
1199 if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
1200 auth_oi.omap_digest != auth_object.omap_digest) {
1201 ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info());
1202 errorstream << pgid << " recorded omap digest 0x"
1203 << std::hex << auth_oi.omap_digest << " != on disk 0x"
1204 << auth_object.omap_digest << std::dec
1205 << " on " << auth_oi.soid << "\n";
1206 if (repair)
1207 update = FORCE;
1208 }
1209
1210 if (update != NO) {
1211 utime_t age = now - auth_oi.local_mtime;
1212 if (update == FORCE ||
1213 age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
1214 std::optional<uint32_t> data_digest, omap_digest;
1215 if (auth_object.digest_present) {
1216 data_digest = auth_object.digest;
1217 dout(20) << __func__ << " will update data digest on " << *k << dendl;
1218 }
1219 if (auth_object.omap_digest_present) {
1220 omap_digest = auth_object.omap_digest;
1221 dout(20) << __func__ << " will update omap digest on " << *k << dendl;
1222 }
1223 missing_digest[*k] = make_pair(data_digest, omap_digest);
1224 } else {
1225 dout(20) << __func__ << " missing digest but age " << age
1226 << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
1227 << " on " << *k << dendl;
1228 }
1229 }
1230 }
1231 out:
1232 if (object_error.has_deep_errors())
1233 ++deep_errors;
1234 else if (object_error.has_shallow_errors())
1235 ++shallow_errors;
1236 if (object_error.errors || object_error.union_shards.errors) {
1237 store->add_object_error(k->pool, object_error);
1238 }
1239 }
1240 }
1241
1242 void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps,
1243 const set<hobject_t> &master_set,
1244 omap_stat_t& omap_stats,
1245 ostream &warnstream) const
1246 {
1247 bool needs_omap_check = false;
1248 for (const auto& map : maps) {
1249 if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) {
1250 needs_omap_check = true;
1251 break;
1252 }
1253 }
1254
1255 if (!needs_omap_check) {
1256 return; // Nothing to do
1257 }
1258
1259 // Iterate through objects and update omap stats
1260 for (const auto& k : master_set) {
1261 for (const auto& map : maps) {
1262 if (map.first != get_parent()->primary_shard()) {
1263 // Only set omap stats for the primary
1264 continue;
1265 }
1266 auto it = map.second->objects.find(k);
1267 if (it == map.second->objects.end())
1268 continue;
1269 ScrubMap::object& obj = it->second;
1270 omap_stats.omap_bytes += obj.object_omap_bytes;
1271 omap_stats.omap_keys += obj.object_omap_keys;
1272 if (obj.large_omap_object_found) {
1273 pg_t pg;
1274 auto osdmap = get_osdmap();
1275 osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg);
1276 pg_t mpg = osdmap->raw_pg_to_pg(pg);
1277 omap_stats.large_omap_objects++;
1278 warnstream << "Large omap object found. Object: " << k
1279 << " PG: " << pg << " (" << mpg << ")"
1280 << " Key count: " << obj.large_omap_object_key_count
1281 << " Size (bytes): " << obj.large_omap_object_value_size
1282 << '\n';
1283 break;
1284 }
1285 }
1286 }
1287 }