]> git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / osd / PGBackend.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
25 #include "OSD.h"
26 #include "erasure-code/ErasureCodePlugin.h"
27 #include "OSDMap.h"
28 #include "PGLog.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
32
33 #define dout_context cct
34 #define dout_subsys ceph_subsys_osd
35 #define DOUT_PREFIX_ARGS this
36 #undef dout_prefix
37 #define dout_prefix _prefix(_dout, this)
// Debug-log prefix hook used by the dout_prefix macro above; delegates to
// the parent PG's prefix generator so backend log lines match the PG's.
static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
  return pgb->get_parent()->gen_dbg_prefix(*_dout);
}
41
42 void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
43 RecoveryHandle *h)
44 {
45 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
46 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
47 if (shard == get_parent()->whoami_shard())
48 continue;
49 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
50 dout(20) << __func__ << " will remove " << oid << " " << v << " from "
51 << shard << dendl;
52 h->deletes[shard].push_back(make_pair(oid, v));
53 get_parent()->begin_peer_recover(shard, oid);
54 }
55 }
56 }
57
58 void PGBackend::send_recovery_deletes(int prio,
59 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
60 {
61 epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
62 for (const auto& p : deletes) {
63 const auto& shard = p.first;
64 const auto& objects = p.second;
65 ConnectionRef con = get_parent()->get_con_osd_cluster(
66 shard.osd,
67 get_osdmap_epoch());
68 if (!con)
69 continue;
70 auto it = objects.begin();
71 while (it != objects.end()) {
72 uint64_t cost = 0;
73 uint64_t deletes = 0;
74 spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
75 MOSDPGRecoveryDelete *msg =
76 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
77 target_pg,
78 get_osdmap_epoch(),
79 min_epoch);
80 msg->set_priority(prio);
81
82 while (it != objects.end() &&
83 cost < cct->_conf->osd_max_push_cost &&
84 deletes < cct->_conf->osd_max_push_objects) {
85 dout(20) << __func__ << ": sending recovery delete << " << it->first
86 << " " << it->second << " to osd." << shard << dendl;
87 msg->objects.push_back(*it);
88 cost += cct->_conf->osd_push_per_object_cost;
89 ++deletes;
90 ++it;
91 }
92
93 msg->set_cost(cost);
94 get_parent()->send_message_osd_cluster(msg, con);
95 }
96 }
97 }
98
99 bool PGBackend::handle_message(OpRequestRef op)
100 {
101 switch (op->get_req()->get_type()) {
102 case MSG_OSD_PG_RECOVERY_DELETE:
103 handle_recovery_delete(op);
104 return true;
105
106 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
107 handle_recovery_delete_reply(op);
108 return true;
109
110 default:
111 break;
112 }
113
114 return _handle_message(op);
115 }
116
// Handle a MOSDPGRecoveryDelete from the primary: remove each listed
// missing object locally, then reply once all removals have completed.
void PGBackend::handle_recovery_delete(OpRequestRef op)
{
  const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
  dout(20) << __func__ << " " << op << dendl;

  op->mark_started();

  // One gather sub per object; the finisher below fires when all are done.
  C_GatherBuilder gather(cct);
  for (const auto &p : m->objects) {
    get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
  }

  // Build the reply up front and capture it (plus the connection ref, which
  // keeps the peer connection alive) by value into the finisher lambda.
  MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
  reply->from = get_parent()->whoami_shard();
  reply->set_priority(m->get_priority());
  reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
  reply->map_epoch = m->map_epoch;
  reply->min_epoch = m->min_epoch;
  reply->objects = m->objects;
  ConnectionRef conn = m->get_connection();

  gather.set_finisher(new FunctionContext(
    [=](int r) {
      if (r != -EAGAIN) {
	get_parent()->send_message_osd_cluster(reply, conn.get());
      } else {
	// Not sending: drop our ref so the reply message is not leaked.
	reply->put();
      }
    }));
  gather.activate();
}
149
150 void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
151 {
152 const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req());
153 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
154 dout(20) << __func__ << " " << op << dendl;
155
156 for (const auto &p : m->objects) {
157 ObjectRecoveryInfo recovery_info;
158 hobject_t oid = p.first;
159 recovery_info.version = p.second;
160 get_parent()->on_peer_recover(m->from, oid, recovery_info);
161 bool peers_recovered = true;
162 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
163 if (shard == get_parent()->whoami_shard())
164 continue;
165 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
166 dout(20) << __func__ << " " << oid << " still missing on at least "
167 << shard << dendl;
168 peers_recovered = false;
169 break;
170 }
171 }
172 if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
173 dout(20) << __func__ << " completed recovery, local_missing = "
174 << get_parent()->get_local_missing() << dendl;
175 object_stat_sum_t stat_diff;
176 stat_diff.num_objects_recovered = 1;
177 get_parent()->on_global_recover(p.first, stat_diff, true);
178 }
179 }
180 }
181
// Undo a log entry by replaying its ObjectModDesc in reverse: each visitor
// callback builds the inverse operation into a fresh transaction and
// *prepends* it (temp holds the new op, temp.append(t) tacks the already
// accumulated ops after it, then temp.swap(t) makes that the accumulator).
// The net effect is that operations are undone in reverse visit order.
void PGBackend::rollback(
  const pg_log_entry_t &entry,
  ObjectStore::Transaction *t)
{

  struct RollbackVisitor : public ObjectModDesc::Visitor {
    const hobject_t &hoid;
    PGBackend *pg;
    ObjectStore::Transaction t;  // accumulated rollback ops (reverse order)
    RollbackVisitor(
      const hobject_t &hoid,
      PGBackend *pg) : hoid(hoid), pg(pg) {}
    void append(uint64_t old_size) override {
      ObjectStore::Transaction temp;
      pg->rollback_append(hoid, old_size, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
      ObjectStore::Transaction temp;
      pg->rollback_setattrs(hoid, attrs, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void try_rmobject(version_t old_version) override {
      ObjectStore::Transaction temp;
      pg->rollback_try_stash(hoid, old_version, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void create() override {
      ObjectStore::Transaction temp;
      pg->rollback_create(hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void update_snaps(const set<snapid_t> &snaps) override {
      ObjectStore::Transaction temp;
      pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
      temp.append(t);
      temp.swap(t);
    }
    void rollback_extents(
      version_t gen,
      const vector<pair<uint64_t, uint64_t> > &extents) override {
      ObjectStore::Transaction temp;
      pg->rollback_extents(gen, extents, hoid, &temp);
      temp.append(t);
      temp.swap(t);
    }
  };

  // Caller must only roll back entries that recorded rollback state.
  ceph_assert(entry.mod_desc.can_rollback());
  RollbackVisitor vis(entry.soid, this);
  entry.mod_desc.visit(&vis);
  t->append(vis.t);
}
245
// Visitor that discards the rollback state a log entry stashed: the
// generation-tagged clones created by rmobject/try_rmobject and by
// rollback_extents are removed, making the entry no longer rollbackable.
// All other ObjectModDesc callbacks are no-ops (nothing durable to trim).
struct Trimmer : public ObjectModDesc::Visitor {
  const hobject_t &soid;
  PGBackend *pg;
  ObjectStore::Transaction *t;  // not owned; trim ops are appended here
  Trimmer(
    const hobject_t &soid,
    PGBackend *pg,
    ObjectStore::Transaction *t)
    : soid(soid), pg(pg), t(t) {}
  void rmobject(version_t old_version) override {
    pg->trim_rollback_object(
      soid,
      old_version,
      t);
  }
  // try_rmobject defaults to rmobject
  void rollback_extents(
    version_t gen,
    const vector<pair<uint64_t, uint64_t> > &extents) override {
    pg->trim_rollback_object(
      soid,
      gen,
      t);
  }
};
271
272 void PGBackend::rollforward(
273 const pg_log_entry_t &entry,
274 ObjectStore::Transaction *t)
275 {
276 auto dpp = get_parent()->get_dpp();
277 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
278 if (!entry.can_rollback())
279 return;
280 Trimmer trimmer(entry.soid, this, t);
281 entry.mod_desc.visit(&trimmer);
282 }
283
284 void PGBackend::trim(
285 const pg_log_entry_t &entry,
286 ObjectStore::Transaction *t)
287 {
288 if (!entry.can_rollback())
289 return;
290 Trimmer trimmer(entry.soid, this, t);
291 entry.mod_desc.visit(&trimmer);
292 }
293
294 void PGBackend::try_stash(
295 const hobject_t &hoid,
296 version_t v,
297 ObjectStore::Transaction *t)
298 {
299 t->try_rename(
300 coll,
301 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
302 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
303 }
304
305 void PGBackend::remove(
306 const hobject_t &hoid,
307 ObjectStore::Transaction *t) {
308 ceph_assert(!hoid.is_temp());
309 t->remove(
310 coll,
311 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
312 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
313 }
314
315 void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
316 {
317 dout(10) << __func__ << dendl;
318 // clear temp
319 for (set<hobject_t>::iterator i = temp_contents.begin();
320 i != temp_contents.end();
321 ++i) {
322 dout(10) << __func__ << ": Removing oid "
323 << *i << " from the temp collection" << dendl;
324 t->remove(
325 coll,
326 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
327 }
328 temp_contents.clear();
329 }
330
// List up to max objects starting at begin, filtering out pgmeta, temp,
// and generation-tagged (rollback stash) entries.  Loops until at least
// min visible objects are collected or the collection is exhausted; *next
// is set to the resume position on success.  Returns 0 or a negative
// errno from collection_list.
int PGBackend::objects_list_partial(
  const hobject_t &begin,
  int min,
  int max,
  vector<hobject_t> *ls,
  hobject_t *next)
{
  ceph_assert(ls);
  // Starts with the smallest generation to make sure the result list
  // has the marker object (it might have multiple generations
  // though, which would be filtered).
  ghobject_t _next;
  if (!begin.is_min())
    _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
  ls->reserve(max);
  int r = 0;

  if (min > max)
    min = max;

  while (!_next.is_max() && ls->size() < (unsigned)min) {
    vector<ghobject_t> objects;
    // Ask for only as many as we still need; _next is advanced in place.
    r = store->collection_list(
      ch,
      _next,
      ghobject_t::get_max(),
      max - ls->size(),
      &objects,
      &_next);
    if (r != 0) {
      derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
      break;
    }
    for (vector<ghobject_t>::iterator i = objects.begin();
	 i != objects.end();
	 ++i) {
      if (i->is_pgmeta() || i->hobj.is_temp()) {
	continue;
      }
      if (i->is_no_gen()) {
	ls->push_back(i->hobj);
      }
    }
  }
  if (r == 0)
    *next = _next.hobj;
  return r;
}
379
380 int PGBackend::objects_list_range(
381 const hobject_t &start,
382 const hobject_t &end,
383 vector<hobject_t> *ls,
384 vector<ghobject_t> *gen_obs)
385 {
386 ceph_assert(ls);
387 vector<ghobject_t> objects;
388 int r = store->collection_list(
389 ch,
390 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
391 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
392 INT_MAX,
393 &objects,
394 NULL);
395 ls->reserve(objects.size());
396 for (vector<ghobject_t>::iterator i = objects.begin();
397 i != objects.end();
398 ++i) {
399 if (i->is_pgmeta() || i->hobj.is_temp()) {
400 continue;
401 }
402 if (i->is_no_gen()) {
403 ls->push_back(i->hobj);
404 } else if (gen_obs) {
405 gen_obs->push_back(*i);
406 }
407 }
408 return r;
409 }
410
411 int PGBackend::objects_get_attr(
412 const hobject_t &hoid,
413 const string &attr,
414 bufferlist *out)
415 {
416 bufferptr bp;
417 int r = store->getattr(
418 ch,
419 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
420 attr.c_str(),
421 bp);
422 if (r >= 0 && out) {
423 out->clear();
424 out->push_back(std::move(bp));
425 }
426 return r;
427 }
428
429 int PGBackend::objects_get_attrs(
430 const hobject_t &hoid,
431 map<string, bufferlist> *out)
432 {
433 return store->getattrs(
434 ch,
435 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
436 *out);
437 }
438
439 void PGBackend::rollback_setattrs(
440 const hobject_t &hoid,
441 map<string, boost::optional<bufferlist> > &old_attrs,
442 ObjectStore::Transaction *t) {
443 map<string, bufferlist> to_set;
444 ceph_assert(!hoid.is_temp());
445 for (map<string, boost::optional<bufferlist> >::iterator i = old_attrs.begin();
446 i != old_attrs.end();
447 ++i) {
448 if (i->second) {
449 to_set[i->first] = i->second.get();
450 } else {
451 t->rmattr(
452 coll,
453 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
454 i->first);
455 }
456 }
457 t->setattrs(
458 coll,
459 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
460 to_set);
461 }
462
463 void PGBackend::rollback_append(
464 const hobject_t &hoid,
465 uint64_t old_size,
466 ObjectStore::Transaction *t) {
467 ceph_assert(!hoid.is_temp());
468 t->truncate(
469 coll,
470 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
471 old_size);
472 }
473
474 void PGBackend::rollback_stash(
475 const hobject_t &hoid,
476 version_t old_version,
477 ObjectStore::Transaction *t) {
478 ceph_assert(!hoid.is_temp());
479 t->remove(
480 coll,
481 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
482 t->collection_move_rename(
483 coll,
484 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
485 coll,
486 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
487 }
488
489 void PGBackend::rollback_try_stash(
490 const hobject_t &hoid,
491 version_t old_version,
492 ObjectStore::Transaction *t) {
493 ceph_assert(!hoid.is_temp());
494 t->remove(
495 coll,
496 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
497 t->try_rename(
498 coll,
499 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
500 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
501 }
502
503 void PGBackend::rollback_extents(
504 version_t gen,
505 const vector<pair<uint64_t, uint64_t> > &extents,
506 const hobject_t &hoid,
507 ObjectStore::Transaction *t) {
508 auto shard = get_parent()->whoami_shard().shard;
509 for (auto &&extent: extents) {
510 t->clone_range(
511 coll,
512 ghobject_t(hoid, gen, shard),
513 ghobject_t(hoid, ghobject_t::NO_GEN, shard),
514 extent.first,
515 extent.second,
516 extent.first);
517 }
518 t->remove(
519 coll,
520 ghobject_t(hoid, gen, shard));
521 }
522
523 void PGBackend::trim_rollback_object(
524 const hobject_t &hoid,
525 version_t old_version,
526 ObjectStore::Transaction *t) {
527 ceph_assert(!hoid.is_temp());
528 t->remove(
529 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
530 }
531
// Factory: construct the backend implementation matching the pool type.
// Replicated pools get a ReplicatedBackend; erasure-coded pools load the
// EC plugin named in the profile and wrap it in an ECBackend.  Any other
// pool type aborts.  The returned pointer is owned by the caller.
PGBackend *PGBackend::build_pg_backend(
  const pg_pool_t &pool,
  const map<string,string>& profile,
  Listener *l,
  coll_t coll,
  ObjectStore::CollectionHandle &ch,
  ObjectStore *store,
  CephContext *cct)
{
  ErasureCodeProfile ec_profile = profile;
  switch (pool.type) {
  case pg_pool_t::TYPE_REPLICATED: {
    return new ReplicatedBackend(l, coll, ch, store, cct);
  }
  case pg_pool_t::TYPE_ERASURE: {
    ErasureCodeInterfaceRef ec_impl;
    stringstream ss;
    // NOTE(review): profile.find("plugin") is dereferenced without an
    // end() check — presumably the monitor validates EC profiles before
    // they reach here; confirm before relying on it.
    ceph::ErasureCodePluginRegistry::instance().factory(
      profile.find("plugin")->second,
      cct->_conf.get_val<std::string>("erasure_code_dir"),
      ec_profile,
      &ec_impl,
      &ss);
    ceph_assert(ec_impl);
    return new ECBackend(
      l,
      coll,
      ch,
      store,
      cct,
      ec_impl,
      pool.stripe_width);
  }
  default:
    ceph_abort();
    return NULL;
  }
}
570
// Scrub one object at the builder's current position: stat it, collect
// size and xattrs into the scrub map, and optionally deep-scrub it.
// Returns -EINPROGRESS when the deep scrub must be resumed later (the
// position is not advanced); otherwise advances to the next object and
// returns 0.  ENOENT is benign (object vanished); EIO marks stat_error;
// any other stat failure aborts.
int PGBackend::be_scan_list(
  ScrubMap &map,
  ScrubMapBuilder &pos)
{
  dout(10) << __func__ << " " << pos << dendl;
  ceph_assert(!pos.done());
  ceph_assert(pos.pos < pos.ls.size());
  hobject_t& poid = pos.ls[pos.pos];

  struct stat st;
  int r = store->stat(
    ch,
    ghobject_t(
      poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
    &st,
    true);
  if (r == 0) {
    ScrubMap::object &o = map.objects[poid];
    o.size = st.st_size;
    ceph_assert(!o.negative);
    store->getattrs(
      ch,
      ghobject_t(
	poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
      o.attrs);

    if (pos.deep) {
      // May return -EINPROGRESS to request a resume.
      r = be_deep_scrub(poid, map, pos, o);
    }
    dout(25) << __func__ << " " << poid << dendl;
  } else if (r == -ENOENT) {
    dout(25) << __func__ << " " << poid << " got " << r
	     << ", skipping" << dendl;
  } else if (r == -EIO) {
    dout(25) << __func__ << " " << poid << " got " << r
	     << ", stat_error" << dendl;
    ScrubMap::object &o = map.objects[poid];
    o.stat_error = true;
  } else {
    derr << __func__ << " got: " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  if (r == -EINPROGRESS) {
    return -EINPROGRESS;
  }
  pos.next_object();
  return 0;
}
619
// Compare one candidate scrub-map object against the chosen authoritative
// copy (and against the authoritative object_info).  Discrepancies are
// recorded on shard_result (candidate-vs-oi problems) or obj_result
// (shard-vs-shard problems) and described, comma-separated, in
// errorstream.  Returns true iff at least one error was found.
bool PGBackend::be_compare_scrub_objects(
  pg_shard_t auth_shard,
  const ScrubMap::object &auth,
  const object_info_t& auth_oi,
  const ScrubMap::object &candidate,
  shard_info_wrapper &shard_result,
  inconsistent_obj_wrapper &obj_result,
  ostream &errorstream,
  bool has_snapset)
{
  enum { CLEAN, FOUND_ERROR } error = CLEAN;
  // Shard-vs-shard digest comparisons (only when both sides computed one).
  if (auth.digest_present && candidate.digest_present) {
    if (auth.digest != candidate.digest) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "data_digest 0x" << std::hex << candidate.digest
		  << " != data_digest 0x" << auth.digest << std::dec
		  << " from shard " << auth_shard;
      obj_result.set_data_digest_mismatch();
    }
  }
  if (auth.omap_digest_present && candidate.omap_digest_present) {
    if (auth.omap_digest != candidate.omap_digest) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
		  << " != omap_digest 0x" << auth.omap_digest << std::dec
		  << " from shard " << auth_shard;
      obj_result.set_omap_digest_mismatch();
    }
  }
  // Candidate-vs-object_info digest checks (replicated pools only; EC
  // shards store per-shard data, so whole-object digests don't apply).
  if (parent->get_pool().is_replicated()) {
    if (auth_oi.is_data_digest() && candidate.digest_present) {
      if (auth_oi.data_digest != candidate.digest) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	errorstream << "data_digest 0x" << std::hex << candidate.digest
		    << " != data_digest 0x" << auth_oi.data_digest << std::dec
		    << " from auth oi " << auth_oi;
	shard_result.set_data_digest_mismatch_info();
      }
    }
    if (auth_oi.is_omap_digest() && candidate.omap_digest_present) {
      if (auth_oi.omap_digest != candidate.omap_digest) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest
		    << " != omap_digest 0x" << auth_oi.omap_digest << std::dec
		    << " from auth oi " << auth_oi;
	shard_result.set_omap_digest_mismatch_info();
      }
    }
  }
  // With a stat error there is no attribute/size data to compare.
  if (candidate.stat_error)
    return error == FOUND_ERROR;
  // Compare the raw OI_ATTR bytes unless the candidate's info is already
  // known missing/corrupt (recorded earlier in be_select_auth_object).
  if (!shard_result.has_info_missing()
      && !shard_result.has_info_corrupted()) {
    bufferlist can_bl, auth_bl;
    auto can_attr = candidate.attrs.find(OI_ATTR);
    auto auth_attr = auth.attrs.find(OI_ATTR);

    ceph_assert(auth_attr != auth.attrs.end());
    ceph_assert(can_attr != candidate.attrs.end());

    can_bl.push_back(can_attr->second);
    auth_bl.push_back(auth_attr->second);
    if (!can_bl.contents_equal(auth_bl)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      obj_result.set_object_info_inconsistency();
      errorstream << "object info inconsistent ";
    }
  }
  // Same byte-for-byte comparison for the snapset attr on head objects.
  if (has_snapset) {
    if (!shard_result.has_snapset_missing()
	&& !shard_result.has_snapset_corrupted()) {
      bufferlist can_bl, auth_bl;
      auto can_attr = candidate.attrs.find(SS_ATTR);
      auto auth_attr = auth.attrs.find(SS_ATTR);

      ceph_assert(auth_attr != auth.attrs.end());
      ceph_assert(can_attr != candidate.attrs.end());

      can_bl.push_back(can_attr->second);
      auth_bl.push_back(auth_attr->second);
      if (!can_bl.contents_equal(auth_bl)) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	obj_result.set_snapset_inconsistency();
	errorstream << "snapset inconsistent ";
      }
    }
  }
  // EC pools additionally carry a hash-info attr per shard.
  if (parent->get_pool().is_erasure()) {
    if (!shard_result.has_hinfo_missing()
	&& !shard_result.has_hinfo_corrupted()) {
      bufferlist can_bl, auth_bl;
      auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key());
      auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key());

      ceph_assert(auth_hi != auth.attrs.end());
      ceph_assert(can_hi != candidate.attrs.end());

      can_bl.push_back(can_hi->second);
      auth_bl.push_back(auth_hi->second);
      if (!can_bl.contents_equal(auth_bl)) {
	if (error != CLEAN)
	  errorstream << ", ";
	error = FOUND_ERROR;
	obj_result.set_hinfo_inconsistency();
	errorstream << "hinfo inconsistent ";
      }
    }
  }
  // Size checks: candidate vs the object_info's expected on-disk size,
  // and candidate vs the authoritative shard's observed size.
  uint64_t oi_size = be_get_ondisk_size(auth_oi.size);
  if (oi_size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " != size " << oi_size
		<< " from auth oi " << auth_oi;
    shard_result.set_size_mismatch_info();
  }
  if (auth.size != candidate.size) {
    if (error != CLEAN)
      errorstream << ", ";
    error = FOUND_ERROR;
    errorstream << "size " << candidate.size
		<< " != size " << auth.size
		<< " from shard " << auth_shard;
    obj_result.set_size_mismatch();
  }
  // User xattrs present on auth: must exist on candidate with equal value.
  for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
       i != auth.attrs.end();
       ++i) {
    // We check system keys seperately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!candidate.attrs.count(i->first)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr value mismatch '" << i->first << "'";
      obj_result.set_attr_value_mismatch();
    }
  }
  // Reverse direction: user xattrs the candidate has but auth lacks.
  for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
       i != candidate.attrs.end();
       ++i) {
    // We check system keys seperately
    if (i->first == OI_ATTR || i->first[0] != '_')
      continue;
    if (!auth.attrs.count(i->first)) {
      if (error != CLEAN)
	errorstream << ", ";
      error = FOUND_ERROR;
      errorstream << "attr name mismatch '" << i->first << "'";
      obj_result.set_attr_name_mismatch();
    }
  }
  return error == FOUND_ERROR;
}
795
796 static int dcount(const object_info_t &oi)
797 {
798 int count = 0;
799 if (oi.is_data_digest())
800 count++;
801 if (oi.is_omap_digest())
802 count++;
803 return count;
804 }
805
// Pick the authoritative shard for obj among the given scrub maps: the
// highest object_info version wins (ties broken by most digests present,
// primary first), skipping any shard whose copy shows errors.  Per-shard
// problems are recorded in shard_map and errorstream; digest_match is
// cleared if any computed data digest disagrees with the current auth's.
// Returns maps.end() when no usable copy exists.
map<pg_shard_t, ScrubMap *>::const_iterator
  PGBackend::be_select_auth_object(
  const hobject_t &obj,
  const map<pg_shard_t,ScrubMap*> &maps,
  object_info_t *auth_oi,
  map<pg_shard_t, shard_info_wrapper> &shard_map,
  bool &digest_match,
  spg_t pgid,
  ostream &errorstream)
{
  eversion_t auth_version;

  // Create list of shards with primary first so it will be auth copy all
  // other things being equal.
  list<pg_shard_t> shards;
  for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
       j != maps.end();
       ++j) {
    if (j->first == get_parent()->whoami_shard())
      continue;
    shards.push_back(j->first);
  }
  shards.push_front(get_parent()->whoami_shard());

  map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
  digest_match = true;
  for (auto &l : shards) {
    ostringstream shard_errorstream;
    bool error = false;
    map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
    map<hobject_t, ScrubMap::object>::iterator i =
      j->second->objects.find(obj);
    if (i == j->second->objects.end()) {
      // Shard's map has no entry for this object at all; nothing to judge.
      continue;
    }
    auto& shard_info = shard_map[j->first];
    if (j->first == get_parent()->whoami_shard())
      shard_info.primary = true;
    // Low-level read problems reported by the scrub scan itself.
    if (i->second.read_error) {
      shard_info.set_read_error();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate had a read error";
    }
    if (i->second.ec_hash_mismatch) {
      shard_info.set_ec_hash_mismatch();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate had an ec hash mismatch";
    }
    if (i->second.ec_size_mismatch) {
      shard_info.set_ec_size_mismatch();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate had an ec size mismatch";
    }

    object_info_t oi;
    bufferlist bl;
    map<string, bufferptr>::iterator k;
    SnapSet ss;
    bufferlist ss_bl, hk_bl;

    if (i->second.stat_error) {
      shard_info.set_stat_error();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate had a stat error";
      // With stat_error no further checking
      // We don't need to also see a missing_object_info_attr
      goto out;
    }

    // We won't pick an auth copy if the snapset is missing or won't decode.
    ceph_assert(!obj.is_snapdir());
    if (obj.is_head()) {
      k = i->second.attrs.find(SS_ATTR);
      if (k == i->second.attrs.end()) {
	shard_info.set_snapset_missing();
	if (error)
	  shard_errorstream << ", ";
	error = true;
	shard_errorstream << "candidate had a missing snapset key";
      } else {
	ss_bl.push_back(k->second);
	try {
	  auto bliter = ss_bl.cbegin();
	  decode(ss, bliter);
	} catch (...) {
	  // invalid snapset, probably corrupt
	  shard_info.set_snapset_corrupted();
	  if (error)
	    shard_errorstream << ", ";
	  error = true;
	  shard_errorstream << "candidate had a corrupt snapset";
	}
      }
    }

    // EC shards must also carry a decodable hash-info attr.
    if (parent->get_pool().is_erasure()) {
      ECUtil::HashInfo hi;
      k = i->second.attrs.find(ECUtil::get_hinfo_key());
      if (k == i->second.attrs.end()) {
	shard_info.set_hinfo_missing();
	if (error)
	  shard_errorstream << ", ";
	error = true;
	shard_errorstream << "candidate had a missing hinfo key";
      } else {
	hk_bl.push_back(k->second);
	try {
	  auto bliter = hk_bl.cbegin();
	  decode(hi, bliter);
	} catch (...) {
	  // invalid snapset, probably corrupt
	  shard_info.set_hinfo_corrupted();
	  if (error)
	    shard_errorstream << ", ";
	  error = true;
	  shard_errorstream << "candidate had a corrupt hinfo";
	}
      }
    }

    // The object_info attr must be present and decodable to be a candidate.
    k = i->second.attrs.find(OI_ATTR);
    if (k == i->second.attrs.end()) {
      // no object info on object, probably corrupt
      shard_info.set_info_missing();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate had a missing info key";
      goto out;
    }
    bl.push_back(k->second);
    try {
      auto bliter = bl.cbegin();
      decode(oi, bliter);
    } catch (...) {
      // invalid object info, probably corrupt
      shard_info.set_info_corrupted();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate had a corrupt info";
      goto out;
    }

    // This is automatically corrected in PG::_repair_oinfo_oid()
    ceph_assert(oi.soid == obj);

    if (i->second.size != be_get_ondisk_size(oi.size)) {
      shard_info.set_obj_size_info_mismatch();
      if (error)
	shard_errorstream << ", ";
      error = true;
      shard_errorstream << "candidate size " << i->second.size << " info size "
			<< oi.size << " mismatch";
    }

    // digest_match will only be true if computed digests are the same
    if (auth_version != eversion_t()
	&& auth->second->objects[obj].digest_present
	&& i->second.digest_present
	&& auth->second->objects[obj].digest != i->second.digest) {
      digest_match = false;
      dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest
	       << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec
	       << dendl;
    }

    // Don't use this particular shard due to previous errors
    // XXX: For now we can't pick one shard for repair and another's object info or snapset
    if (shard_info.errors)
      goto out;

    // Prefer higher version; on a tie, prefer the copy with more digests.
    if (auth_version == eversion_t() || oi.version > auth_version ||
	(oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) {
      auth = j;
      *auth_oi = oi;
      auth_version = oi.version;
    }

out:
    if (error)
      errorstream << pgid.pgid << " shard " << l << " soid " << obj
		  << " : " << shard_errorstream.str() << "\n";
    // Keep scanning other shards
  }
  // NOTE(review): if no auth was found, auth == maps.end() and auth->first
  // below dereferences an end iterator; the expression is only evaluated
  // when debug level >= 10, but it is a latent bug — confirm upstream.
  dout(10) << __func__ << ": selecting osd " << auth->first
	   << " for obj " << obj
	   << " with oi " << *auth_oi
	   << dendl;
  return auth;
}
1005
1006 void PGBackend::be_compare_scrubmaps(
1007 const map<pg_shard_t,ScrubMap*> &maps,
1008 const set<hobject_t> &master_set,
1009 bool repair,
1010 map<hobject_t, set<pg_shard_t>> &missing,
1011 map<hobject_t, set<pg_shard_t>> &inconsistent,
1012 map<hobject_t, list<pg_shard_t>> &authoritative,
1013 map<hobject_t, pair<boost::optional<uint32_t>,
1014 boost::optional<uint32_t>>> &missing_digest,
1015 int &shallow_errors, int &deep_errors,
1016 Scrub::Store *store,
1017 const spg_t& pgid,
1018 const vector<int> &acting,
1019 ostream &errorstream)
1020 {
1021 utime_t now = ceph_clock_now();
1022
1023 // Check maps against master set and each other
1024 for (set<hobject_t>::const_iterator k = master_set.begin();
1025 k != master_set.end();
1026 ++k) {
1027 object_info_t auth_oi;
1028 map<pg_shard_t, shard_info_wrapper> shard_map;
1029
1030 inconsistent_obj_wrapper object_error{*k};
1031
1032 bool digest_match;
1033 map<pg_shard_t, ScrubMap *>::const_iterator auth =
1034 be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match,
1035 pgid, errorstream);
1036
1037 list<pg_shard_t> auth_list;
1038 set<pg_shard_t> object_errors;
1039 if (auth == maps.end()) {
1040 object_error.set_version(0);
1041 object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
1042 deep_errors, get_parent()->whoami_shard());
1043 if (object_error.has_deep_errors())
1044 ++deep_errors;
1045 else if (object_error.has_shallow_errors())
1046 ++shallow_errors;
1047 store->add_object_error(k->pool, object_error);
1048 errorstream << pgid.pgid << " soid " << *k
1049 << " : failed to pick suitable object info\n";
1050 continue;
1051 }
1052 object_error.set_version(auth_oi.user_version);
1053 ScrubMap::object& auth_object = auth->second->objects[*k];
1054 set<pg_shard_t> cur_missing;
1055 set<pg_shard_t> cur_inconsistent;
1056 bool fix_digest = false;
1057
1058 for (auto j = maps.cbegin(); j != maps.cend(); ++j) {
1059 if (j == auth)
1060 shard_map[auth->first].selected_oi = true;
1061 if (j->second->objects.count(*k)) {
1062 shard_map[j->first].set_object(j->second->objects[*k]);
1063 // Compare
1064 stringstream ss;
1065 bool found = be_compare_scrub_objects(auth->first,
1066 auth_object,
1067 auth_oi,
1068 j->second->objects[*k],
1069 shard_map[j->first],
1070 object_error,
1071 ss,
1072 k->has_snapset());
1073
1074 dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "")
1075 << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ")
1076 << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "")
1077 << dendl;
1078 // If all replicas match, but they don't match object_info we can
1079 // repair it by using missing_digest mechanism
1080 if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1
1081 && digest_match && shard_map[j->first].only_data_digest_mismatch_info()
1082 && auth_object.digest_present) {
1083 // Set in missing_digests
1084 fix_digest = true;
1085 // Clear the error
1086 shard_map[j->first].clear_data_digest_mismatch_info();
1087 errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n";
1088 }
1089 // Some errors might have already been set in be_select_auth_object()
1090 if (shard_map[j->first].errors != 0) {
1091 cur_inconsistent.insert(j->first);
1092 if (shard_map[j->first].has_deep_errors())
1093 ++deep_errors;
1094 else
1095 ++shallow_errors;
1096 // Only true if be_compare_scrub_objects() found errors and put something
1097 // in ss.
1098 if (found)
1099 errorstream << pgid << " shard " << j->first << " soid " << *k
1100 << " : " << ss.str() << "\n";
1101 } else if (found) {
1102 // Track possible shard to use as authoritative, if needed
1103 // There are errors, without identifying the shard
1104 object_errors.insert(j->first);
1105 errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n";
1106 } else {
	  // XXX: The auth shard can end up in auth_list here even though
	  // we don't actually know that it holds the "correct" data.
1109 auth_list.push_back(j->first);
1110 }
1111 } else {
1112 cur_missing.insert(j->first);
1113 shard_map[j->first].set_missing();
1114 shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
1115 // Can't have any other errors if there is no information available
1116 ++shallow_errors;
1117 errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n";
1118 }
1119 object_error.add_shard(j->first, shard_map[j->first]);
1120 }
1121
1122 if (auth_list.empty()) {
1123 if (object_errors.empty()) {
1124 errorstream << pgid.pgid << " soid " << *k
1125 << " : failed to pick suitable auth object\n";
1126 goto out;
1127 }
1128 // Object errors exist and nothing in auth_list
1129 // Prefer the auth shard otherwise take first from list.
1130 pg_shard_t shard;
1131 if (object_errors.count(auth->first)) {
1132 shard = auth->first;
1133 } else {
1134 shard = *(object_errors.begin());
1135 }
1136 auth_list.push_back(shard);
1137 object_errors.erase(shard);
1138 }
1139 // At this point auth_list is populated, so we add the object errors shards
1140 // as inconsistent.
1141 cur_inconsistent.insert(object_errors.begin(), object_errors.end());
1142 if (!cur_missing.empty()) {
1143 missing[*k] = cur_missing;
1144 }
1145 if (!cur_inconsistent.empty()) {
1146 inconsistent[*k] = cur_inconsistent;
1147 }
1148
1149 if (fix_digest) {
1150 boost::optional<uint32_t> data_digest, omap_digest;
1151 ceph_assert(auth_object.digest_present);
1152 data_digest = auth_object.digest;
1153 if (auth_object.omap_digest_present) {
1154 omap_digest = auth_object.omap_digest;
1155 }
1156 missing_digest[*k] = make_pair(data_digest, omap_digest);
1157 }
1158 if (!cur_inconsistent.empty() || !cur_missing.empty()) {
1159 authoritative[*k] = auth_list;
1160 } else if (!fix_digest && parent->get_pool().is_replicated()) {
1161 enum {
1162 NO = 0,
1163 MAYBE = 1,
1164 FORCE = 2,
1165 } update = NO;
1166
1167 if (auth_object.digest_present && !auth_oi.is_data_digest()) {
1168 dout(20) << __func__ << " missing data digest on " << *k << dendl;
1169 update = MAYBE;
1170 }
1171 if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) {
1172 dout(20) << __func__ << " missing omap digest on " << *k << dendl;
1173 update = MAYBE;
1174 }
1175
1176 // recorded digest != actual digest?
1177 if (auth_oi.is_data_digest() && auth_object.digest_present &&
1178 auth_oi.data_digest != auth_object.digest) {
1179 ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info());
1180 errorstream << pgid << " recorded data digest 0x"
1181 << std::hex << auth_oi.data_digest << " != on disk 0x"
1182 << auth_object.digest << std::dec << " on " << auth_oi.soid
1183 << "\n";
1184 if (repair)
1185 update = FORCE;
1186 }
1187 if (auth_oi.is_omap_digest() && auth_object.omap_digest_present &&
1188 auth_oi.omap_digest != auth_object.omap_digest) {
1189 ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info());
1190 errorstream << pgid << " recorded omap digest 0x"
1191 << std::hex << auth_oi.omap_digest << " != on disk 0x"
1192 << auth_object.omap_digest << std::dec
1193 << " on " << auth_oi.soid << "\n";
1194 if (repair)
1195 update = FORCE;
1196 }
1197
1198 if (update != NO) {
1199 utime_t age = now - auth_oi.local_mtime;
1200 if (update == FORCE ||
1201 age > cct->_conf->osd_deep_scrub_update_digest_min_age) {
1202 boost::optional<uint32_t> data_digest, omap_digest;
1203 if (auth_object.digest_present) {
1204 data_digest = auth_object.digest;
1205 dout(20) << __func__ << " will update data digest on " << *k << dendl;
1206 }
1207 if (auth_object.omap_digest_present) {
1208 omap_digest = auth_object.omap_digest;
1209 dout(20) << __func__ << " will update omap digest on " << *k << dendl;
1210 }
1211 missing_digest[*k] = make_pair(data_digest, omap_digest);
1212 } else {
1213 dout(20) << __func__ << " missing digest but age " << age
1214 << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age
1215 << " on " << *k << dendl;
1216 }
1217 }
1218 }
1219 out:
1220 if (object_error.has_deep_errors())
1221 ++deep_errors;
1222 else if (object_error.has_shallow_errors())
1223 ++shallow_errors;
1224 if (object_error.errors || object_error.union_shards.errors) {
1225 store->add_object_error(k->pool, object_error);
1226 }
1227 }
1228 }
1229
1230 void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps,
1231 const set<hobject_t> &master_set,
1232 omap_stat_t& omap_stats,
1233 ostream &warnstream) const
1234 {
1235 bool needs_omap_check = false;
1236 for (const auto& map : maps) {
1237 if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) {
1238 needs_omap_check = true;
1239 break;
1240 }
1241 }
1242
1243 if (!needs_omap_check) {
1244 return; // Nothing to do
1245 }
1246
1247 // Iterate through objects and update omap stats
1248 for (const auto& k : master_set) {
1249 for (const auto& map : maps) {
1250 if (map.first != get_parent()->primary_shard()) {
1251 // Only set omap stats for the primary
1252 continue;
1253 }
1254 auto it = map.second->objects.find(k);
1255 if (it == map.second->objects.end())
1256 continue;
1257 ScrubMap::object& obj = it->second;
1258 omap_stats.omap_bytes += obj.object_omap_bytes;
1259 omap_stats.omap_keys += obj.object_omap_keys;
1260 if (obj.large_omap_object_found) {
1261 omap_stats.large_omap_objects++;
1262 warnstream << "Large omap object found. Object: " << k << " Key count: "
1263 << obj.large_omap_object_key_count << " Size (bytes): "
1264 << obj.large_omap_object_value_size << '\n';
1265 break;
1266 }
1267 }
1268 }
1269 }