]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2013,2014 Inktank Storage, Inc. | |
7 | * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com> | |
8 | * | |
9 | * Author: Loic Dachary <loic@dachary.org> | |
10 | * | |
11 | * This is free software; you can redistribute it and/or | |
12 | * modify it under the terms of the GNU Lesser General Public | |
13 | * License version 2.1, as published by the Free Software | |
14 | * Foundation. See file COPYING. | |
15 | * | |
16 | */ | |
17 | ||
18 | ||
19 | #include "common/errno.h" | |
20 | #include "common/scrub_types.h" | |
21 | #include "ReplicatedBackend.h" | |
22 | #include "ScrubStore.h" | |
23 | #include "ECBackend.h" | |
24 | #include "PGBackend.h" | |
25 | #include "OSD.h" | |
26 | #include "erasure-code/ErasureCodePlugin.h" | |
27 | #include "OSDMap.h" | |
28 | #include "PGLog.h" | |
29 | #include "common/LogClient.h" | |
c07f9fc5 FG |
30 | #include "messages/MOSDPGRecoveryDelete.h" |
31 | #include "messages/MOSDPGRecoveryDeleteReply.h" | |
7c673cae FG |
32 | |
33 | #define dout_context cct | |
34 | #define dout_subsys ceph_subsys_osd | |
35 | #define DOUT_PREFIX_ARGS this | |
36 | #undef dout_prefix | |
37 | #define dout_prefix _prefix(_dout, this) | |
38 | static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) { | |
11fdf7f2 | 39 | return pgb->get_parent()->gen_dbg_prefix(*_dout); |
7c673cae FG |
40 | } |
41 | ||
c07f9fc5 FG |
42 | void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v, |
43 | RecoveryHandle *h) | |
44 | { | |
11fdf7f2 TL |
45 | ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0); |
46 | for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { | |
c07f9fc5 FG |
47 | if (shard == get_parent()->whoami_shard()) |
48 | continue; | |
49 | if (get_parent()->get_shard_missing(shard).is_missing(oid)) { | |
50 | dout(20) << __func__ << " will remove " << oid << " " << v << " from " | |
51 | << shard << dendl; | |
52 | h->deletes[shard].push_back(make_pair(oid, v)); | |
53 | get_parent()->begin_peer_recover(shard, oid); | |
54 | } | |
55 | } | |
56 | } | |
57 | ||
58 | void PGBackend::send_recovery_deletes(int prio, | |
59 | const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes) | |
60 | { | |
61 | epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch(); | |
62 | for (const auto& p : deletes) { | |
63 | const auto& shard = p.first; | |
64 | const auto& objects = p.second; | |
65 | ConnectionRef con = get_parent()->get_con_osd_cluster( | |
66 | shard.osd, | |
11fdf7f2 | 67 | get_osdmap_epoch()); |
c07f9fc5 FG |
68 | if (!con) |
69 | continue; | |
70 | auto it = objects.begin(); | |
71 | while (it != objects.end()) { | |
72 | uint64_t cost = 0; | |
73 | uint64_t deletes = 0; | |
74 | spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard); | |
75 | MOSDPGRecoveryDelete *msg = | |
76 | new MOSDPGRecoveryDelete(get_parent()->whoami_shard(), | |
77 | target_pg, | |
11fdf7f2 | 78 | get_osdmap_epoch(), |
c07f9fc5 FG |
79 | min_epoch); |
80 | msg->set_priority(prio); | |
81 | ||
82 | while (it != objects.end() && | |
83 | cost < cct->_conf->osd_max_push_cost && | |
84 | deletes < cct->_conf->osd_max_push_objects) { | |
85 | dout(20) << __func__ << ": sending recovery delete << " << it->first | |
86 | << " " << it->second << " to osd." << shard << dendl; | |
87 | msg->objects.push_back(*it); | |
88 | cost += cct->_conf->osd_push_per_object_cost; | |
89 | ++deletes; | |
90 | ++it; | |
91 | } | |
92 | ||
93 | msg->set_cost(cost); | |
94 | get_parent()->send_message_osd_cluster(msg, con); | |
95 | } | |
96 | } | |
97 | } | |
98 | ||
99 | bool PGBackend::handle_message(OpRequestRef op) | |
100 | { | |
101 | switch (op->get_req()->get_type()) { | |
102 | case MSG_OSD_PG_RECOVERY_DELETE: | |
103 | handle_recovery_delete(op); | |
104 | return true; | |
105 | ||
106 | case MSG_OSD_PG_RECOVERY_DELETE_REPLY: | |
107 | handle_recovery_delete_reply(op); | |
108 | return true; | |
109 | ||
110 | default: | |
111 | break; | |
112 | } | |
113 | ||
114 | return _handle_message(op); | |
115 | } | |
116 | ||
117 | void PGBackend::handle_recovery_delete(OpRequestRef op) | |
118 | { | |
9f95a23c | 119 | auto m = op->get_req<MOSDPGRecoveryDelete>(); |
11fdf7f2 | 120 | ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE); |
c07f9fc5 FG |
121 | dout(20) << __func__ << " " << op << dendl; |
122 | ||
123 | op->mark_started(); | |
124 | ||
125 | C_GatherBuilder gather(cct); | |
126 | for (const auto &p : m->objects) { | |
127 | get_parent()->remove_missing_object(p.first, p.second, gather.new_sub()); | |
128 | } | |
129 | ||
130 | MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply; | |
131 | reply->from = get_parent()->whoami_shard(); | |
132 | reply->set_priority(m->get_priority()); | |
133 | reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard); | |
134 | reply->map_epoch = m->map_epoch; | |
135 | reply->min_epoch = m->min_epoch; | |
136 | reply->objects = m->objects; | |
137 | ConnectionRef conn = m->get_connection(); | |
138 | ||
9f95a23c | 139 | gather.set_finisher(new LambdaContext( |
c07f9fc5 FG |
140 | [=](int r) { |
141 | if (r != -EAGAIN) { | |
142 | get_parent()->send_message_osd_cluster(reply, conn.get()); | |
b5b8bbf5 FG |
143 | } else { |
144 | reply->put(); | |
c07f9fc5 FG |
145 | } |
146 | })); | |
147 | gather.activate(); | |
148 | } | |
149 | ||
150 | void PGBackend::handle_recovery_delete_reply(OpRequestRef op) | |
151 | { | |
9f95a23c | 152 | auto m = op->get_req<MOSDPGRecoveryDeleteReply>(); |
11fdf7f2 | 153 | ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY); |
c07f9fc5 FG |
154 | dout(20) << __func__ << " " << op << dendl; |
155 | ||
156 | for (const auto &p : m->objects) { | |
157 | ObjectRecoveryInfo recovery_info; | |
158 | hobject_t oid = p.first; | |
159 | recovery_info.version = p.second; | |
160 | get_parent()->on_peer_recover(m->from, oid, recovery_info); | |
161 | bool peers_recovered = true; | |
11fdf7f2 | 162 | for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) { |
c07f9fc5 FG |
163 | if (shard == get_parent()->whoami_shard()) |
164 | continue; | |
165 | if (get_parent()->get_shard_missing(shard).is_missing(oid)) { | |
166 | dout(20) << __func__ << " " << oid << " still missing on at least " | |
167 | << shard << dendl; | |
168 | peers_recovered = false; | |
169 | break; | |
170 | } | |
171 | } | |
172 | if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) { | |
173 | dout(20) << __func__ << " completed recovery, local_missing = " | |
174 | << get_parent()->get_local_missing() << dendl; | |
175 | object_stat_sum_t stat_diff; | |
176 | stat_diff.num_objects_recovered = 1; | |
177 | get_parent()->on_global_recover(p.first, stat_diff, true); | |
178 | } | |
179 | } | |
180 | } | |
181 | ||
7c673cae FG |
182 | void PGBackend::rollback( |
183 | const pg_log_entry_t &entry, | |
184 | ObjectStore::Transaction *t) | |
185 | { | |
186 | ||
187 | struct RollbackVisitor : public ObjectModDesc::Visitor { | |
188 | const hobject_t &hoid; | |
189 | PGBackend *pg; | |
190 | ObjectStore::Transaction t; | |
191 | RollbackVisitor( | |
192 | const hobject_t &hoid, | |
193 | PGBackend *pg) : hoid(hoid), pg(pg) {} | |
194 | void append(uint64_t old_size) override { | |
195 | ObjectStore::Transaction temp; | |
196 | pg->rollback_append(hoid, old_size, &temp); | |
197 | temp.append(t); | |
198 | temp.swap(t); | |
199 | } | |
9f95a23c | 200 | void setattrs(map<string, std::optional<bufferlist> > &attrs) override { |
7c673cae FG |
201 | ObjectStore::Transaction temp; |
202 | pg->rollback_setattrs(hoid, attrs, &temp); | |
203 | temp.append(t); | |
204 | temp.swap(t); | |
205 | } | |
206 | void rmobject(version_t old_version) override { | |
207 | ObjectStore::Transaction temp; | |
208 | pg->rollback_stash(hoid, old_version, &temp); | |
209 | temp.append(t); | |
210 | temp.swap(t); | |
211 | } | |
212 | void try_rmobject(version_t old_version) override { | |
213 | ObjectStore::Transaction temp; | |
214 | pg->rollback_try_stash(hoid, old_version, &temp); | |
215 | temp.append(t); | |
216 | temp.swap(t); | |
217 | } | |
218 | void create() override { | |
219 | ObjectStore::Transaction temp; | |
220 | pg->rollback_create(hoid, &temp); | |
221 | temp.append(t); | |
222 | temp.swap(t); | |
223 | } | |
224 | void update_snaps(const set<snapid_t> &snaps) override { | |
225 | ObjectStore::Transaction temp; | |
226 | pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp); | |
227 | temp.append(t); | |
228 | temp.swap(t); | |
229 | } | |
230 | void rollback_extents( | |
231 | version_t gen, | |
232 | const vector<pair<uint64_t, uint64_t> > &extents) override { | |
233 | ObjectStore::Transaction temp; | |
234 | pg->rollback_extents(gen, extents, hoid, &temp); | |
235 | temp.append(t); | |
236 | temp.swap(t); | |
237 | } | |
238 | }; | |
239 | ||
11fdf7f2 | 240 | ceph_assert(entry.mod_desc.can_rollback()); |
7c673cae FG |
241 | RollbackVisitor vis(entry.soid, this); |
242 | entry.mod_desc.visit(&vis); | |
243 | t->append(vis.t); | |
244 | } | |
245 | ||
246 | struct Trimmer : public ObjectModDesc::Visitor { | |
247 | const hobject_t &soid; | |
248 | PGBackend *pg; | |
249 | ObjectStore::Transaction *t; | |
250 | Trimmer( | |
251 | const hobject_t &soid, | |
252 | PGBackend *pg, | |
253 | ObjectStore::Transaction *t) | |
254 | : soid(soid), pg(pg), t(t) {} | |
255 | void rmobject(version_t old_version) override { | |
256 | pg->trim_rollback_object( | |
257 | soid, | |
258 | old_version, | |
259 | t); | |
260 | } | |
261 | // try_rmobject defaults to rmobject | |
262 | void rollback_extents( | |
263 | version_t gen, | |
264 | const vector<pair<uint64_t, uint64_t> > &extents) override { | |
265 | pg->trim_rollback_object( | |
266 | soid, | |
267 | gen, | |
268 | t); | |
269 | } | |
270 | }; | |
271 | ||
272 | void PGBackend::rollforward( | |
273 | const pg_log_entry_t &entry, | |
274 | ObjectStore::Transaction *t) | |
275 | { | |
276 | auto dpp = get_parent()->get_dpp(); | |
277 | ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl; | |
278 | if (!entry.can_rollback()) | |
279 | return; | |
280 | Trimmer trimmer(entry.soid, this, t); | |
281 | entry.mod_desc.visit(&trimmer); | |
282 | } | |
283 | ||
284 | void PGBackend::trim( | |
285 | const pg_log_entry_t &entry, | |
286 | ObjectStore::Transaction *t) | |
287 | { | |
288 | if (!entry.can_rollback()) | |
289 | return; | |
290 | Trimmer trimmer(entry.soid, this, t); | |
291 | entry.mod_desc.visit(&trimmer); | |
292 | } | |
293 | ||
294 | void PGBackend::try_stash( | |
295 | const hobject_t &hoid, | |
296 | version_t v, | |
297 | ObjectStore::Transaction *t) | |
298 | { | |
299 | t->try_rename( | |
300 | coll, | |
301 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
302 | ghobject_t(hoid, v, get_parent()->whoami_shard().shard)); | |
303 | } | |
304 | ||
305 | void PGBackend::remove( | |
306 | const hobject_t &hoid, | |
307 | ObjectStore::Transaction *t) { | |
11fdf7f2 | 308 | ceph_assert(!hoid.is_temp()); |
7c673cae FG |
309 | t->remove( |
310 | coll, | |
311 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
312 | get_parent()->pgb_clear_object_snap_mapping(hoid, t); | |
313 | } | |
314 | ||
315 | void PGBackend::on_change_cleanup(ObjectStore::Transaction *t) | |
316 | { | |
317 | dout(10) << __func__ << dendl; | |
318 | // clear temp | |
319 | for (set<hobject_t>::iterator i = temp_contents.begin(); | |
320 | i != temp_contents.end(); | |
321 | ++i) { | |
322 | dout(10) << __func__ << ": Removing oid " | |
323 | << *i << " from the temp collection" << dendl; | |
324 | t->remove( | |
325 | coll, | |
326 | ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
327 | } | |
328 | temp_contents.clear(); | |
329 | } | |
330 | ||
331 | int PGBackend::objects_list_partial( | |
332 | const hobject_t &begin, | |
333 | int min, | |
334 | int max, | |
335 | vector<hobject_t> *ls, | |
336 | hobject_t *next) | |
337 | { | |
11fdf7f2 | 338 | ceph_assert(ls); |
7c673cae FG |
339 | // Starts with the smallest generation to make sure the result list |
340 | // has the marker object (it might have multiple generations | |
341 | // though, which would be filtered). | |
342 | ghobject_t _next; | |
343 | if (!begin.is_min()) | |
344 | _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard); | |
345 | ls->reserve(max); | |
346 | int r = 0; | |
347 | ||
348 | if (min > max) | |
349 | min = max; | |
350 | ||
351 | while (!_next.is_max() && ls->size() < (unsigned)min) { | |
352 | vector<ghobject_t> objects; | |
f91f0fd5 TL |
353 | if (HAVE_FEATURE(parent->min_upacting_features(), |
354 | OSD_FIXED_COLLECTION_LIST)) { | |
355 | r = store->collection_list( | |
356 | ch, | |
357 | _next, | |
358 | ghobject_t::get_max(), | |
359 | max - ls->size(), | |
360 | &objects, | |
361 | &_next); | |
362 | } else { | |
363 | r = store->collection_list_legacy( | |
364 | ch, | |
365 | _next, | |
366 | ghobject_t::get_max(), | |
367 | max - ls->size(), | |
368 | &objects, | |
369 | &_next); | |
370 | } | |
7c673cae FG |
371 | if (r != 0) { |
372 | derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl; | |
373 | break; | |
374 | } | |
375 | for (vector<ghobject_t>::iterator i = objects.begin(); | |
376 | i != objects.end(); | |
377 | ++i) { | |
378 | if (i->is_pgmeta() || i->hobj.is_temp()) { | |
379 | continue; | |
380 | } | |
381 | if (i->is_no_gen()) { | |
382 | ls->push_back(i->hobj); | |
383 | } | |
384 | } | |
385 | } | |
386 | if (r == 0) | |
387 | *next = _next.hobj; | |
388 | return r; | |
389 | } | |
390 | ||
391 | int PGBackend::objects_list_range( | |
392 | const hobject_t &start, | |
393 | const hobject_t &end, | |
7c673cae FG |
394 | vector<hobject_t> *ls, |
395 | vector<ghobject_t> *gen_obs) | |
396 | { | |
11fdf7f2 | 397 | ceph_assert(ls); |
7c673cae | 398 | vector<ghobject_t> objects; |
f91f0fd5 TL |
399 | int r; |
400 | if (HAVE_FEATURE(parent->min_upacting_features(), | |
401 | OSD_FIXED_COLLECTION_LIST)) { | |
402 | r = store->collection_list( | |
403 | ch, | |
404 | ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
405 | ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
406 | INT_MAX, | |
407 | &objects, | |
408 | NULL); | |
409 | } else { | |
410 | r = store->collection_list_legacy( | |
411 | ch, | |
412 | ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
413 | ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
414 | INT_MAX, | |
415 | &objects, | |
416 | NULL); | |
417 | } | |
7c673cae FG |
418 | ls->reserve(objects.size()); |
419 | for (vector<ghobject_t>::iterator i = objects.begin(); | |
420 | i != objects.end(); | |
421 | ++i) { | |
422 | if (i->is_pgmeta() || i->hobj.is_temp()) { | |
423 | continue; | |
424 | } | |
425 | if (i->is_no_gen()) { | |
426 | ls->push_back(i->hobj); | |
427 | } else if (gen_obs) { | |
428 | gen_obs->push_back(*i); | |
429 | } | |
430 | } | |
431 | return r; | |
432 | } | |
433 | ||
434 | int PGBackend::objects_get_attr( | |
435 | const hobject_t &hoid, | |
436 | const string &attr, | |
437 | bufferlist *out) | |
438 | { | |
439 | bufferptr bp; | |
440 | int r = store->getattr( | |
441 | ch, | |
442 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
443 | attr.c_str(), | |
444 | bp); | |
445 | if (r >= 0 && out) { | |
446 | out->clear(); | |
447 | out->push_back(std::move(bp)); | |
448 | } | |
449 | return r; | |
450 | } | |
451 | ||
452 | int PGBackend::objects_get_attrs( | |
453 | const hobject_t &hoid, | |
454 | map<string, bufferlist> *out) | |
455 | { | |
456 | return store->getattrs( | |
457 | ch, | |
458 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
459 | *out); | |
460 | } | |
461 | ||
462 | void PGBackend::rollback_setattrs( | |
463 | const hobject_t &hoid, | |
9f95a23c | 464 | map<string, std::optional<bufferlist> > &old_attrs, |
7c673cae FG |
465 | ObjectStore::Transaction *t) { |
466 | map<string, bufferlist> to_set; | |
11fdf7f2 | 467 | ceph_assert(!hoid.is_temp()); |
9f95a23c | 468 | for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin(); |
7c673cae FG |
469 | i != old_attrs.end(); |
470 | ++i) { | |
471 | if (i->second) { | |
9f95a23c | 472 | to_set[i->first] = *(i->second); |
7c673cae FG |
473 | } else { |
474 | t->rmattr( | |
475 | coll, | |
476 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
477 | i->first); | |
478 | } | |
479 | } | |
480 | t->setattrs( | |
481 | coll, | |
482 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
483 | to_set); | |
484 | } | |
485 | ||
486 | void PGBackend::rollback_append( | |
487 | const hobject_t &hoid, | |
488 | uint64_t old_size, | |
489 | ObjectStore::Transaction *t) { | |
11fdf7f2 | 490 | ceph_assert(!hoid.is_temp()); |
7c673cae FG |
491 | t->truncate( |
492 | coll, | |
493 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
494 | old_size); | |
495 | } | |
496 | ||
497 | void PGBackend::rollback_stash( | |
498 | const hobject_t &hoid, | |
499 | version_t old_version, | |
500 | ObjectStore::Transaction *t) { | |
11fdf7f2 | 501 | ceph_assert(!hoid.is_temp()); |
7c673cae FG |
502 | t->remove( |
503 | coll, | |
504 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
505 | t->collection_move_rename( | |
506 | coll, | |
507 | ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), | |
508 | coll, | |
509 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
510 | } | |
511 | ||
512 | void PGBackend::rollback_try_stash( | |
513 | const hobject_t &hoid, | |
514 | version_t old_version, | |
515 | ObjectStore::Transaction *t) { | |
11fdf7f2 | 516 | ceph_assert(!hoid.is_temp()); |
7c673cae FG |
517 | t->remove( |
518 | coll, | |
519 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
520 | t->try_rename( | |
521 | coll, | |
522 | ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard), | |
523 | ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard)); | |
524 | } | |
525 | ||
526 | void PGBackend::rollback_extents( | |
527 | version_t gen, | |
528 | const vector<pair<uint64_t, uint64_t> > &extents, | |
529 | const hobject_t &hoid, | |
530 | ObjectStore::Transaction *t) { | |
531 | auto shard = get_parent()->whoami_shard().shard; | |
532 | for (auto &&extent: extents) { | |
533 | t->clone_range( | |
534 | coll, | |
535 | ghobject_t(hoid, gen, shard), | |
536 | ghobject_t(hoid, ghobject_t::NO_GEN, shard), | |
537 | extent.first, | |
538 | extent.second, | |
539 | extent.first); | |
540 | } | |
541 | t->remove( | |
542 | coll, | |
543 | ghobject_t(hoid, gen, shard)); | |
544 | } | |
545 | ||
546 | void PGBackend::trim_rollback_object( | |
547 | const hobject_t &hoid, | |
548 | version_t old_version, | |
549 | ObjectStore::Transaction *t) { | |
11fdf7f2 | 550 | ceph_assert(!hoid.is_temp()); |
7c673cae FG |
551 | t->remove( |
552 | coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard)); | |
553 | } | |
554 | ||
555 | PGBackend *PGBackend::build_pg_backend( | |
556 | const pg_pool_t &pool, | |
11fdf7f2 | 557 | const map<string,string>& profile, |
7c673cae FG |
558 | Listener *l, |
559 | coll_t coll, | |
560 | ObjectStore::CollectionHandle &ch, | |
561 | ObjectStore *store, | |
562 | CephContext *cct) | |
563 | { | |
11fdf7f2 | 564 | ErasureCodeProfile ec_profile = profile; |
7c673cae FG |
565 | switch (pool.type) { |
566 | case pg_pool_t::TYPE_REPLICATED: { | |
567 | return new ReplicatedBackend(l, coll, ch, store, cct); | |
568 | } | |
569 | case pg_pool_t::TYPE_ERASURE: { | |
570 | ErasureCodeInterfaceRef ec_impl; | |
7c673cae FG |
571 | stringstream ss; |
572 | ceph::ErasureCodePluginRegistry::instance().factory( | |
573 | profile.find("plugin")->second, | |
11fdf7f2 TL |
574 | cct->_conf.get_val<std::string>("erasure_code_dir"), |
575 | ec_profile, | |
7c673cae FG |
576 | &ec_impl, |
577 | &ss); | |
11fdf7f2 | 578 | ceph_assert(ec_impl); |
7c673cae FG |
579 | return new ECBackend( |
580 | l, | |
581 | coll, | |
582 | ch, | |
583 | store, | |
584 | cct, | |
585 | ec_impl, | |
586 | pool.stripe_width); | |
587 | } | |
588 | default: | |
589 | ceph_abort(); | |
590 | return NULL; | |
591 | } | |
592 | } | |
593 | ||
28e407b8 AA |
594 | int PGBackend::be_scan_list( |
595 | ScrubMap &map, | |
596 | ScrubMapBuilder &pos) | |
7c673cae | 597 | { |
28e407b8 | 598 | dout(10) << __func__ << " " << pos << dendl; |
11fdf7f2 TL |
599 | ceph_assert(!pos.done()); |
600 | ceph_assert(pos.pos < pos.ls.size()); | |
28e407b8 AA |
601 | hobject_t& poid = pos.ls[pos.pos]; |
602 | ||
603 | struct stat st; | |
604 | int r = store->stat( | |
605 | ch, | |
606 | ghobject_t( | |
607 | poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
608 | &st, | |
609 | true); | |
610 | if (r == 0) { | |
611 | ScrubMap::object &o = map.objects[poid]; | |
612 | o.size = st.st_size; | |
11fdf7f2 | 613 | ceph_assert(!o.negative); |
28e407b8 | 614 | store->getattrs( |
7c673cae FG |
615 | ch, |
616 | ghobject_t( | |
617 | poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard), | |
28e407b8 | 618 | o.attrs); |
7c673cae | 619 | |
28e407b8 AA |
620 | if (pos.deep) { |
621 | r = be_deep_scrub(poid, map, pos, o); | |
7c673cae | 622 | } |
28e407b8 AA |
623 | dout(25) << __func__ << " " << poid << dendl; |
624 | } else if (r == -ENOENT) { | |
625 | dout(25) << __func__ << " " << poid << " got " << r | |
626 | << ", skipping" << dendl; | |
627 | } else if (r == -EIO) { | |
628 | dout(25) << __func__ << " " << poid << " got " << r | |
629 | << ", stat_error" << dendl; | |
630 | ScrubMap::object &o = map.objects[poid]; | |
631 | o.stat_error = true; | |
632 | } else { | |
633 | derr << __func__ << " got: " << cpp_strerror(r) << dendl; | |
634 | ceph_abort(); | |
635 | } | |
636 | if (r == -EINPROGRESS) { | |
637 | return -EINPROGRESS; | |
7c673cae | 638 | } |
28e407b8 AA |
639 | pos.next_object(); |
640 | return 0; | |
7c673cae FG |
641 | } |
642 | ||
643 | bool PGBackend::be_compare_scrub_objects( | |
644 | pg_shard_t auth_shard, | |
645 | const ScrubMap::object &auth, | |
646 | const object_info_t& auth_oi, | |
647 | const ScrubMap::object &candidate, | |
648 | shard_info_wrapper &shard_result, | |
649 | inconsistent_obj_wrapper &obj_result, | |
91327a77 AA |
650 | ostream &errorstream, |
651 | bool has_snapset) | |
7c673cae FG |
652 | { |
653 | enum { CLEAN, FOUND_ERROR } error = CLEAN; | |
7c673cae FG |
654 | if (auth.digest_present && candidate.digest_present) { |
655 | if (auth.digest != candidate.digest) { | |
656 | if (error != CLEAN) | |
657 | errorstream << ", "; | |
658 | error = FOUND_ERROR; | |
659 | errorstream << "data_digest 0x" << std::hex << candidate.digest | |
660 | << " != data_digest 0x" << auth.digest << std::dec | |
661 | << " from shard " << auth_shard; | |
662 | obj_result.set_data_digest_mismatch(); | |
663 | } | |
664 | } | |
665 | if (auth.omap_digest_present && candidate.omap_digest_present) { | |
666 | if (auth.omap_digest != candidate.omap_digest) { | |
667 | if (error != CLEAN) | |
668 | errorstream << ", "; | |
669 | error = FOUND_ERROR; | |
670 | errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest | |
671 | << " != omap_digest 0x" << auth.omap_digest << std::dec | |
672 | << " from shard " << auth_shard; | |
673 | obj_result.set_omap_digest_mismatch(); | |
674 | } | |
675 | } | |
676 | if (parent->get_pool().is_replicated()) { | |
677 | if (auth_oi.is_data_digest() && candidate.digest_present) { | |
678 | if (auth_oi.data_digest != candidate.digest) { | |
679 | if (error != CLEAN) | |
680 | errorstream << ", "; | |
681 | error = FOUND_ERROR; | |
682 | errorstream << "data_digest 0x" << std::hex << candidate.digest | |
683 | << " != data_digest 0x" << auth_oi.data_digest << std::dec | |
684 | << " from auth oi " << auth_oi; | |
94b18763 | 685 | shard_result.set_data_digest_mismatch_info(); |
7c673cae FG |
686 | } |
687 | } | |
688 | if (auth_oi.is_omap_digest() && candidate.omap_digest_present) { | |
689 | if (auth_oi.omap_digest != candidate.omap_digest) { | |
690 | if (error != CLEAN) | |
691 | errorstream << ", "; | |
692 | error = FOUND_ERROR; | |
693 | errorstream << "omap_digest 0x" << std::hex << candidate.omap_digest | |
694 | << " != omap_digest 0x" << auth_oi.omap_digest << std::dec | |
695 | << " from auth oi " << auth_oi; | |
94b18763 | 696 | shard_result.set_omap_digest_mismatch_info(); |
7c673cae FG |
697 | } |
698 | } | |
699 | } | |
700 | if (candidate.stat_error) | |
701 | return error == FOUND_ERROR; | |
91327a77 AA |
702 | if (!shard_result.has_info_missing() |
703 | && !shard_result.has_info_corrupted()) { | |
704 | bufferlist can_bl, auth_bl; | |
705 | auto can_attr = candidate.attrs.find(OI_ATTR); | |
706 | auto auth_attr = auth.attrs.find(OI_ATTR); | |
707 | ||
11fdf7f2 TL |
708 | ceph_assert(auth_attr != auth.attrs.end()); |
709 | ceph_assert(can_attr != candidate.attrs.end()); | |
91327a77 AA |
710 | |
711 | can_bl.push_back(can_attr->second); | |
712 | auth_bl.push_back(auth_attr->second); | |
713 | if (!can_bl.contents_equal(auth_bl)) { | |
714 | if (error != CLEAN) | |
715 | errorstream << ", "; | |
716 | error = FOUND_ERROR; | |
717 | obj_result.set_object_info_inconsistency(); | |
718 | errorstream << "object info inconsistent "; | |
719 | } | |
720 | } | |
721 | if (has_snapset) { | |
722 | if (!shard_result.has_snapset_missing() | |
723 | && !shard_result.has_snapset_corrupted()) { | |
724 | bufferlist can_bl, auth_bl; | |
725 | auto can_attr = candidate.attrs.find(SS_ATTR); | |
726 | auto auth_attr = auth.attrs.find(SS_ATTR); | |
727 | ||
11fdf7f2 TL |
728 | ceph_assert(auth_attr != auth.attrs.end()); |
729 | ceph_assert(can_attr != candidate.attrs.end()); | |
91327a77 AA |
730 | |
731 | can_bl.push_back(can_attr->second); | |
732 | auth_bl.push_back(auth_attr->second); | |
733 | if (!can_bl.contents_equal(auth_bl)) { | |
734 | if (error != CLEAN) | |
735 | errorstream << ", "; | |
736 | error = FOUND_ERROR; | |
737 | obj_result.set_snapset_inconsistency(); | |
738 | errorstream << "snapset inconsistent "; | |
739 | } | |
740 | } | |
741 | } | |
742 | if (parent->get_pool().is_erasure()) { | |
743 | if (!shard_result.has_hinfo_missing() | |
744 | && !shard_result.has_hinfo_corrupted()) { | |
745 | bufferlist can_bl, auth_bl; | |
746 | auto can_hi = candidate.attrs.find(ECUtil::get_hinfo_key()); | |
747 | auto auth_hi = auth.attrs.find(ECUtil::get_hinfo_key()); | |
748 | ||
11fdf7f2 TL |
749 | ceph_assert(auth_hi != auth.attrs.end()); |
750 | ceph_assert(can_hi != candidate.attrs.end()); | |
91327a77 AA |
751 | |
752 | can_bl.push_back(can_hi->second); | |
753 | auth_bl.push_back(auth_hi->second); | |
754 | if (!can_bl.contents_equal(auth_bl)) { | |
755 | if (error != CLEAN) | |
756 | errorstream << ", "; | |
757 | error = FOUND_ERROR; | |
758 | obj_result.set_hinfo_inconsistency(); | |
759 | errorstream << "hinfo inconsistent "; | |
760 | } | |
761 | } | |
762 | } | |
7c673cae FG |
763 | uint64_t oi_size = be_get_ondisk_size(auth_oi.size); |
764 | if (oi_size != candidate.size) { | |
765 | if (error != CLEAN) | |
766 | errorstream << ", "; | |
767 | error = FOUND_ERROR; | |
768 | errorstream << "size " << candidate.size | |
769 | << " != size " << oi_size | |
770 | << " from auth oi " << auth_oi; | |
94b18763 | 771 | shard_result.set_size_mismatch_info(); |
7c673cae FG |
772 | } |
773 | if (auth.size != candidate.size) { | |
774 | if (error != CLEAN) | |
775 | errorstream << ", "; | |
776 | error = FOUND_ERROR; | |
777 | errorstream << "size " << candidate.size | |
778 | << " != size " << auth.size | |
779 | << " from shard " << auth_shard; | |
780 | obj_result.set_size_mismatch(); | |
781 | } | |
eafe8130 TL |
782 | // If the replica is too large and we didn't already count it for this object |
783 | // | |
784 | if (candidate.size > cct->_conf->osd_max_object_size | |
785 | && !obj_result.has_size_too_large()) { | |
786 | if (error != CLEAN) | |
787 | errorstream << ", "; | |
788 | error = FOUND_ERROR; | |
789 | errorstream << "size " << candidate.size | |
790 | << " > " << cct->_conf->osd_max_object_size | |
791 | << " is too large"; | |
792 | obj_result.set_size_too_large(); | |
793 | } | |
7c673cae FG |
794 | for (map<string,bufferptr>::const_iterator i = auth.attrs.begin(); |
795 | i != auth.attrs.end(); | |
796 | ++i) { | |
b5b8bbf5 | 797 | // We check system keys seperately |
94b18763 | 798 | if (i->first == OI_ATTR || i->first[0] != '_') |
b5b8bbf5 | 799 | continue; |
7c673cae FG |
800 | if (!candidate.attrs.count(i->first)) { |
801 | if (error != CLEAN) | |
802 | errorstream << ", "; | |
803 | error = FOUND_ERROR; | |
804 | errorstream << "attr name mismatch '" << i->first << "'"; | |
805 | obj_result.set_attr_name_mismatch(); | |
806 | } else if (candidate.attrs.find(i->first)->second.cmp(i->second)) { | |
807 | if (error != CLEAN) | |
808 | errorstream << ", "; | |
809 | error = FOUND_ERROR; | |
810 | errorstream << "attr value mismatch '" << i->first << "'"; | |
811 | obj_result.set_attr_value_mismatch(); | |
812 | } | |
813 | } | |
814 | for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin(); | |
815 | i != candidate.attrs.end(); | |
816 | ++i) { | |
b5b8bbf5 | 817 | // We check system keys seperately |
94b18763 | 818 | if (i->first == OI_ATTR || i->first[0] != '_') |
b5b8bbf5 | 819 | continue; |
7c673cae FG |
820 | if (!auth.attrs.count(i->first)) { |
821 | if (error != CLEAN) | |
822 | errorstream << ", "; | |
823 | error = FOUND_ERROR; | |
824 | errorstream << "attr name mismatch '" << i->first << "'"; | |
825 | obj_result.set_attr_name_mismatch(); | |
826 | } | |
827 | } | |
828 | return error == FOUND_ERROR; | |
829 | } | |
830 | ||
11fdf7f2 | 831 | static int dcount(const object_info_t &oi) |
7c673cae FG |
832 | { |
833 | int count = 0; | |
834 | if (oi.is_data_digest()) | |
835 | count++; | |
836 | if (oi.is_omap_digest()) | |
837 | count++; | |
838 | return count; | |
839 | } | |
840 | ||
841 | map<pg_shard_t, ScrubMap *>::const_iterator | |
842 | PGBackend::be_select_auth_object( | |
843 | const hobject_t &obj, | |
844 | const map<pg_shard_t,ScrubMap*> &maps, | |
845 | object_info_t *auth_oi, | |
846 | map<pg_shard_t, shard_info_wrapper> &shard_map, | |
91327a77 AA |
847 | bool &digest_match, |
848 | spg_t pgid, | |
849 | ostream &errorstream) | |
7c673cae FG |
850 | { |
851 | eversion_t auth_version; | |
7c673cae | 852 | |
b32b8144 | 853 | // Create list of shards with primary first so it will be auth copy all |
31f18b77 FG |
854 | // other things being equal. |
855 | list<pg_shard_t> shards; | |
7c673cae FG |
856 | for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin(); |
857 | j != maps.end(); | |
858 | ++j) { | |
31f18b77 FG |
859 | if (j->first == get_parent()->whoami_shard()) |
860 | continue; | |
861 | shards.push_back(j->first); | |
862 | } | |
b32b8144 | 863 | shards.push_front(get_parent()->whoami_shard()); |
31f18b77 FG |
864 | |
865 | map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end(); | |
1adf2230 | 866 | digest_match = true; |
31f18b77 | 867 | for (auto &l : shards) { |
91327a77 AA |
868 | ostringstream shard_errorstream; |
869 | bool error = false; | |
31f18b77 | 870 | map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l); |
7c673cae FG |
871 | map<hobject_t, ScrubMap::object>::iterator i = |
872 | j->second->objects.find(obj); | |
873 | if (i == j->second->objects.end()) { | |
874 | continue; | |
875 | } | |
7c673cae | 876 | auto& shard_info = shard_map[j->first]; |
b5b8bbf5 FG |
877 | if (j->first == get_parent()->whoami_shard()) |
878 | shard_info.primary = true; | |
7c673cae FG |
879 | if (i->second.read_error) { |
880 | shard_info.set_read_error(); | |
91327a77 AA |
881 | if (error) |
882 | shard_errorstream << ", "; | |
883 | error = true; | |
884 | shard_errorstream << "candidate had a read error"; | |
7c673cae FG |
885 | } |
886 | if (i->second.ec_hash_mismatch) { | |
887 | shard_info.set_ec_hash_mismatch(); | |
91327a77 AA |
888 | if (error) |
889 | shard_errorstream << ", "; | |
890 | error = true; | |
891 | shard_errorstream << "candidate had an ec hash mismatch"; | |
7c673cae FG |
892 | } |
893 | if (i->second.ec_size_mismatch) { | |
894 | shard_info.set_ec_size_mismatch(); | |
91327a77 AA |
895 | if (error) |
896 | shard_errorstream << ", "; | |
897 | error = true; | |
898 | shard_errorstream << "candidate had an ec size mismatch"; | |
7c673cae FG |
899 | } |
900 | ||
901 | object_info_t oi; | |
902 | bufferlist bl; | |
903 | map<string, bufferptr>::iterator k; | |
31f18b77 | 904 | SnapSet ss; |
94b18763 | 905 | bufferlist ss_bl, hk_bl; |
7c673cae FG |
906 | |
907 | if (i->second.stat_error) { | |
908 | shard_info.set_stat_error(); | |
91327a77 AA |
909 | if (error) |
910 | shard_errorstream << ", "; | |
911 | error = true; | |
912 | shard_errorstream << "candidate had a stat error"; | |
7c673cae FG |
913 | // With stat_error no further checking |
914 | // We don't need to also see a missing_object_info_attr | |
915 | goto out; | |
916 | } | |
917 | ||
b5b8bbf5 | 918 | // We won't pick an auth copy if the snapset is missing or won't decode. |
11fdf7f2 TL |
919 | ceph_assert(!obj.is_snapdir()); |
920 | if (obj.is_head()) { | |
b5b8bbf5 FG |
921 | k = i->second.attrs.find(SS_ATTR); |
922 | if (k == i->second.attrs.end()) { | |
94b18763 | 923 | shard_info.set_snapset_missing(); |
91327a77 AA |
924 | if (error) |
925 | shard_errorstream << ", "; | |
926 | error = true; | |
927 | shard_errorstream << "candidate had a missing snapset key"; | |
b5b8bbf5 FG |
928 | } else { |
929 | ss_bl.push_back(k->second); | |
930 | try { | |
11fdf7f2 TL |
931 | auto bliter = ss_bl.cbegin(); |
932 | decode(ss, bliter); | |
b5b8bbf5 FG |
933 | } catch (...) { |
934 | // invalid snapset, probably corrupt | |
94b18763 | 935 | shard_info.set_snapset_corrupted(); |
91327a77 AA |
936 | if (error) |
937 | shard_errorstream << ", "; | |
938 | error = true; | |
939 | shard_errorstream << "candidate had a corrupt snapset"; | |
94b18763 FG |
940 | } |
941 | } | |
942 | } | |
943 | ||
944 | if (parent->get_pool().is_erasure()) { | |
945 | ECUtil::HashInfo hi; | |
946 | k = i->second.attrs.find(ECUtil::get_hinfo_key()); | |
947 | if (k == i->second.attrs.end()) { | |
948 | shard_info.set_hinfo_missing(); | |
91327a77 AA |
949 | if (error) |
950 | shard_errorstream << ", "; | |
951 | error = true; | |
952 | shard_errorstream << "candidate had a missing hinfo key"; | |
94b18763 FG |
953 | } else { |
954 | hk_bl.push_back(k->second); | |
955 | try { | |
11fdf7f2 | 956 | auto bliter = hk_bl.cbegin(); |
94b18763 | 957 | decode(hi, bliter); |
94b18763 FG |
958 | } catch (...) { |
959 | // invalid snapset, probably corrupt | |
960 | shard_info.set_hinfo_corrupted(); | |
91327a77 AA |
961 | if (error) |
962 | shard_errorstream << ", "; | |
963 | error = true; | |
964 | shard_errorstream << "candidate had a corrupt hinfo"; | |
b5b8bbf5 FG |
965 | } |
966 | } | |
967 | } | |
968 | ||
7c673cae FG |
969 | k = i->second.attrs.find(OI_ATTR); |
970 | if (k == i->second.attrs.end()) { | |
971 | // no object info on object, probably corrupt | |
94b18763 | 972 | shard_info.set_info_missing(); |
91327a77 AA |
973 | if (error) |
974 | shard_errorstream << ", "; | |
975 | error = true; | |
976 | shard_errorstream << "candidate had a missing info key"; | |
7c673cae FG |
977 | goto out; |
978 | } | |
979 | bl.push_back(k->second); | |
980 | try { | |
11fdf7f2 TL |
981 | auto bliter = bl.cbegin(); |
982 | decode(oi, bliter); | |
7c673cae FG |
983 | } catch (...) { |
984 | // invalid object info, probably corrupt | |
94b18763 | 985 | shard_info.set_info_corrupted(); |
91327a77 AA |
986 | if (error) |
987 | shard_errorstream << ", "; | |
988 | error = true; | |
989 | shard_errorstream << "candidate had a corrupt info"; | |
7c673cae FG |
990 | goto out; |
991 | } | |
992 | ||
b5b8bbf5 | 993 | // This is automatically corrected in PG::_repair_oinfo_oid() |
11fdf7f2 | 994 | ceph_assert(oi.soid == obj); |
b5b8bbf5 | 995 | |
b5b8bbf5 | 996 | if (i->second.size != be_get_ondisk_size(oi.size)) { |
94b18763 | 997 | shard_info.set_obj_size_info_mismatch(); |
91327a77 AA |
998 | if (error) |
999 | shard_errorstream << ", "; | |
1000 | error = true; | |
1001 | shard_errorstream << "candidate size " << i->second.size << " info size " | |
1002 | << oi.size << " mismatch"; | |
7c673cae FG |
1003 | } |
1004 | ||
1adf2230 AA |
1005 | // digest_match will only be true if computed digests are the same |
1006 | if (auth_version != eversion_t() | |
1007 | && auth->second->objects[obj].digest_present | |
1008 | && i->second.digest_present | |
1009 | && auth->second->objects[obj].digest != i->second.digest) { | |
1010 | digest_match = false; | |
1011 | dout(10) << __func__ << " digest_match = false, " << obj << " data_digest 0x" << std::hex << i->second.digest | |
1012 | << " != data_digest 0x" << auth->second->objects[obj].digest << std::dec | |
1013 | << dendl; | |
1014 | } | |
1015 | ||
b5b8bbf5 FG |
1016 | // Don't use this particular shard due to previous errors |
1017 | // XXX: For now we can't pick one shard for repair and another's object info or snapset | |
1018 | if (shard_info.errors) | |
7c673cae FG |
1019 | goto out; |
1020 | ||
1021 | if (auth_version == eversion_t() || oi.version > auth_version || | |
11fdf7f2 | 1022 | (oi.version == auth_version && dcount(oi) > dcount(*auth_oi))) { |
7c673cae FG |
1023 | auth = j; |
1024 | *auth_oi = oi; | |
1025 | auth_version = oi.version; | |
7c673cae FG |
1026 | } |
1027 | ||
1028 | out: | |
91327a77 AA |
1029 | if (error) |
1030 | errorstream << pgid.pgid << " shard " << l << " soid " << obj | |
1031 | << " : " << shard_errorstream.str() << "\n"; | |
7c673cae FG |
1032 | // Keep scanning other shards |
1033 | } | |
1034 | dout(10) << __func__ << ": selecting osd " << auth->first | |
1035 | << " for obj " << obj | |
1036 | << " with oi " << *auth_oi | |
1037 | << dendl; | |
1038 | return auth; | |
1039 | } | |
1040 | ||
1041 | void PGBackend::be_compare_scrubmaps( | |
1042 | const map<pg_shard_t,ScrubMap*> &maps, | |
28e407b8 | 1043 | const set<hobject_t> &master_set, |
7c673cae FG |
1044 | bool repair, |
1045 | map<hobject_t, set<pg_shard_t>> &missing, | |
1046 | map<hobject_t, set<pg_shard_t>> &inconsistent, | |
1047 | map<hobject_t, list<pg_shard_t>> &authoritative, | |
9f95a23c TL |
1048 | map<hobject_t, pair<std::optional<uint32_t>, |
1049 | std::optional<uint32_t>>> &missing_digest, | |
7c673cae FG |
1050 | int &shallow_errors, int &deep_errors, |
1051 | Scrub::Store *store, | |
1052 | const spg_t& pgid, | |
1053 | const vector<int> &acting, | |
1054 | ostream &errorstream) | |
1055 | { | |
7c673cae FG |
1056 | utime_t now = ceph_clock_now(); |
1057 | ||
7c673cae FG |
1058 | // Check maps against master set and each other |
1059 | for (set<hobject_t>::const_iterator k = master_set.begin(); | |
1060 | k != master_set.end(); | |
1061 | ++k) { | |
1062 | object_info_t auth_oi; | |
1063 | map<pg_shard_t, shard_info_wrapper> shard_map; | |
1064 | ||
1065 | inconsistent_obj_wrapper object_error{*k}; | |
1066 | ||
1adf2230 | 1067 | bool digest_match; |
7c673cae | 1068 | map<pg_shard_t, ScrubMap *>::const_iterator auth = |
91327a77 AA |
1069 | be_select_auth_object(*k, maps, &auth_oi, shard_map, digest_match, |
1070 | pgid, errorstream); | |
7c673cae FG |
1071 | |
1072 | list<pg_shard_t> auth_list; | |
31f18b77 | 1073 | set<pg_shard_t> object_errors; |
7c673cae FG |
1074 | if (auth == maps.end()) { |
1075 | object_error.set_version(0); | |
b5b8bbf5 FG |
1076 | object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, |
1077 | deep_errors, get_parent()->whoami_shard()); | |
7c673cae FG |
1078 | if (object_error.has_deep_errors()) |
1079 | ++deep_errors; | |
1080 | else if (object_error.has_shallow_errors()) | |
1081 | ++shallow_errors; | |
1082 | store->add_object_error(k->pool, object_error); | |
1083 | errorstream << pgid.pgid << " soid " << *k | |
91327a77 | 1084 | << " : failed to pick suitable object info\n"; |
7c673cae FG |
1085 | continue; |
1086 | } | |
1087 | object_error.set_version(auth_oi.user_version); | |
1088 | ScrubMap::object& auth_object = auth->second->objects[*k]; | |
1089 | set<pg_shard_t> cur_missing; | |
1090 | set<pg_shard_t> cur_inconsistent; | |
1adf2230 | 1091 | bool fix_digest = false; |
7c673cae | 1092 | |
11fdf7f2 | 1093 | for (auto j = maps.cbegin(); j != maps.cend(); ++j) { |
7c673cae FG |
1094 | if (j == auth) |
1095 | shard_map[auth->first].selected_oi = true; | |
1096 | if (j->second->objects.count(*k)) { | |
1097 | shard_map[j->first].set_object(j->second->objects[*k]); | |
1098 | // Compare | |
1099 | stringstream ss; | |
1100 | bool found = be_compare_scrub_objects(auth->first, | |
1101 | auth_object, | |
1102 | auth_oi, | |
1103 | j->second->objects[*k], | |
1104 | shard_map[j->first], | |
1105 | object_error, | |
91327a77 AA |
1106 | ss, |
1107 | k->has_snapset()); | |
1adf2230 AA |
1108 | |
1109 | dout(20) << __func__ << (repair ? " repair " : " ") << (parent->get_pool().is_replicated() ? "replicated " : "") | |
11fdf7f2 TL |
1110 | << (j == auth ? "auth" : "") << "shards " << shard_map.size() << (digest_match ? " digest_match " : " ") |
1111 | << (shard_map[j->first].only_data_digest_mismatch_info() ? "'info mismatch info'" : "") | |
1112 | << dendl; | |
1adf2230 AA |
1113 | // If all replicas match, but they don't match object_info we can |
1114 | // repair it by using missing_digest mechanism | |
11fdf7f2 | 1115 | if (repair && parent->get_pool().is_replicated() && j == auth && shard_map.size() > 1 |
1adf2230 AA |
1116 | && digest_match && shard_map[j->first].only_data_digest_mismatch_info() |
1117 | && auth_object.digest_present) { | |
1118 | // Set in missing_digests | |
1119 | fix_digest = true; | |
1120 | // Clear the error | |
1121 | shard_map[j->first].clear_data_digest_mismatch_info(); | |
91327a77 | 1122 | errorstream << pgid << " soid " << *k << " : repairing object info data_digest" << "\n"; |
1adf2230 | 1123 | } |
7c673cae FG |
1124 | // Some errors might have already been set in be_select_auth_object() |
1125 | if (shard_map[j->first].errors != 0) { | |
1126 | cur_inconsistent.insert(j->first); | |
1127 | if (shard_map[j->first].has_deep_errors()) | |
1128 | ++deep_errors; | |
1129 | else | |
1130 | ++shallow_errors; | |
1131 | // Only true if be_compare_scrub_objects() found errors and put something | |
1132 | // in ss. | |
1133 | if (found) | |
91327a77 AA |
1134 | errorstream << pgid << " shard " << j->first << " soid " << *k |
1135 | << " : " << ss.str() << "\n"; | |
1136 | } else if (found) { | |
31f18b77 FG |
1137 | // Track possible shard to use as authoritative, if needed |
1138 | // There are errors, without identifying the shard | |
1139 | object_errors.insert(j->first); | |
91327a77 | 1140 | errorstream << pgid << " soid " << *k << " : " << ss.str() << "\n"; |
7c673cae FG |
1141 | } else { |
1142 | // XXX: The auth shard might get here that we don't know | |
1143 | // that it has the "correct" data. | |
1144 | auth_list.push_back(j->first); | |
1145 | } | |
1146 | } else { | |
1147 | cur_missing.insert(j->first); | |
1148 | shard_map[j->first].set_missing(); | |
b5b8bbf5 | 1149 | shard_map[j->first].primary = (j->first == get_parent()->whoami_shard()); |
7c673cae FG |
1150 | // Can't have any other errors if there is no information available |
1151 | ++shallow_errors; | |
91327a77 | 1152 | errorstream << pgid << " shard " << j->first << " " << *k << " : missing\n"; |
7c673cae FG |
1153 | } |
1154 | object_error.add_shard(j->first, shard_map[j->first]); | |
1155 | } | |
1156 | ||
1157 | if (auth_list.empty()) { | |
31f18b77 FG |
1158 | if (object_errors.empty()) { |
1159 | errorstream << pgid.pgid << " soid " << *k | |
91327a77 | 1160 | << " : failed to pick suitable auth object\n"; |
31f18b77 FG |
1161 | goto out; |
1162 | } | |
1163 | // Object errors exist and nothing in auth_list | |
1164 | // Prefer the auth shard otherwise take first from list. | |
1165 | pg_shard_t shard; | |
1166 | if (object_errors.count(auth->first)) { | |
1167 | shard = auth->first; | |
1168 | } else { | |
1169 | shard = *(object_errors.begin()); | |
1170 | } | |
1171 | auth_list.push_back(shard); | |
1172 | object_errors.erase(shard); | |
7c673cae | 1173 | } |
31f18b77 FG |
1174 | // At this point auth_list is populated, so we add the object errors shards |
1175 | // as inconsistent. | |
1176 | cur_inconsistent.insert(object_errors.begin(), object_errors.end()); | |
7c673cae FG |
1177 | if (!cur_missing.empty()) { |
1178 | missing[*k] = cur_missing; | |
1179 | } | |
1180 | if (!cur_inconsistent.empty()) { | |
1181 | inconsistent[*k] = cur_inconsistent; | |
1182 | } | |
1adf2230 AA |
1183 | |
1184 | if (fix_digest) { | |
9f95a23c | 1185 | std::optional<uint32_t> data_digest, omap_digest; |
11fdf7f2 | 1186 | ceph_assert(auth_object.digest_present); |
1adf2230 AA |
1187 | data_digest = auth_object.digest; |
1188 | if (auth_object.omap_digest_present) { | |
1189 | omap_digest = auth_object.omap_digest; | |
1190 | } | |
1191 | missing_digest[*k] = make_pair(data_digest, omap_digest); | |
1192 | } | |
7c673cae FG |
1193 | if (!cur_inconsistent.empty() || !cur_missing.empty()) { |
1194 | authoritative[*k] = auth_list; | |
1adf2230 | 1195 | } else if (!fix_digest && parent->get_pool().is_replicated()) { |
7c673cae FG |
1196 | enum { |
1197 | NO = 0, | |
1198 | MAYBE = 1, | |
1199 | FORCE = 2, | |
1200 | } update = NO; | |
1201 | ||
28e407b8 AA |
1202 | if (auth_object.digest_present && !auth_oi.is_data_digest()) { |
1203 | dout(20) << __func__ << " missing data digest on " << *k << dendl; | |
7c673cae FG |
1204 | update = MAYBE; |
1205 | } | |
28e407b8 AA |
1206 | if (auth_object.omap_digest_present && !auth_oi.is_omap_digest()) { |
1207 | dout(20) << __func__ << " missing omap digest on " << *k << dendl; | |
7c673cae FG |
1208 | update = MAYBE; |
1209 | } | |
1210 | ||
1211 | // recorded digest != actual digest? | |
1212 | if (auth_oi.is_data_digest() && auth_object.digest_present && | |
1213 | auth_oi.data_digest != auth_object.digest) { | |
11fdf7f2 | 1214 | ceph_assert(shard_map[auth->first].has_data_digest_mismatch_info()); |
7c673cae FG |
1215 | errorstream << pgid << " recorded data digest 0x" |
1216 | << std::hex << auth_oi.data_digest << " != on disk 0x" | |
1217 | << auth_object.digest << std::dec << " on " << auth_oi.soid | |
1218 | << "\n"; | |
1219 | if (repair) | |
1220 | update = FORCE; | |
1221 | } | |
1222 | if (auth_oi.is_omap_digest() && auth_object.omap_digest_present && | |
1223 | auth_oi.omap_digest != auth_object.omap_digest) { | |
11fdf7f2 | 1224 | ceph_assert(shard_map[auth->first].has_omap_digest_mismatch_info()); |
7c673cae FG |
1225 | errorstream << pgid << " recorded omap digest 0x" |
1226 | << std::hex << auth_oi.omap_digest << " != on disk 0x" | |
1227 | << auth_object.omap_digest << std::dec | |
1228 | << " on " << auth_oi.soid << "\n"; | |
1229 | if (repair) | |
1230 | update = FORCE; | |
1231 | } | |
1232 | ||
1233 | if (update != NO) { | |
1234 | utime_t age = now - auth_oi.local_mtime; | |
1235 | if (update == FORCE || | |
1236 | age > cct->_conf->osd_deep_scrub_update_digest_min_age) { | |
9f95a23c | 1237 | std::optional<uint32_t> data_digest, omap_digest; |
28e407b8 AA |
1238 | if (auth_object.digest_present) { |
1239 | data_digest = auth_object.digest; | |
1240 | dout(20) << __func__ << " will update data digest on " << *k << dendl; | |
1241 | } | |
1242 | if (auth_object.omap_digest_present) { | |
1243 | omap_digest = auth_object.omap_digest; | |
1244 | dout(20) << __func__ << " will update omap digest on " << *k << dendl; | |
1245 | } | |
1246 | missing_digest[*k] = make_pair(data_digest, omap_digest); | |
7c673cae FG |
1247 | } else { |
1248 | dout(20) << __func__ << " missing digest but age " << age | |
1249 | << " < " << cct->_conf->osd_deep_scrub_update_digest_min_age | |
1250 | << " on " << *k << dendl; | |
1251 | } | |
1252 | } | |
1253 | } | |
1254 | out: | |
1255 | if (object_error.has_deep_errors()) | |
1256 | ++deep_errors; | |
1257 | else if (object_error.has_shallow_errors()) | |
1258 | ++shallow_errors; | |
1259 | if (object_error.errors || object_error.union_shards.errors) { | |
1260 | store->add_object_error(k->pool, object_error); | |
1261 | } | |
1262 | } | |
1263 | } | |
28e407b8 | 1264 | |
11fdf7f2 | 1265 | void PGBackend::be_omap_checks(const map<pg_shard_t,ScrubMap*> &maps, |
28e407b8 | 1266 | const set<hobject_t> &master_set, |
11fdf7f2 | 1267 | omap_stat_t& omap_stats, |
28e407b8 AA |
1268 | ostream &warnstream) const |
1269 | { | |
11fdf7f2 | 1270 | bool needs_omap_check = false; |
28e407b8 | 1271 | for (const auto& map : maps) { |
11fdf7f2 TL |
1272 | if (map.second->has_large_omap_object_errors || map.second->has_omap_keys) { |
1273 | needs_omap_check = true; | |
28e407b8 AA |
1274 | break; |
1275 | } | |
1276 | } | |
1277 | ||
11fdf7f2 TL |
1278 | if (!needs_omap_check) { |
1279 | return; // Nothing to do | |
28e407b8 AA |
1280 | } |
1281 | ||
11fdf7f2 | 1282 | // Iterate through objects and update omap stats |
28e407b8 AA |
1283 | for (const auto& k : master_set) { |
1284 | for (const auto& map : maps) { | |
494da23a TL |
1285 | if (map.first != get_parent()->primary_shard()) { |
1286 | // Only set omap stats for the primary | |
1287 | continue; | |
1288 | } | |
11fdf7f2 TL |
1289 | auto it = map.second->objects.find(k); |
1290 | if (it == map.second->objects.end()) | |
1291 | continue; | |
1292 | ScrubMap::object& obj = it->second; | |
1293 | omap_stats.omap_bytes += obj.object_omap_bytes; | |
1294 | omap_stats.omap_keys += obj.object_omap_keys; | |
28e407b8 | 1295 | if (obj.large_omap_object_found) { |
eafe8130 TL |
1296 | pg_t pg; |
1297 | auto osdmap = get_osdmap(); | |
1298 | osdmap->map_to_pg(k.pool, k.oid.name, k.get_key(), k.nspace, &pg); | |
1299 | pg_t mpg = osdmap->raw_pg_to_pg(pg); | |
11fdf7f2 | 1300 | omap_stats.large_omap_objects++; |
eafe8130 TL |
1301 | warnstream << "Large omap object found. Object: " << k |
1302 | << " PG: " << pg << " (" << mpg << ")" | |
1303 | << " Key count: " << obj.large_omap_object_key_count | |
1304 | << " Size (bytes): " << obj.large_omap_object_value_size | |
1305 | << '\n'; | |
28e407b8 AA |
1306 | break; |
1307 | } | |
1308 | } | |
1309 | } | |
1310 | } |