]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PGBackend.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / osd / PGBackend.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18
19#include "common/errno.h"
20#include "common/scrub_types.h"
21#include "ReplicatedBackend.h"
20effc67 22#include "osd/scrubber/ScrubStore.h"
7c673cae
FG
23#include "ECBackend.h"
24#include "PGBackend.h"
25#include "OSD.h"
26#include "erasure-code/ErasureCodePlugin.h"
27#include "OSDMap.h"
28#include "PGLog.h"
29#include "common/LogClient.h"
c07f9fc5
FG
30#include "messages/MOSDPGRecoveryDelete.h"
31#include "messages/MOSDPGRecoveryDeleteReply.h"
7c673cae 32
20effc67 33using std::less;
f67539c2
TL
34using std::list;
35using std::make_pair;
36using std::map;
37using std::ostream;
38using std::ostringstream;
39using std::pair;
40using std::set;
41using std::string;
42using std::stringstream;
43using std::vector;
44
45using ceph::bufferlist;
46using ceph::bufferptr;
47using ceph::ErasureCodeProfile;
48using ceph::ErasureCodeInterfaceRef;
49
7c673cae
FG
50#define dout_context cct
51#define dout_subsys ceph_subsys_osd
52#define DOUT_PREFIX_ARGS this
53#undef dout_prefix
54#define dout_prefix _prefix(_dout, this)
55static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
11fdf7f2 56 return pgb->get_parent()->gen_dbg_prefix(*_dout);
7c673cae
FG
57}
58
c07f9fc5
FG
59void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
60 RecoveryHandle *h)
61{
11fdf7f2
TL
62 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
63 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
c07f9fc5
FG
64 if (shard == get_parent()->whoami_shard())
65 continue;
66 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
67 dout(20) << __func__ << " will remove " << oid << " " << v << " from "
68 << shard << dendl;
69 h->deletes[shard].push_back(make_pair(oid, v));
70 get_parent()->begin_peer_recover(shard, oid);
71 }
72 }
73}
74
75void PGBackend::send_recovery_deletes(int prio,
76 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
77{
78 epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
79 for (const auto& p : deletes) {
80 const auto& shard = p.first;
81 const auto& objects = p.second;
82 ConnectionRef con = get_parent()->get_con_osd_cluster(
83 shard.osd,
11fdf7f2 84 get_osdmap_epoch());
c07f9fc5
FG
85 if (!con)
86 continue;
87 auto it = objects.begin();
88 while (it != objects.end()) {
89 uint64_t cost = 0;
90 uint64_t deletes = 0;
91 spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
92 MOSDPGRecoveryDelete *msg =
93 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
94 target_pg,
11fdf7f2 95 get_osdmap_epoch(),
c07f9fc5
FG
96 min_epoch);
97 msg->set_priority(prio);
98
99 while (it != objects.end() &&
100 cost < cct->_conf->osd_max_push_cost &&
101 deletes < cct->_conf->osd_max_push_objects) {
102 dout(20) << __func__ << ": sending recovery delete << " << it->first
103 << " " << it->second << " to osd." << shard << dendl;
104 msg->objects.push_back(*it);
105 cost += cct->_conf->osd_push_per_object_cost;
106 ++deletes;
107 ++it;
108 }
109
110 msg->set_cost(cost);
111 get_parent()->send_message_osd_cluster(msg, con);
112 }
113 }
114}
115
116bool PGBackend::handle_message(OpRequestRef op)
117{
118 switch (op->get_req()->get_type()) {
119 case MSG_OSD_PG_RECOVERY_DELETE:
120 handle_recovery_delete(op);
121 return true;
122
123 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
124 handle_recovery_delete_reply(op);
125 return true;
126
127 default:
128 break;
129 }
130
131 return _handle_message(op);
132}
133
134void PGBackend::handle_recovery_delete(OpRequestRef op)
135{
9f95a23c 136 auto m = op->get_req<MOSDPGRecoveryDelete>();
11fdf7f2 137 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
1e59de90 138 dout(20) << __func__ << " " << *op->get_req() << dendl;
c07f9fc5
FG
139
140 op->mark_started();
141
142 C_GatherBuilder gather(cct);
143 for (const auto &p : m->objects) {
144 get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
145 }
146
f67539c2 147 auto reply = make_message<MOSDPGRecoveryDeleteReply>();
c07f9fc5
FG
148 reply->from = get_parent()->whoami_shard();
149 reply->set_priority(m->get_priority());
150 reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
151 reply->map_epoch = m->map_epoch;
152 reply->min_epoch = m->min_epoch;
153 reply->objects = m->objects;
154 ConnectionRef conn = m->get_connection();
155
9f95a23c 156 gather.set_finisher(new LambdaContext(
1e59de90 157 [=, this](int r) {
c07f9fc5
FG
158 if (r != -EAGAIN) {
159 get_parent()->send_message_osd_cluster(reply, conn.get());
160 }
161 }));
162 gather.activate();
163}
164
165void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
166{
9f95a23c 167 auto m = op->get_req<MOSDPGRecoveryDeleteReply>();
11fdf7f2 168 ceph_assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
1e59de90 169 dout(20) << __func__ << " " << *op->get_req() << dendl;
c07f9fc5
FG
170
171 for (const auto &p : m->objects) {
172 ObjectRecoveryInfo recovery_info;
173 hobject_t oid = p.first;
174 recovery_info.version = p.second;
175 get_parent()->on_peer_recover(m->from, oid, recovery_info);
176 bool peers_recovered = true;
11fdf7f2 177 for (const auto& shard : get_parent()->get_acting_recovery_backfill_shards()) {
c07f9fc5
FG
178 if (shard == get_parent()->whoami_shard())
179 continue;
180 if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
181 dout(20) << __func__ << " " << oid << " still missing on at least "
182 << shard << dendl;
183 peers_recovered = false;
184 break;
185 }
186 }
187 if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
188 dout(20) << __func__ << " completed recovery, local_missing = "
189 << get_parent()->get_local_missing() << dendl;
190 object_stat_sum_t stat_diff;
191 stat_diff.num_objects_recovered = 1;
192 get_parent()->on_global_recover(p.first, stat_diff, true);
193 }
194 }
195}
196
7c673cae
FG
197void PGBackend::rollback(
198 const pg_log_entry_t &entry,
199 ObjectStore::Transaction *t)
200{
201
202 struct RollbackVisitor : public ObjectModDesc::Visitor {
203 const hobject_t &hoid;
204 PGBackend *pg;
205 ObjectStore::Transaction t;
206 RollbackVisitor(
207 const hobject_t &hoid,
208 PGBackend *pg) : hoid(hoid), pg(pg) {}
209 void append(uint64_t old_size) override {
210 ObjectStore::Transaction temp;
211 pg->rollback_append(hoid, old_size, &temp);
212 temp.append(t);
213 temp.swap(t);
214 }
9f95a23c 215 void setattrs(map<string, std::optional<bufferlist> > &attrs) override {
7c673cae
FG
216 ObjectStore::Transaction temp;
217 pg->rollback_setattrs(hoid, attrs, &temp);
218 temp.append(t);
219 temp.swap(t);
220 }
221 void rmobject(version_t old_version) override {
222 ObjectStore::Transaction temp;
223 pg->rollback_stash(hoid, old_version, &temp);
224 temp.append(t);
225 temp.swap(t);
226 }
227 void try_rmobject(version_t old_version) override {
228 ObjectStore::Transaction temp;
229 pg->rollback_try_stash(hoid, old_version, &temp);
230 temp.append(t);
231 temp.swap(t);
232 }
233 void create() override {
234 ObjectStore::Transaction temp;
235 pg->rollback_create(hoid, &temp);
236 temp.append(t);
237 temp.swap(t);
238 }
239 void update_snaps(const set<snapid_t> &snaps) override {
240 ObjectStore::Transaction temp;
241 pg->get_parent()->pgb_set_object_snap_mapping(hoid, snaps, &temp);
242 temp.append(t);
243 temp.swap(t);
244 }
245 void rollback_extents(
246 version_t gen,
247 const vector<pair<uint64_t, uint64_t> > &extents) override {
248 ObjectStore::Transaction temp;
249 pg->rollback_extents(gen, extents, hoid, &temp);
250 temp.append(t);
251 temp.swap(t);
252 }
253 };
254
11fdf7f2 255 ceph_assert(entry.mod_desc.can_rollback());
7c673cae
FG
256 RollbackVisitor vis(entry.soid, this);
257 entry.mod_desc.visit(&vis);
258 t->append(vis.t);
259}
260
261struct Trimmer : public ObjectModDesc::Visitor {
262 const hobject_t &soid;
263 PGBackend *pg;
264 ObjectStore::Transaction *t;
265 Trimmer(
266 const hobject_t &soid,
267 PGBackend *pg,
268 ObjectStore::Transaction *t)
269 : soid(soid), pg(pg), t(t) {}
270 void rmobject(version_t old_version) override {
271 pg->trim_rollback_object(
272 soid,
273 old_version,
274 t);
275 }
276 // try_rmobject defaults to rmobject
277 void rollback_extents(
278 version_t gen,
279 const vector<pair<uint64_t, uint64_t> > &extents) override {
280 pg->trim_rollback_object(
281 soid,
282 gen,
283 t);
284 }
285};
286
287void PGBackend::rollforward(
288 const pg_log_entry_t &entry,
289 ObjectStore::Transaction *t)
290{
291 auto dpp = get_parent()->get_dpp();
292 ldpp_dout(dpp, 20) << __func__ << ": entry=" << entry << dendl;
293 if (!entry.can_rollback())
294 return;
295 Trimmer trimmer(entry.soid, this, t);
296 entry.mod_desc.visit(&trimmer);
297}
298
299void PGBackend::trim(
300 const pg_log_entry_t &entry,
301 ObjectStore::Transaction *t)
302{
303 if (!entry.can_rollback())
304 return;
305 Trimmer trimmer(entry.soid, this, t);
306 entry.mod_desc.visit(&trimmer);
307}
308
309void PGBackend::try_stash(
310 const hobject_t &hoid,
311 version_t v,
312 ObjectStore::Transaction *t)
313{
314 t->try_rename(
315 coll,
316 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
317 ghobject_t(hoid, v, get_parent()->whoami_shard().shard));
318}
319
320void PGBackend::remove(
321 const hobject_t &hoid,
322 ObjectStore::Transaction *t) {
11fdf7f2 323 ceph_assert(!hoid.is_temp());
7c673cae
FG
324 t->remove(
325 coll,
326 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
327 get_parent()->pgb_clear_object_snap_mapping(hoid, t);
328}
329
330void PGBackend::on_change_cleanup(ObjectStore::Transaction *t)
331{
332 dout(10) << __func__ << dendl;
333 // clear temp
334 for (set<hobject_t>::iterator i = temp_contents.begin();
335 i != temp_contents.end();
336 ++i) {
337 dout(10) << __func__ << ": Removing oid "
338 << *i << " from the temp collection" << dendl;
339 t->remove(
340 coll,
341 ghobject_t(*i, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
342 }
343 temp_contents.clear();
344}
345
346int PGBackend::objects_list_partial(
347 const hobject_t &begin,
348 int min,
349 int max,
350 vector<hobject_t> *ls,
351 hobject_t *next)
352{
11fdf7f2 353 ceph_assert(ls);
7c673cae
FG
354 // Starts with the smallest generation to make sure the result list
355 // has the marker object (it might have multiple generations
356 // though, which would be filtered).
357 ghobject_t _next;
358 if (!begin.is_min())
359 _next = ghobject_t(begin, 0, get_parent()->whoami_shard().shard);
360 ls->reserve(max);
361 int r = 0;
362
363 if (min > max)
364 min = max;
365
366 while (!_next.is_max() && ls->size() < (unsigned)min) {
367 vector<ghobject_t> objects;
f91f0fd5
TL
368 if (HAVE_FEATURE(parent->min_upacting_features(),
369 OSD_FIXED_COLLECTION_LIST)) {
370 r = store->collection_list(
371 ch,
372 _next,
373 ghobject_t::get_max(),
374 max - ls->size(),
375 &objects,
376 &_next);
377 } else {
378 r = store->collection_list_legacy(
379 ch,
380 _next,
381 ghobject_t::get_max(),
382 max - ls->size(),
383 &objects,
384 &_next);
385 }
7c673cae
FG
386 if (r != 0) {
387 derr << __func__ << " list collection " << ch << " got: " << cpp_strerror(r) << dendl;
388 break;
389 }
390 for (vector<ghobject_t>::iterator i = objects.begin();
391 i != objects.end();
392 ++i) {
393 if (i->is_pgmeta() || i->hobj.is_temp()) {
394 continue;
395 }
396 if (i->is_no_gen()) {
397 ls->push_back(i->hobj);
398 }
399 }
400 }
401 if (r == 0)
402 *next = _next.hobj;
403 return r;
404}
405
406int PGBackend::objects_list_range(
407 const hobject_t &start,
408 const hobject_t &end,
7c673cae
FG
409 vector<hobject_t> *ls,
410 vector<ghobject_t> *gen_obs)
411{
11fdf7f2 412 ceph_assert(ls);
7c673cae 413 vector<ghobject_t> objects;
f91f0fd5
TL
414 int r;
415 if (HAVE_FEATURE(parent->min_upacting_features(),
416 OSD_FIXED_COLLECTION_LIST)) {
417 r = store->collection_list(
418 ch,
419 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
420 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
421 INT_MAX,
422 &objects,
423 NULL);
424 } else {
425 r = store->collection_list_legacy(
426 ch,
427 ghobject_t(start, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
428 ghobject_t(end, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
429 INT_MAX,
430 &objects,
431 NULL);
432 }
7c673cae
FG
433 ls->reserve(objects.size());
434 for (vector<ghobject_t>::iterator i = objects.begin();
435 i != objects.end();
436 ++i) {
437 if (i->is_pgmeta() || i->hobj.is_temp()) {
438 continue;
439 }
440 if (i->is_no_gen()) {
441 ls->push_back(i->hobj);
442 } else if (gen_obs) {
443 gen_obs->push_back(*i);
444 }
445 }
446 return r;
447}
448
449int PGBackend::objects_get_attr(
450 const hobject_t &hoid,
451 const string &attr,
452 bufferlist *out)
453{
454 bufferptr bp;
455 int r = store->getattr(
456 ch,
457 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
458 attr.c_str(),
459 bp);
460 if (r >= 0 && out) {
461 out->clear();
462 out->push_back(std::move(bp));
463 }
464 return r;
465}
466
467int PGBackend::objects_get_attrs(
468 const hobject_t &hoid,
20effc67 469 map<string, bufferlist, less<>> *out)
7c673cae
FG
470{
471 return store->getattrs(
472 ch,
473 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
474 *out);
475}
476
477void PGBackend::rollback_setattrs(
478 const hobject_t &hoid,
9f95a23c 479 map<string, std::optional<bufferlist> > &old_attrs,
7c673cae 480 ObjectStore::Transaction *t) {
20effc67 481 map<string, bufferlist, less<>> to_set;
11fdf7f2 482 ceph_assert(!hoid.is_temp());
9f95a23c 483 for (map<string, std::optional<bufferlist> >::iterator i = old_attrs.begin();
7c673cae
FG
484 i != old_attrs.end();
485 ++i) {
486 if (i->second) {
9f95a23c 487 to_set[i->first] = *(i->second);
7c673cae
FG
488 } else {
489 t->rmattr(
490 coll,
491 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
492 i->first);
493 }
494 }
495 t->setattrs(
496 coll,
497 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
498 to_set);
499}
500
501void PGBackend::rollback_append(
502 const hobject_t &hoid,
503 uint64_t old_size,
504 ObjectStore::Transaction *t) {
11fdf7f2 505 ceph_assert(!hoid.is_temp());
7c673cae
FG
506 t->truncate(
507 coll,
508 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
509 old_size);
510}
511
512void PGBackend::rollback_stash(
513 const hobject_t &hoid,
514 version_t old_version,
515 ObjectStore::Transaction *t) {
11fdf7f2 516 ceph_assert(!hoid.is_temp());
7c673cae
FG
517 t->remove(
518 coll,
519 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
520 t->collection_move_rename(
521 coll,
522 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
523 coll,
524 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
525}
526
527void PGBackend::rollback_try_stash(
528 const hobject_t &hoid,
529 version_t old_version,
530 ObjectStore::Transaction *t) {
11fdf7f2 531 ceph_assert(!hoid.is_temp());
7c673cae
FG
532 t->remove(
533 coll,
534 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
535 t->try_rename(
536 coll,
537 ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard),
538 ghobject_t(hoid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard));
539}
540
541void PGBackend::rollback_extents(
542 version_t gen,
543 const vector<pair<uint64_t, uint64_t> > &extents,
544 const hobject_t &hoid,
545 ObjectStore::Transaction *t) {
546 auto shard = get_parent()->whoami_shard().shard;
547 for (auto &&extent: extents) {
548 t->clone_range(
549 coll,
550 ghobject_t(hoid, gen, shard),
551 ghobject_t(hoid, ghobject_t::NO_GEN, shard),
552 extent.first,
553 extent.second,
554 extent.first);
555 }
556 t->remove(
557 coll,
558 ghobject_t(hoid, gen, shard));
559}
560
561void PGBackend::trim_rollback_object(
562 const hobject_t &hoid,
563 version_t old_version,
564 ObjectStore::Transaction *t) {
11fdf7f2 565 ceph_assert(!hoid.is_temp());
7c673cae
FG
566 t->remove(
567 coll, ghobject_t(hoid, old_version, get_parent()->whoami_shard().shard));
568}
569
570PGBackend *PGBackend::build_pg_backend(
571 const pg_pool_t &pool,
11fdf7f2 572 const map<string,string>& profile,
7c673cae
FG
573 Listener *l,
574 coll_t coll,
575 ObjectStore::CollectionHandle &ch,
576 ObjectStore *store,
577 CephContext *cct)
578{
11fdf7f2 579 ErasureCodeProfile ec_profile = profile;
7c673cae
FG
580 switch (pool.type) {
581 case pg_pool_t::TYPE_REPLICATED: {
582 return new ReplicatedBackend(l, coll, ch, store, cct);
583 }
584 case pg_pool_t::TYPE_ERASURE: {
585 ErasureCodeInterfaceRef ec_impl;
7c673cae
FG
586 stringstream ss;
587 ceph::ErasureCodePluginRegistry::instance().factory(
588 profile.find("plugin")->second,
11fdf7f2
TL
589 cct->_conf.get_val<std::string>("erasure_code_dir"),
590 ec_profile,
7c673cae
FG
591 &ec_impl,
592 &ss);
11fdf7f2 593 ceph_assert(ec_impl);
7c673cae
FG
594 return new ECBackend(
595 l,
596 coll,
597 ch,
598 store,
599 cct,
600 ec_impl,
601 pool.stripe_width);
602 }
603 default:
604 ceph_abort();
605 return NULL;
606 }
607}
608
28e407b8
AA
609int PGBackend::be_scan_list(
610 ScrubMap &map,
611 ScrubMapBuilder &pos)
7c673cae 612{
28e407b8 613 dout(10) << __func__ << " " << pos << dendl;
11fdf7f2
TL
614 ceph_assert(!pos.done());
615 ceph_assert(pos.pos < pos.ls.size());
28e407b8
AA
616 hobject_t& poid = pos.ls[pos.pos];
617
618 struct stat st;
619 int r = store->stat(
620 ch,
621 ghobject_t(
622 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
623 &st,
624 true);
625 if (r == 0) {
626 ScrubMap::object &o = map.objects[poid];
627 o.size = st.st_size;
11fdf7f2 628 ceph_assert(!o.negative);
28e407b8 629 store->getattrs(
7c673cae
FG
630 ch,
631 ghobject_t(
632 poid, ghobject_t::NO_GEN, get_parent()->whoami_shard().shard),
28e407b8 633 o.attrs);
7c673cae 634
28e407b8
AA
635 if (pos.deep) {
636 r = be_deep_scrub(poid, map, pos, o);
7c673cae 637 }
28e407b8
AA
638 dout(25) << __func__ << " " << poid << dendl;
639 } else if (r == -ENOENT) {
640 dout(25) << __func__ << " " << poid << " got " << r
641 << ", skipping" << dendl;
642 } else if (r == -EIO) {
643 dout(25) << __func__ << " " << poid << " got " << r
644 << ", stat_error" << dendl;
645 ScrubMap::object &o = map.objects[poid];
646 o.stat_error = true;
647 } else {
648 derr << __func__ << " got: " << cpp_strerror(r) << dendl;
649 ceph_abort();
650 }
651 if (r == -EINPROGRESS) {
652 return -EINPROGRESS;
7c673cae 653 }
28e407b8
AA
654 pos.next_object();
655 return 0;
7c673cae 656}