1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
19 #include "common/errno.h"
20 #include "common/scrub_types.h"
21 #include "ReplicatedBackend.h"
22 #include "osd/scrubber/ScrubStore.h"
23 #include "ECBackend.h"
24 #include "PGBackend.h"
26 #include "erasure-code/ErasureCodePlugin.h"
29 #include "common/LogClient.h"
30 #include "messages/MOSDPGRecoveryDelete.h"
31 #include "messages/MOSDPGRecoveryDeleteReply.h"
38 using std::ostringstream
;
42 using std::stringstream
;
45 using ceph::bufferlist
;
46 using ceph::bufferptr
;
47 using ceph::ErasureCodeProfile
;
48 using ceph::ErasureCodeInterfaceRef
;
50 #define dout_context cct
51 #define dout_subsys ceph_subsys_osd
52 #define DOUT_PREFIX_ARGS this
54 #define dout_prefix _prefix(_dout, this)
// Log-prefix helper referenced by the dout_prefix macro above: forwards the
// stream to gen_dbg_prefix() on the backend's parent (pgb->get_parent()) and
// returns whatever that call returns.
55 static ostream
& _prefix(std::ostream
*_dout
, PGBackend
*pgb
) {
56 return pgb
->get_parent()->gen_dbg_prefix(*_dout
);
// Queue removal of `oid` (version `v`) on every peer shard that still lists
// the object as missing.
//
// Asserts that the acting/recovery/backfill shard set is non-empty, then walks
// it.  For each shard whose missing set contains `oid`, the (oid, v) pair is
// appended to h->deletes[shard] and begin_peer_recover(shard, oid) is called
// on the parent.
//
// NOTE(review): this listing omits lines (the rest of the signature, and the
// statement guarded by the whoami_shard() comparison) -- presumably the local
// shard is skipped; confirm against the full file.
59 void PGBackend::recover_delete_object(const hobject_t
&oid
, eversion_t v
,
62 ceph_assert(get_parent()->get_acting_recovery_backfill_shards().size() > 0);
63 for (const auto& shard
: get_parent()->get_acting_recovery_backfill_shards()) {
64 if (shard
== get_parent()->whoami_shard())
66 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
67 dout(20) << __func__
<< " will remove " << oid
<< " " << v
<< " from "
69 h
->deletes
[shard
].push_back(make_pair(oid
, v
));
70 get_parent()->begin_peer_recover(shard
, oid
);
// Send MOSDPGRecoveryDelete messages carrying the queued per-shard deletions.
//
// prio     priority applied to every message through set_priority().
// deletes  map from target shard to the (object, version) pairs to delete on
//          that shard.
//
// For each shard a cluster connection is obtained from the parent and the
// shard's objects are packed into one or more messages.  A message stops
// accepting objects once `cost` reaches osd_max_push_cost or the inner-loop
// `deletes` value reaches osd_max_push_objects; every object added raises
// `cost` by osd_push_per_object_cost.  Each message is handed to
// send_message_osd_cluster() on the parent.
//
// NOTE(review): `deletes` in the inner loop condition is compared against an
// integer limit, so it is presumably a local counter that shadows the map
// parameter; its declaration and increment are on lines not visible in this
// listing -- confirm against the full file.
75 void PGBackend::send_recovery_deletes(int prio
,
76 const map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > &deletes
)
78 epoch_t min_epoch
= get_parent()->get_last_peering_reset_epoch();
79 for (const auto& p
: deletes
) {
80 const auto& shard
= p
.first
;
81 const auto& objects
= p
.second
;
82 ConnectionRef con
= get_parent()->get_con_osd_cluster(
87 auto it
= objects
.begin();
88 while (it
!= objects
.end()) {
91 spg_t target_pg
= spg_t(get_parent()->get_info().pgid
.pgid
, shard
.shard
);
92 MOSDPGRecoveryDelete
*msg
=
93 new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
97 msg
->set_priority(prio
);
// Fill the current message until the object list, the cost budget or the
// object-count budget is exhausted.
99 while (it
!= objects
.end() &&
100 cost
< cct
->_conf
->osd_max_push_cost
&&
101 deletes
< cct
->_conf
->osd_max_push_objects
) {
102 dout(20) << __func__
<< ": sending recovery delete " << it
->first
103 << " " << it
->second
<< " to osd." << shard
<< dendl
;
104 msg
->objects
.push_back(*it
);
105 cost
+= cct
->_conf
->osd_push_per_object_cost
;
111 get_parent()->send_message_osd_cluster(msg
, con
);
// Dispatch an incoming op by the type of its request message.
//
// MSG_OSD_PG_RECOVERY_DELETE is passed to handle_recovery_delete() and
// MSG_OSD_PG_RECOVERY_DELETE_REPLY to handle_recovery_delete_reply().  The
// final visible statement returns the result of _handle_message(op).
//
// NOTE(review): the statements that follow each handler call (and any default
// case) are not visible in this listing; presumably the two cases return
// without reaching _handle_message() -- confirm against the full file.
116 bool PGBackend::handle_message(OpRequestRef op
)
118 switch (op
->get_req()->get_type()) {
119 case MSG_OSD_PG_RECOVERY_DELETE
:
120 handle_recovery_delete(op
);
123 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
124 handle_recovery_delete_reply(op
);
131 return _handle_message(op
);
// Handle an MOSDPGRecoveryDelete request.
//
// Asserts the message type, then calls remove_missing_object() on the parent
// for every (object, version) pair in m->objects, giving each call a
// sub-context from a C_GatherBuilder.  A MOSDPGRecoveryDeleteReply is prepared
// that copies the request's priority, map_epoch, min_epoch and object list,
// sets `from` to this shard and addresses the pgid to the sender's shard.
// The reply is sent on the request's connection from within the finisher
// installed on the gather.
//
// NOTE(review): the lambda's capture list and the statement that activates the
// gather are on lines not visible in this listing.
134 void PGBackend::handle_recovery_delete(OpRequestRef op
)
136 auto m
= op
->get_req
<MOSDPGRecoveryDelete
>();
137 ceph_assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE
);
138 dout(20) << __func__
<< " " << *op
->get_req() << dendl
;
142 C_GatherBuilder
gather(cct
);
143 for (const auto &p
: m
->objects
) {
144 get_parent()->remove_missing_object(p
.first
, p
.second
, gather
.new_sub());
// Build the reply up front; it is sent from the gather's finisher below.
147 auto reply
= make_message
<MOSDPGRecoveryDeleteReply
>();
148 reply
->from
= get_parent()->whoami_shard();
149 reply
->set_priority(m
->get_priority());
150 reply
->pgid
= spg_t(get_parent()->get_info().pgid
.pgid
, m
->from
.shard
);
151 reply
->map_epoch
= m
->map_epoch
;
152 reply
->min_epoch
= m
->min_epoch
;
153 reply
->objects
= m
->objects
;
154 ConnectionRef conn
= m
->get_connection();
156 gather
.set_finisher(new LambdaContext(
159 get_parent()->send_message_osd_cluster(reply
, conn
.get());
// Handle an MOSDPGRecoveryDeleteReply from a peer.
//
// Asserts the message type.  For every (object, version) pair in m->objects:
//   - reports the peer's recovery through on_peer_recover(m->from, oid, info),
//     with info.version set to the pair's version;
//   - scans the acting/recovery/backfill shards and clears `peers_recovered`
//     if any scanned shard still lists the object as missing;
//   - when no peer is missing it and the local missing set does not contain
//     it either, calls on_global_recover() with num_objects_recovered = 1.
//
// NOTE(review): the statement guarded by the whoami_shard() comparison and any
// early exit from the inner loop are not visible in this listing.
165 void PGBackend::handle_recovery_delete_reply(OpRequestRef op
)
167 auto m
= op
->get_req
<MOSDPGRecoveryDeleteReply
>();
168 ceph_assert(m
->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY
);
169 dout(20) << __func__
<< " " << *op
->get_req() << dendl
;
171 for (const auto &p
: m
->objects
) {
172 ObjectRecoveryInfo recovery_info
;
173 hobject_t oid
= p
.first
;
174 recovery_info
.version
= p
.second
;
175 get_parent()->on_peer_recover(m
->from
, oid
, recovery_info
);
176 bool peers_recovered
= true;
177 for (const auto& shard
: get_parent()->get_acting_recovery_backfill_shards()) {
178 if (shard
== get_parent()->whoami_shard())
180 if (get_parent()->get_shard_missing(shard
).is_missing(oid
)) {
181 dout(20) << __func__
<< " " << oid
<< " still missing on at least "
183 peers_recovered
= false;
// Global recovery is only reported once neither a peer nor this shard is
// still missing the object.
187 if (peers_recovered
&& !get_parent()->get_local_missing().is_missing(oid
)) {
188 dout(20) << __func__
<< " completed recovery, local_missing = "
189 << get_parent()->get_local_missing() << dendl
;
190 object_stat_sum_t stat_diff
;
191 stat_diff
.num_objects_recovered
= 1;
192 get_parent()->on_global_recover(p
.first
, stat_diff
, true);
// Roll back the modification described by a log entry.
//
// Asserts entry.mod_desc.can_rollback(), then drives entry.mod_desc.visit()
// with a local RollbackVisitor bound to entry.soid and this backend.  Each
// visitor callback builds its undo step in a fresh local transaction `temp`
// by calling the matching PGBackend::rollback_* helper (or, for update_snaps,
// pgb_set_object_snap_mapping() on the parent).
//
// NOTE(review): how each `temp` is combined with the visitor's own transaction
// member, and how the result reaches the caller's `t`, is on lines not visible
// in this listing.
197 void PGBackend::rollback(
198 const pg_log_entry_t
&entry
,
199 ObjectStore::Transaction
*t
)
202 struct RollbackVisitor
: public ObjectModDesc::Visitor
{
203 const hobject_t
&hoid
;
205 ObjectStore::Transaction t
;
207 const hobject_t
&hoid
,
208 PGBackend
*pg
) : hoid(hoid
), pg(pg
) {}
// Undo an append using the recorded previous size.
209 void append(uint64_t old_size
) override
{
210 ObjectStore::Transaction temp
;
211 pg
->rollback_append(hoid
, old_size
, &temp
);
// Undo attribute changes using the recorded previous values.
215 void setattrs(map
<string
, std::optional
<bufferlist
> > &attrs
) override
{
216 ObjectStore::Transaction temp
;
217 pg
->rollback_setattrs(hoid
, attrs
, &temp
);
// Undo a removal through rollback_stash() with the recorded version.
221 void rmobject(version_t old_version
) override
{
222 ObjectStore::Transaction temp
;
223 pg
->rollback_stash(hoid
, old_version
, &temp
);
// Same as rmobject but routed through rollback_try_stash().
227 void try_rmobject(version_t old_version
) override
{
228 ObjectStore::Transaction temp
;
229 pg
->rollback_try_stash(hoid
, old_version
, &temp
);
// Undo an object creation.
233 void create() override
{
234 ObjectStore::Transaction temp
;
235 pg
->rollback_create(hoid
, &temp
);
// Re-apply the recorded snap set through the parent.
239 void update_snaps(const set
<snapid_t
> &snaps
) override
{
240 ObjectStore::Transaction temp
;
241 pg
->get_parent()->pgb_set_object_snap_mapping(hoid
, snaps
, &temp
);
// Undo extent overwrites for generation `gen`.
245 void rollback_extents(
247 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
248 ObjectStore::Transaction temp
;
249 pg
->rollback_extents(gen
, extents
, hoid
, &temp
);
255 ceph_assert(entry
.mod_desc
.can_rollback());
256 RollbackVisitor
vis(entry
.soid
, this);
257 entry
.mod_desc
.visit(&vis
);
// ObjectModDesc visitor used by rollforward() and trim() in this file.
//
// Holds the object id, the owning backend and the caller's transaction
// pointer.  Both visible callbacks (rmobject and rollback_extents) call
// pg->trim_rollback_object().
//
// NOTE(review): the arguments passed to trim_rollback_object() are on lines
// not visible in this listing.
261 struct Trimmer
: public ObjectModDesc::Visitor
{
262 const hobject_t
&soid
;
264 ObjectStore::Transaction
*t
;
266 const hobject_t
&soid
,
268 ObjectStore::Transaction
*t
)
269 : soid(soid
), pg(pg
), t(t
) {}
270 void rmobject(version_t old_version
) override
{
271 pg
->trim_rollback_object(
276 // try_rmobject defaults to rmobject
277 void rollback_extents(
279 const vector
<pair
<uint64_t, uint64_t> > &extents
) override
{
280 pg
->trim_rollback_object(
// Roll a log entry forward: log the entry at debug level 20, then visit its
// mod_desc with a Trimmer bound to entry.soid, this backend and `t`.
//
// NOTE(review): the statement guarded by `!entry.can_rollback()` is not
// visible in this listing -- presumably an early return; confirm.
287 void PGBackend::rollforward(
288 const pg_log_entry_t
&entry
,
289 ObjectStore::Transaction
*t
)
291 auto dpp
= get_parent()->get_dpp();
292 ldpp_dout(dpp
, 20) << __func__
<< ": entry=" << entry
<< dendl
;
293 if (!entry
.can_rollback())
295 Trimmer
trimmer(entry
.soid
, this, t
);
296 entry
.mod_desc
.visit(&trimmer
);
// Trim a log entry: visit its mod_desc with a Trimmer bound to entry.soid,
// this backend and `t`.
//
// NOTE(review): the statement guarded by `!entry.can_rollback()` is not
// visible in this listing -- presumably an early return; confirm.
299 void PGBackend::trim(
300 const pg_log_entry_t
&entry
,
301 ObjectStore::Transaction
*t
)
303 if (!entry
.can_rollback())
305 Trimmer
trimmer(entry
.soid
, this, t
);
306 entry
.mod_desc
.visit(&trimmer
);
// Builds two object ids for `hoid` on this shard: one with
// ghobject_t::NO_GEN and one with generation `v`.
//
// NOTE(review): the transaction call that consumes these two ids (and the
// declaration of `v`) is on lines not visible in this listing; presumably the
// NO_GEN object is renamed to generation `v` -- confirm against the full file.
309 void PGBackend::try_stash(
310 const hobject_t
&hoid
,
312 ObjectStore::Transaction
*t
)
316 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
317 ghobject_t(hoid
, v
, get_parent()->whoami_shard().shard
));
// Remove a non-temp object: asserts `hoid` is not a temp object, addresses its
// NO_GEN instance on this shard, and clears the object's snap mapping through
// the parent using the same transaction.
//
// NOTE(review): the transaction call that takes the ghobject_t is on a line
// not visible in this listing -- presumably a removal on `t`; confirm.
320 void PGBackend::remove(
321 const hobject_t
&hoid
,
322 ObjectStore::Transaction
*t
) {
323 ceph_assert(!hoid
.is_temp());
326 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
327 get_parent()->pgb_clear_object_snap_mapping(hoid
, t
);
// Walk temp_contents, logging each object id as it is removed from the temp
// collection, then empty temp_contents.
//
// NOTE(review): the transaction call that takes the ghobject_t built for each
// entry is on a line not visible in this listing -- presumably a removal
// queued on `t`; confirm.
330 void PGBackend::on_change_cleanup(ObjectStore::Transaction
*t
)
332 dout(10) << __func__
<< dendl
;
334 for (set
<hobject_t
>::iterator i
= temp_contents
.begin();
335 i
!= temp_contents
.end();
337 dout(10) << __func__
<< ": Removing oid "
338 << *i
<< " from the temp collection" << dendl
;
341 ghobject_t(*i
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
343 temp_contents
.clear();
// List objects starting at `begin`, appending to *ls.
//
// The cursor `_next` starts at generation 0 of `begin` on this shard.  The
// loop runs while the cursor is not at max and *ls holds fewer than `min`
// entries.  Each pass lists the collection with collection_list() when the
// up/acting feature set includes OSD_FIXED_COLLECTION_LIST, otherwise with
// collection_list_legacy().  A listing failure is logged through derr.
// Entries that are pgmeta or temp objects are filtered, and entries with no
// generation have their hobject appended to *ls.
//
// NOTE(review): the remaining parameters, the return statements and the body
// of the pgmeta/temp branch are on lines not visible in this listing.
346 int PGBackend::objects_list_partial(
347 const hobject_t
&begin
,
350 vector
<hobject_t
> *ls
,
354 // Starts with the smallest generation to make sure the result list
355 // has the marker object (it might have multiple generations
356 // though, which would be filtered).
359 _next
= ghobject_t(begin
, 0, get_parent()->whoami_shard().shard
);
366 while (!_next
.is_max() && ls
->size() < (unsigned)min
) {
367 vector
<ghobject_t
> objects
;
368 if (HAVE_FEATURE(parent
->min_upacting_features(),
369 OSD_FIXED_COLLECTION_LIST
)) {
370 r
= store
->collection_list(
373 ghobject_t::get_max(),
378 r
= store
->collection_list_legacy(
381 ghobject_t::get_max(),
387 derr
<< __func__
<< " list collection " << ch
<< " got: " << cpp_strerror(r
) << dendl
;
390 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
393 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
396 if (i
->is_no_gen()) {
397 ls
->push_back(i
->hobj
);
// List the objects between `start` and `end` on this shard.
//
// Uses collection_list() when the up/acting feature set includes
// OSD_FIXED_COLLECTION_LIST, otherwise collection_list_legacy(); both are
// given the NO_GEN ids of `start` and `end`.  Entries that are pgmeta or temp
// objects are filtered.  Entries with no generation have their hobject
// appended to *ls; entries with a generation are appended to *gen_obs when
// that pointer is non-null.
//
// NOTE(review): error handling, the return statement and the body of the
// pgmeta/temp branch are on lines not visible in this listing.
406 int PGBackend::objects_list_range(
407 const hobject_t
&start
,
408 const hobject_t
&end
,
409 vector
<hobject_t
> *ls
,
410 vector
<ghobject_t
> *gen_obs
)
413 vector
<ghobject_t
> objects
;
415 if (HAVE_FEATURE(parent
->min_upacting_features(),
416 OSD_FIXED_COLLECTION_LIST
)) {
417 r
= store
->collection_list(
419 ghobject_t(start
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
420 ghobject_t(end
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
425 r
= store
->collection_list_legacy(
427 ghobject_t(start
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
428 ghobject_t(end
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
433 ls
->reserve(objects
.size());
434 for (vector
<ghobject_t
>::iterator i
= objects
.begin();
437 if (i
->is_pgmeta() || i
->hobj
.is_temp()) {
440 if (i
->is_no_gen()) {
441 ls
->push_back(i
->hobj
);
442 } else if (gen_obs
) {
443 gen_obs
->push_back(*i
);
// Read one attribute of `hoid`'s NO_GEN instance on this shard via
// store->getattr(); the fetched buffer pointer `bp` is moved into *out.
//
// NOTE(review): the remaining parameters, the conditions around the push_back
// and the return statement are on lines not visible in this listing.
449 int PGBackend::objects_get_attr(
450 const hobject_t
&hoid
,
455 int r
= store
->getattr(
457 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
462 out
->push_back(std::move(bp
));
// Read the attributes of `hoid`'s NO_GEN instance on this shard; returns the
// result of store->getattrs() directly.
//
// NOTE(review): the other arguments to getattrs() are on lines not visible in
// this listing -- presumably the collection handle and `out`; confirm.
467 int PGBackend::objects_get_attrs(
468 const hobject_t
&hoid
,
469 map
<string
, bufferlist
, less
<>> *out
)
471 return store
->getattrs(
473 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
// Restore previous attribute values on a non-temp object.
//
// Asserts `hoid` is not a temp object, then walks `old_attrs`; the visible
// assignment copies the contained bufferlist of an entry into `to_set` under
// the same key.  Two transaction operations then address the object's NO_GEN
// instance on this shard.
//
// NOTE(review): the condition guarding the `to_set` assignment, the handling
// of entries whose optional is empty, and the names of the two transaction
// calls are on lines not visible in this listing -- presumably empty entries
// are removed and `to_set` is written back; confirm against the full file.
477 void PGBackend::rollback_setattrs(
478 const hobject_t
&hoid
,
479 map
<string
, std::optional
<bufferlist
> > &old_attrs
,
480 ObjectStore::Transaction
*t
) {
481 map
<string
, bufferlist
, less
<>> to_set
;
482 ceph_assert(!hoid
.is_temp());
483 for (map
<string
, std::optional
<bufferlist
> >::iterator i
= old_attrs
.begin();
484 i
!= old_attrs
.end();
487 to_set
[i
->first
] = *(i
->second
);
491 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
497 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
// Undo an append on a non-temp object: asserts `hoid` is not a temp object and
// addresses its NO_GEN instance on this shard.
//
// NOTE(review): the size parameter and the transaction call are on lines not
// visible in this listing -- presumably the object is truncated back to its
// previous size; confirm.
501 void PGBackend::rollback_append(
502 const hobject_t
&hoid
,
504 ObjectStore::Transaction
*t
) {
505 ceph_assert(!hoid
.is_temp());
508 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
// Restore a stashed object version on a non-temp object.
//
// Asserts `hoid` is not a temp object.  One transaction operation addresses
// the current NO_GEN instance; then collection_move_rename() is called with
// the instance at generation `old_version` and the NO_GEN instance, in that
// order.
//
// NOTE(review): the name of the first transaction call and the collection
// arguments are on lines not visible in this listing -- presumably the current
// object is removed before the rename; confirm.
512 void PGBackend::rollback_stash(
513 const hobject_t
&hoid
,
514 version_t old_version
,
515 ObjectStore::Transaction
*t
) {
516 ceph_assert(!hoid
.is_temp());
519 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
520 t
->collection_move_rename(
522 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
524 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
// Variant of rollback_stash for the try_rmobject case.
//
// Asserts `hoid` is not a temp object.  One transaction operation addresses
// the current NO_GEN instance; a second takes the instance at generation
// `old_version` followed by the NO_GEN instance.
//
// NOTE(review): the names of both transaction calls are on lines not visible
// in this listing; how this differs from rollback_stash cannot be determined
// from here.
527 void PGBackend::rollback_try_stash(
528 const hobject_t
&hoid
,
529 version_t old_version
,
530 ObjectStore::Transaction
*t
) {
531 ceph_assert(!hoid
.is_temp());
534 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
537 ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
),
538 ghobject_t(hoid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
));
// Undo extent overwrites of `hoid` using the instance at generation `gen`.
//
// For every entry in `extents`, a transaction operation is issued with the
// generation-`gen` instance and the NO_GEN instance on this shard, in that
// order.  After the loop a further operation addresses the generation-`gen`
// instance alone.
//
// NOTE(review): the `gen` parameter declaration and the names and remaining
// arguments of the transaction calls are on lines not visible in this listing
// -- presumably each extent is copied back and the generation object is then
// removed; confirm against the full file.
541 void PGBackend::rollback_extents(
543 const vector
<pair
<uint64_t, uint64_t> > &extents
,
544 const hobject_t
&hoid
,
545 ObjectStore::Transaction
*t
) {
546 auto shard
= get_parent()->whoami_shard().shard
;
547 for (auto &&extent
: extents
) {
550 ghobject_t(hoid
, gen
, shard
),
551 ghobject_t(hoid
, ghobject_t::NO_GEN
, shard
),
558 ghobject_t(hoid
, gen
, shard
));
// Operate on the rollback copy of a non-temp object: asserts `hoid` is not a
// temp object and addresses, in collection `coll`, the instance of `hoid` at
// generation `old_version` on this shard.
//
// NOTE(review): the transaction call taking these arguments is on a line not
// visible in this listing -- presumably a removal on `t`; confirm.
561 void PGBackend::trim_rollback_object(
562 const hobject_t
&hoid
,
563 version_t old_version
,
564 ObjectStore::Transaction
*t
) {
565 ceph_assert(!hoid
.is_temp());
567 coll
, ghobject_t(hoid
, old_version
, get_parent()->whoami_shard().shard
));
// Factory: construct the backend matching the pool type.
//
// TYPE_REPLICATED returns a new ReplicatedBackend built from (l, coll, ch,
// store, cct).  TYPE_ERASURE asks the ErasureCodePluginRegistry singleton to
// build the codec named by profile["plugin"], looking plugins up in the
// directory given by the "erasure_code_dir" config value, asserts that a codec
// was produced, and returns a new ECBackend.  The caller receives a raw
// pointer allocated with `new`.
//
// NOTE(review): the iterator returned by profile.find("plugin") is
// dereferenced with no end() check visible in this listing; a profile lacking
// a "plugin" key would be undefined behaviour -- confirm that callers
// guarantee the key.
// NOTE(review): the remaining parameters, the switch statement, the other
// factory() arguments, the ECBackend constructor arguments and any default
// case are on lines not visible in this listing.
570 PGBackend
*PGBackend::build_pg_backend(
571 const pg_pool_t
&pool
,
572 const map
<string
,string
>& profile
,
575 ObjectStore::CollectionHandle
&ch
,
579 ErasureCodeProfile ec_profile
= profile
;
581 case pg_pool_t::TYPE_REPLICATED
: {
582 return new ReplicatedBackend(l
, coll
, ch
, store
, cct
);
584 case pg_pool_t::TYPE_ERASURE
: {
585 ErasureCodeInterfaceRef ec_impl
;
587 ceph::ErasureCodePluginRegistry::instance().factory(
588 profile
.find("plugin")->second
,
589 cct
->_conf
.get_val
<std::string
>("erasure_code_dir"),
593 ceph_assert(ec_impl
);
594 return new ECBackend(
609 int PGBackend::be_scan_list(
611 ScrubMapBuilder
&pos
)
613 dout(10) << __func__
<< " " << pos
<< dendl
;
614 ceph_assert(!pos
.done());
615 ceph_assert(pos
.pos
< pos
.ls
.size());
616 hobject_t
& poid
= pos
.ls
[pos
.pos
];
622 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
626 ScrubMap::object
&o
= map
.objects
[poid
];
628 ceph_assert(!o
.negative
);
632 poid
, ghobject_t::NO_GEN
, get_parent()->whoami_shard().shard
),
636 r
= be_deep_scrub(poid
, map
, pos
, o
);
638 dout(25) << __func__
<< " " << poid
<< dendl
;
639 } else if (r
== -ENOENT
) {
640 dout(25) << __func__
<< " " << poid
<< " got " << r
641 << ", skipping" << dendl
;
642 } else if (r
== -EIO
) {
643 dout(25) << __func__
<< " " << poid
<< " got " << r
644 << ", stat_error" << dendl
;
645 ScrubMap::object
&o
= map
.objects
[poid
];
648 derr
<< __func__
<< " got: " << cpp_strerror(r
) << dendl
;
651 if (r
== -EINPROGRESS
) {