1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
17 #include "PrimaryLogPG.h"
25 #include <boost/intrusive_ptr.hpp>
26 #include <boost/tuple/tuple.hpp>
28 #include "PrimaryLogPG.h"
30 #include "cls/cas/cls_cas_ops.h"
31 #include "common/CDC.h"
32 #include "common/EventTrace.h"
33 #include "common/ceph_crypto.h"
34 #include "common/config.h"
35 #include "common/errno.h"
36 #include "common/perf_counters.h"
37 #include "common/scrub_types.h"
38 #include "include/compat.h"
39 #include "json_spirit/json_spirit_reader.h"
40 #include "json_spirit/json_spirit_value.h"
41 #include "messages/MCommandReply.h"
42 #include "messages/MOSDBackoff.h"
43 #include "messages/MOSDOp.h"
44 #include "messages/MOSDPGBackfill.h"
45 #include "messages/MOSDPGBackfillRemove.h"
46 #include "messages/MOSDPGLog.h"
47 #include "messages/MOSDPGScan.h"
48 #include "messages/MOSDPGTrim.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDRepScrub.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "mon/MonClient.h"
54 #include "objclass/objclass.h"
55 #include "osd/ClassHandler.h"
56 #include "osdc/Objecter.h"
57 #include "osd/scrubber/PrimaryLogScrub.h"
58 #include "osd/scrubber/ScrubStore.h"
59 #include "osd/scrubber/pg_scrubber.h"
62 #include "OpRequest.h"
66 // required includes order:
67 #include "json_spirit/json_spirit_value.h"
68 #include "json_spirit/json_spirit_reader.h"
69 #include "include/ceph_assert.h" // json_spirit clobbers it
70 #include "include/rados/rados_types.hpp"
73 #include "tracing/osd.h"
75 #define tracepoint(...)
78 #define dout_context cct
79 #define dout_subsys ceph_subsys_osd
80 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
82 #define dout_prefix _prefix(_dout, this)
84 #include "osd_tracer.h"
86 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
93 using std::make_unique
;
95 using std::ostringstream
;
98 using std::string_view
;
99 using std::stringstream
;
100 using std::unique_ptr
;
103 using ceph::bufferlist
;
104 using ceph::bufferptr
;
105 using ceph::Formatter
;
107 using ceph::decode_noclear
;
109 using ceph::encode_destructively
;
111 using namespace ceph::osd::scheduler
;
112 using TOPNSPC::common::cmd_getval
;
113 using TOPNSPC::common::cmd_getval_or
;
115 template <typename T
>
116 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
117 return pg
->gen_prefix(*_dout
);
121 * The CopyCallback class defines an interface for completions to the
122 * copy_start code. Users of the copy infrastructure must implement
123 * one and give an instance of the class to start_copy.
125 * The implementer is responsible for making sure that the CopyCallback
126 * can associate itself with the correct copy operation.
128 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
132 * results.get<0>() is the return code: 0 for success; -ECANCELED if
133 * the operation was cancelled by the local OSD; -errno for other issues.
134 * results.get<1>() is a pointer to a CopyResults object, which you are
135 * responsible for deleting.
137 void finish(CopyCallbackResults results_
) override
= 0;
140 /// Provide the final size of the copied object to the CopyCallback
141 ~CopyCallback() override
{}
144 template <typename T
>
145 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
147 unique_ptr
<GenContext
<T
>> c
;
150 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
151 : pg(pg
), c(c
), e(e
) {}
152 void finish(T t
) override
{
153 std::scoped_lock locker
{*pg
};
154 if (pg
->pg_has_reset_since(e
))
157 c
.release()->complete(t
);
159 bool sync_finish(T t
) {
160 // we assume here all blessed/wrapped Contexts can complete synchronously.
161 c
.release()->complete(t
);
166 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
167 GenContext
<ThreadPool::TPHandle
&> *c
) {
168 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
169 this, c
, get_osdmap_epoch());
172 template <typename T
>
173 class PrimaryLogPG::UnlockedBlessedGenContext
: public GenContext
<T
> {
175 unique_ptr
<GenContext
<T
>> c
;
178 UnlockedBlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
179 : pg(pg
), c(c
), e(e
) {}
180 void finish(T t
) override
{
181 if (pg
->pg_has_reset_since(e
))
184 c
.release()->complete(t
);
186 bool sync_finish(T t
) {
187 // we assume here all blessed/wrapped Contexts can complete synchronously.
188 c
.release()->complete(t
);
193 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_unlocked_gencontext(
194 GenContext
<ThreadPool::TPHandle
&> *c
) {
195 return new UnlockedBlessedGenContext
<ThreadPool::TPHandle
&>(
196 this, c
, get_osdmap_epoch());
199 class PrimaryLogPG::BlessedContext
: public Context
{
201 unique_ptr
<Context
> c
;
204 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
205 : pg(pg
), c(c
), e(e
) {}
206 void finish(int r
) override
{
207 std::scoped_lock locker
{*pg
};
208 if (pg
->pg_has_reset_since(e
))
211 c
.release()->complete(r
);
213 bool sync_finish(int r
) override
{
214 // we assume here all blessed/wrapped Contexts can complete synchronously.
215 c
.release()->complete(r
);
220 Context
*PrimaryLogPG::bless_context(Context
*c
) {
221 return new BlessedContext(this, c
, get_osdmap_epoch());
224 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
228 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
230 void finish(int r
) override
{
231 pg
->object_context_destructor_callback(obc
);
235 struct OnReadComplete
: public Context
{
237 PrimaryLogPG::OpContext
*opcontext
;
240 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
241 void finish(int r
) override
{
242 opcontext
->finish_read(pg
);
244 ~OnReadComplete() override
{}
247 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
249 ObjectContextRef obc
;
251 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
253 bool sync_finish(int r
) override
{
254 pg
->_applied_recovered_object(obc
);
257 void finish(int r
) override
{
258 std::scoped_lock locker
{*pg
};
259 pg
->_applied_recovered_object(obc
);
263 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
266 eversion_t last_complete
;
268 C_OSD_CommittedPushedObject(
269 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
270 pg(p
), epoch(epoch
), last_complete(lc
) {
272 void finish(int r
) override
{
273 pg
->_committed_pushed_object(epoch
, last_complete
);
277 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
280 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
282 bool sync_finish(int r
) override
{
283 pg
->_applied_recovered_object_replica();
286 void finish(int r
) override
{
287 std::scoped_lock locker
{*pg
};
288 pg
->_applied_recovered_object_replica();
293 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
296 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
297 pair
<bufferlist
*, Context
*> > > in
;
298 in
.swap(pending_async_reads
);
299 pg
->pgbackend
->objects_read_async(
302 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
304 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
306 ceph_assert(inflightreads
> 0);
308 if (async_reads_complete()) {
309 ceph_assert(pg
->in_progress_async_reads
.size());
310 ceph_assert(pg
->in_progress_async_reads
.front().second
== this);
311 pg
->in_progress_async_reads
.pop_front();
313 // Restart the op context now that all reads have been
314 // completed. Read failures will be handled by the op finisher
315 pg
->execute_ctx(this);
319 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
321 PrimaryLogPG::CopyResults
*results
= nullptr;
322 PrimaryLogPG::OpContext
*ctx
;
324 uint32_t truncate_seq
;
325 uint64_t truncate_size
;
326 bool have_truncate
= false;
328 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
329 : ctx(ctx
), osd_op(osd_op
) {
331 ~CopyFromCallback() override
{}
333 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
334 results
= results_
.get
<1>();
335 int r
= results_
.get
<0>();
337 // Only use truncate_{seq,size} from the original object if the client
338 // did not sent us these parameters
339 if (!have_truncate
) {
340 truncate_seq
= results
->truncate_seq
;
341 truncate_size
= results
->truncate_size
;
344 // for finish_copyfrom
345 ctx
->user_at_version
= results
->user_version
;
348 ctx
->pg
->execute_ctx(ctx
);
350 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
352 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
353 } else if (results
->should_requeue
) {
355 ctx
->pg
->requeue_op(ctx
->op
);
357 ctx
->pg
->close_op_ctx(ctx
);
361 bool is_temp_obj_used() {
362 return results
->started_temp_obj
;
364 uint64_t get_data_size() {
365 return results
->object_size
;
367 void set_truncate(uint32_t seq
, uint64_t size
) {
369 truncate_size
= size
;
370 have_truncate
= true;
374 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
375 CopyFromCallback
*copy_from_callback
;
377 explicit CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
378 : copy_from_callback(copy_from_callback
) {
381 int execute() override
{
382 // instance will be destructed after this method completes
383 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
388 // ======================
389 // PGBackend::Listener
391 void PrimaryLogPG::on_local_recover(
392 const hobject_t
&hoid
,
393 const ObjectRecoveryInfo
&_recovery_info
,
394 ObjectContextRef obc
,
396 ObjectStore::Transaction
*t
399 dout(10) << __func__
<< ": " << hoid
<< dendl
;
401 ObjectRecoveryInfo
recovery_info(_recovery_info
);
402 clear_object_snap_mapping(t
, hoid
);
403 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
404 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
406 dout(20) << " snapset " << recovery_info
.ss
<< dendl
;
407 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
408 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
409 snaps
.insert(p
->second
.begin(), p
->second
.end());
410 dout(20) << " snaps " << snaps
<< dendl
;
416 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
419 if (!is_delete
&& recovery_state
.get_pg_log().get_missing().is_missing(recovery_info
.soid
) &&
420 recovery_state
.get_pg_log().get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
421 ceph_assert(is_primary());
422 const pg_log_entry_t
*latest
= recovery_state
.get_pg_log().get_log().objects
.find(recovery_info
.soid
)->second
;
423 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
424 latest
->reverting_to
== recovery_info
.version
) {
425 dout(10) << " got old revert version " << recovery_info
.version
426 << " for " << *latest
<< dendl
;
427 recovery_info
.version
= latest
->version
;
428 // update the attr to the revert event version
429 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
430 recovery_info
.oi
.version
= latest
->version
;
432 encode(recovery_info
.oi
, bl
,
433 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
434 ceph_assert(!pool
.info
.is_erasure());
435 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
437 obc
->attr_cache
[OI_ATTR
] = bl
;
441 // keep track of active pushes for scrub
444 recovery_state
.recover_got(
446 recovery_info
.version
,
452 obc
->obs
.exists
= true;
454 bool got
= obc
->get_recovery_read();
457 ceph_assert(recovering
.count(obc
->obs
.oi
.soid
));
458 recovering
[obc
->obs
.oi
.soid
] = obc
;
459 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
462 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
464 publish_stats_to_osd();
465 release_backoffs(hoid
);
466 if (!is_unreadable_object(hoid
)) {
467 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
468 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
469 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
470 requeue_ops(unreadable_object_entry
->second
);
471 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
475 t
->register_on_applied(
476 new C_OSD_AppliedRecoveredObjectReplica(this));
480 t
->register_on_commit(
481 new C_OSD_CommittedPushedObject(
484 info
.last_complete
));
487 void PrimaryLogPG::on_global_recover(
488 const hobject_t
&soid
,
489 const object_stat_sum_t
&stat_diff
,
492 recovery_state
.object_recovered(soid
, stat_diff
);
493 publish_stats_to_osd();
494 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
495 auto i
= recovering
.find(soid
);
496 ceph_assert(i
!= recovering
.end());
498 if (i
->second
&& i
->second
->rwstate
.recovery_read_marker
) {
499 // recover missing won't have had an obc, but it gets filled in
500 // during on_local_recover
501 ceph_assert(i
->second
);
502 list
<OpRequestRef
> requeue_list
;
503 i
->second
->drop_recovery_read(&requeue_list
);
504 requeue_ops(requeue_list
);
507 backfills_in_flight
.erase(soid
);
510 finish_recovery_op(soid
);
511 release_backoffs(soid
);
512 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
513 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
514 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
515 requeue_ops(degraded_object_entry
->second
);
516 waiting_for_degraded_object
.erase(degraded_object_entry
);
518 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
519 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
520 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
521 requeue_ops(unreadable_object_entry
->second
);
522 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
524 finish_degraded_object(soid
);
527 void PrimaryLogPG::schedule_recovery_work(
528 GenContext
<ThreadPool::TPHandle
&> *c
)
530 osd
->queue_recovery_context(this, c
);
533 void PrimaryLogPG::replica_clear_repop_obc(
534 const vector
<pg_log_entry_t
> &logv
,
535 ObjectStore::Transaction
&t
)
537 for (auto &&e
: logv
) {
538 /* Have to blast all clones, they share a snapset */
539 object_contexts
.clear_range(
540 e
.soid
.get_object_boundary(), e
.soid
.get_head());
542 snapset_contexts
.find(e
.soid
.get_head()) ==
543 snapset_contexts
.end());
547 bool PrimaryLogPG::should_send_op(
549 const hobject_t
&hoid
) {
550 if (peer
== get_primary())
552 ceph_assert(recovery_state
.has_peer_info(peer
));
554 hoid
.pool
!= (int64_t)info
.pgid
.pool() ||
555 hoid
<= last_backfill_started
||
556 hoid
<= recovery_state
.get_peer_info(peer
).last_backfill
;
558 ceph_assert(is_backfill_target(peer
));
559 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
560 << ", object " << hoid
561 << " beyond std::max(last_backfill_started "
562 << ", peer_info[peer].last_backfill "
563 << recovery_state
.get_peer_info(peer
).last_backfill
567 if (is_async_recovery_target(peer
) &&
568 recovery_state
.get_peer_missing(peer
).is_missing(hoid
)) {
570 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
571 << ", object " << hoid
572 << " which is pending recovery in async_recovery_targets" << dendl
;
578 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
579 int peer
, epoch_t from_epoch
)
581 return osd
->get_con_osd_cluster(peer
, from_epoch
);
584 PerfCounters
*PrimaryLogPG::get_logger()
590 // ====================
593 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
595 return recovery_state
.get_pg_log().get_missing().get_items().count(soid
);
598 void PrimaryLogPG::maybe_kick_recovery(
599 const hobject_t
&soid
)
602 bool work_started
= false;
603 if (!recovery_state
.get_missing_loc().needs_recovery(soid
, &v
))
606 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
607 if (p
!= recovering
.end()) {
608 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
609 } else if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
610 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
612 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
613 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
614 if (is_missing_object(soid
)) {
615 recover_missing(soid
, v
, CEPH_MSG_PRIO_HIGH
, h
);
616 } else if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
617 prep_object_replica_deletes(soid
, v
, h
, &work_started
);
619 prep_object_replica_pushes(soid
, v
, h
, &work_started
);
621 pgbackend
->run_recovery_op(h
, CEPH_MSG_PRIO_HIGH
);
625 void PrimaryLogPG::wait_for_unreadable_object(
626 const hobject_t
& soid
, OpRequestRef op
)
628 ceph_assert(is_unreadable_object(soid
));
629 maybe_kick_recovery(soid
);
630 waiting_for_unreadable_object
[soid
].push_back(op
);
631 op
->mark_delayed("waiting for missing object");
634 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
636 /* The conditions below may clear (on_local_recover, before we queue
637 * the transaction) before we actually requeue the degraded waiters
638 * in on_global_recover after the transaction completes.
640 if (waiting_for_degraded_object
.count(soid
))
642 if (recovery_state
.get_pg_log().get_missing().get_items().count(soid
))
644 ceph_assert(!get_acting_recovery_backfill().empty());
645 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
646 i
!= get_acting_recovery_backfill().end();
648 if (*i
== get_primary()) continue;
649 pg_shard_t peer
= *i
;
650 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(peer
);
651 // If an object is missing on an async_recovery_target, return false.
652 // This will not block the op and the object is async recovered later.
653 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
654 peer_missing_entry
->second
.get_items().count(soid
)) {
655 if (is_async_recovery_target(peer
))
660 // Object is degraded if after last_backfill AND
661 // we are backfilling it
662 if (is_backfill_target(peer
) &&
663 recovery_state
.get_peer_info(peer
).last_backfill
<= soid
&&
664 last_backfill_started
>= soid
&&
665 backfills_in_flight
.count(soid
))
671 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t
& soid
)
673 for (auto &i
: get_async_recovery_targets()) {
674 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(i
);
675 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
676 peer_missing_entry
->second
.get_items().count(soid
)) {
677 dout(30) << __func__
<< " " << soid
<< dendl
;
684 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
686 ceph_assert(is_degraded_or_backfilling_object(soid
) || is_degraded_on_async_recovery_target(soid
));
688 maybe_kick_recovery(soid
);
689 waiting_for_degraded_object
[soid
].push_back(op
);
690 op
->mark_delayed("waiting for degraded object");
693 void PrimaryLogPG::block_write_on_full_cache(
694 const hobject_t
& _oid
, OpRequestRef op
)
696 const hobject_t oid
= _oid
.get_head();
697 dout(20) << __func__
<< ": blocking object " << oid
698 << " on full cache" << dendl
;
699 objects_blocked_on_cache_full
.insert(oid
);
700 waiting_for_cache_not_full
.push_back(op
);
701 op
->mark_delayed("waiting for cache not full");
704 void PrimaryLogPG::block_for_clean(
705 const hobject_t
& oid
, OpRequestRef op
)
707 dout(20) << __func__
<< ": blocking object " << oid
708 << " on primary repair" << dendl
;
709 waiting_for_clean_to_primary_repair
.push_back(op
);
710 op
->mark_delayed("waiting for clean to repair");
713 void PrimaryLogPG::block_write_on_snap_rollback(
714 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
716 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
717 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
718 // otherwise, we'd have blocked in do_op
719 ceph_assert(oid
.is_head());
720 ceph_assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
722 * We block the head object here.
724 * Let's assume that there is racing read When the head object is being rollbacked.
725 * Since the two different ops can trigger promote_object() with the same source,
726 * infinite loop happens by canceling ops each other.
727 * To avoid this, we block the head object during rollback.
728 * So, the racing read will be blocked until the rollback is completed.
729 * see also: https://tracker.ceph.com/issues/49726
731 ObjectContextRef head_obc
= get_object_context(oid
, false);
732 head_obc
->start_block();
733 objects_blocked_on_snap_promotion
[oid
] = obc
;
734 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
737 void PrimaryLogPG::block_write_on_degraded_snap(
738 const hobject_t
& snap
, OpRequestRef op
)
740 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
741 << " on degraded snap " << snap
<< dendl
;
742 // otherwise, we'd have blocked in do_op
743 ceph_assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
744 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
745 wait_for_degraded_object(snap
, op
);
748 bool PrimaryLogPG::maybe_await_blocked_head(
749 const hobject_t
&hoid
,
752 ObjectContextRef obc
;
753 obc
= object_contexts
.lookup(hoid
.get_head());
755 if (obc
->is_blocked()) {
756 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
765 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
767 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
768 waiting_for_blocked_object
[soid
].push_back(op
);
769 op
->mark_delayed("waiting for blocked object");
772 void PrimaryLogPG::maybe_force_recovery()
774 // no force if not in degraded/recovery/backfill states
775 if (!is_degraded() &&
776 !state_test(PG_STATE_RECOVERING
|
777 PG_STATE_RECOVERY_WAIT
|
778 PG_STATE_BACKFILLING
|
779 PG_STATE_BACKFILL_WAIT
|
780 PG_STATE_BACKFILL_TOOFULL
))
783 if (recovery_state
.get_pg_log().get_log().approx_size() <
784 cct
->_conf
->osd_max_pg_log_entries
*
785 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
788 // find the oldest missing object
789 version_t min_version
= recovery_state
.get_pg_log().get_log().head
.version
;
791 if (!recovery_state
.get_pg_log().get_missing().get_rmissing().empty()) {
792 min_version
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->first
;
793 soid
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->second
;
795 ceph_assert(!get_acting_recovery_backfill().empty());
796 for (set
<pg_shard_t
>::iterator it
= get_acting_recovery_backfill().begin();
797 it
!= get_acting_recovery_backfill().end();
799 if (*it
== get_primary()) continue;
800 pg_shard_t peer
= *it
;
801 auto it_missing
= recovery_state
.get_peer_missing().find(peer
);
802 if (it_missing
!= recovery_state
.get_peer_missing().end() &&
803 !it_missing
->second
.get_rmissing().empty()) {
804 const auto& min_obj
= recovery_state
.get_peer_missing(peer
).get_rmissing().begin();
805 dout(20) << __func__
<< " peer " << peer
<< " min_version " << min_obj
->first
806 << " oid " << min_obj
->second
<< dendl
;
807 if (min_version
> min_obj
->first
) {
808 min_version
= min_obj
->first
;
809 soid
= min_obj
->second
;
815 if (soid
!= hobject_t())
816 maybe_kick_recovery(soid
);
819 bool PrimaryLogPG::check_laggy(OpRequestRef
& op
)
821 assert(HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
823 if (state_test(PG_STATE_WAIT
)) {
824 dout(10) << __func__
<< " PG is WAIT state" << dendl
;
825 } else if (!state_test(PG_STATE_LAGGY
)) {
826 auto mnow
= osd
->get_mnow();
827 auto ru
= recovery_state
.get_readable_until();
834 << " > readable_until " << ru
<< dendl
;
837 osd
->reply_op_error(op
, -EAGAIN
);
842 state_set(PG_STATE_LAGGY
);
843 publish_stats_to_osd();
845 dout(10) << __func__
<< " not readable" << dendl
;
846 waiting_for_readable
.push_back(op
);
847 op
->mark_delayed("waiting for readable");
851 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef
& op
)
853 assert(HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
855 if (!state_test(PG_STATE_WAIT
) && !state_test(PG_STATE_LAGGY
)) {
856 return true; // not laggy
858 dout(10) << __func__
<< " not readable" << dendl
;
859 waiting_for_readable
.push_front(op
);
860 op
->mark_delayed("waiting for readable");
864 void PrimaryLogPG::recheck_readable()
866 if (!is_wait() && !is_laggy()) {
867 dout(20) << __func__
<< " wasn't wait or laggy" << dendl
;
870 auto mnow
= osd
->get_mnow();
873 auto prior_readable_until_ub
= recovery_state
.get_prior_readable_until_ub();
874 if (mnow
< prior_readable_until_ub
) {
875 dout(10) << __func__
<< " still wait (mnow " << mnow
876 << " < prior_readable_until_ub " << prior_readable_until_ub
879 dout(10) << __func__
<< " no longer wait (mnow " << mnow
880 << " >= prior_readable_until_ub " << prior_readable_until_ub
882 state_clear(PG_STATE_WAIT
);
883 recovery_state
.clear_prior_readable_until_ub();
888 auto ru
= recovery_state
.get_readable_until();
889 if (ru
== ceph::signedspan::zero()) {
890 dout(10) << __func__
<< " still laggy (mnow " << mnow
891 << ", readable_until zero)" << dendl
;
892 } else if (mnow
>= ru
) {
893 dout(10) << __func__
<< " still laggy (mnow " << mnow
894 << " >= readable_until " << ru
<< ")" << dendl
;
896 dout(10) << __func__
<< " no longer laggy (mnow " << mnow
897 << " < readable_until " << ru
<< ")" << dendl
;
898 state_clear(PG_STATE_LAGGY
);
903 publish_stats_to_osd();
905 if (!is_laggy() && !is_wait()) {
906 requeue_ops(waiting_for_readable
);
910 bool PrimaryLogPG::pgls_filter(const PGLSFilter
& filter
, const hobject_t
& sobj
)
914 // If filter has expressed an interest in an xattr, load it.
915 if (!filter
.get_xattr().empty()) {
916 int ret
= pgbackend
->objects_get_attr(
920 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
.get_xattr() << ") returned " << ret
<< dendl
;
922 if (ret
!= -ENODATA
|| filter
.reject_empty_xattr()) {
928 return filter
.filter(sobj
, bl
);
931 std::pair
<int, std::unique_ptr
<const PGLSFilter
>>
932 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator
& iter
)
935 // storing non-const PGLSFilter for the sake of ::init()
936 std::unique_ptr
<PGLSFilter
> filter
;
941 catch (ceph::buffer::error
& e
) {
942 return { -EINVAL
, nullptr };
945 if (type
.compare("plain") == 0) {
946 filter
= std::make_unique
<PGLSPlainFilter
>();
948 std::size_t dot
= type
.find('.');
949 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
950 return { -EINVAL
, nullptr };
953 const std::string class_name
= type
.substr(0, dot
);
954 const std::string filter_name
= type
.substr(dot
+ 1);
955 ClassHandler::ClassData
*cls
= NULL
;
956 int r
= ClassHandler::get_instance().open_class(class_name
, &cls
);
958 derr
<< "Error opening class '" << class_name
<< "': "
959 << cpp_strerror(r
) << dendl
;
960 if (r
!= -EPERM
) // propagate permission error
962 return { r
, nullptr };
967 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
968 if (class_filter
== NULL
) {
969 derr
<< "Error finding filter '" << filter_name
<< "' in class "
970 << class_name
<< dendl
;
971 return { -EINVAL
, nullptr };
973 filter
.reset(class_filter
->fn());
975 // Object classes are obliged to return us something, but let's
976 // give an error rather than asserting out.
977 derr
<< "Buggy class " << class_name
<< " failed to construct "
978 "filter " << filter_name
<< dendl
;
979 return { -EINVAL
, nullptr };
984 int r
= filter
->init(iter
);
986 derr
<< "Error initializing filter " << type
<< ": "
987 << cpp_strerror(r
) << dendl
;
988 return { -EINVAL
, nullptr };
990 // Successfully constructed and initialized, return it.
991 return std::make_pair(0, std::move(filter
));
996 // ==========================================================
998 void PrimaryLogPG::do_command(
999 const string_view
& orig_prefix
,
1000 const cmdmap_t
& cmdmap
,
1001 const bufferlist
& idata
,
1002 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
1005 cmd_getval(cmdmap
, "format", format
);
1006 std::unique_ptr
<Formatter
> f(Formatter::create(
1007 format
, "json-pretty", "json-pretty"));
1009 stringstream ss
; // stderr error message stream
1010 bufferlist outbl
; // if empty at end, we'll dump formatter as output
1012 // get final prefix:
1013 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
1014 // - ceph tell <pgid> foo -> prefix=foo
1015 string
prefix(orig_prefix
);
1017 cmd_getval(cmdmap
, "cmd", command
);
1018 if (command
.size()) {
1022 if (prefix
== "query") {
1023 f
->open_object_section("pg");
1024 f
->dump_stream("snap_trimq") << snap_trimq
;
1025 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
1026 recovery_state
.dump_peering_state(f
.get());
1028 f
->open_array_section("recovery_state");
1029 handle_query_state(f
.get());
1032 if (is_primary() && is_active() && m_scrubber
) {
1033 m_scrubber
->dump_scrubber(f
.get(), m_planned_scrub
);
1036 f
->open_object_section("agent_state");
1038 agent_state
->dump(f
.get());
1044 else if (prefix
== "mark_unfound_lost") {
1046 cmd_getval(cmdmap
, "mulcmd", mulcmd
);
1048 if (mulcmd
== "revert") {
1049 if (pool
.info
.is_erasure()) {
1050 ss
<< "mode must be 'delete' for ec pool";
1054 mode
= pg_log_entry_t::LOST_REVERT
;
1055 } else if (mulcmd
== "delete") {
1056 mode
= pg_log_entry_t::LOST_DELETE
;
1058 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1062 ceph_assert(mode
== pg_log_entry_t::LOST_REVERT
||
1063 mode
== pg_log_entry_t::LOST_DELETE
);
1065 if (!is_primary()) {
1066 ss
<< "not primary";
1071 uint64_t unfound
= recovery_state
.get_missing_loc().num_unfound();
1073 ss
<< "pg has no unfound objects";
1074 goto out
; // make command idempotent
1077 if (!recovery_state
.all_unfound_are_queried_or_lost(get_osdmap())) {
1078 ss
<< "pg has " << unfound
1079 << " unfound objects but we haven't probed all sources, not marking lost";
1084 mark_all_unfound_lost(mode
, on_finish
);
1088 else if (prefix
== "list_unfound") {
1091 bool show_offset
= false;
1092 if (cmd_getval(cmdmap
, "offset", offset_json
)) {
1093 json_spirit::Value v
;
1095 if (!json_spirit::read(offset_json
, v
))
1096 throw std::runtime_error("bad json");
1098 } catch (std::runtime_error
& e
) {
1099 ss
<< "error parsing offset: " << e
.what();
1105 f
->open_object_section("missing");
1107 f
->open_object_section("offset");
1108 offset
.dump(f
.get());
1111 auto &needs_recovery_map
= recovery_state
.get_missing_loc()
1112 .get_needs_recovery();
1113 f
->dump_int("num_missing", needs_recovery_map
.size());
1114 f
->dump_int("num_unfound", get_num_unfound());
1115 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1116 needs_recovery_map
.upper_bound(offset
);
1118 f
->open_array_section("objects");
1120 for (; p
!= needs_recovery_map
.end() &&
1121 num
< cct
->_conf
->osd_command_max_records
;
1123 if (recovery_state
.get_missing_loc().is_unfound(p
->first
)) {
1124 f
->open_object_section("object");
1126 f
->open_object_section("oid");
1127 p
->first
.dump(f
.get());
1130 p
->second
.dump(f
.get()); // have, need keys
1132 f
->open_array_section("locations");
1133 for (auto &&r
: recovery_state
.get_missing_loc().get_locations(
1135 f
->dump_stream("shard") << r
;
1145 // Get possible locations of missing objects from pg information
1146 PeeringState::QueryUnfound
q(f
.get());
1147 recovery_state
.handle_event(q
, 0);
1148 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1152 else if (prefix
== "scrub" ||
1153 prefix
== "deep_scrub") {
1154 bool deep
= (prefix
== "deep_scrub");
1155 int64_t time
= cmd_getval_or
<int64_t>(cmdmap
, "time", 0);
1158 const pg_pool_t
*p
= &pool
.info
;
1159 double pool_scrub_max_interval
= 0;
1160 double scrub_max_interval
;
1162 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
1163 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1164 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
1166 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
1167 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1168 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
1170 // Instead of marking must_scrub force a schedule scrub
1171 utime_t stamp
= ceph_clock_now();
1173 stamp
-= scrub_max_interval
;
1175 stamp
-= (float)time
;
1176 stamp
-= 100.0; // push back last scrub more for good measure
1178 set_last_deep_scrub_stamp(stamp
);
1180 set_last_scrub_stamp(stamp
); // for 'deep' as well, as we use this value to order scrubs
1181 f
->open_object_section("result");
1182 f
->dump_bool("deep", deep
);
1183 f
->dump_stream("stamp") << stamp
;
1186 ss
<< "Not primary";
1189 outbl
.append(ss
.str());
1192 else if (prefix
== "block" || prefix
== "unblock" || prefix
== "set" ||
1193 prefix
== "unset") {
1195 cmd_getval(cmdmap
, "value", value
);
1198 ret
= m_scrubber
->asok_debug(prefix
, value
, f
.get(), ss
);
1199 f
->open_object_section("result");
1200 f
->dump_bool("success", true);
1203 ss
<< "Not primary";
1206 outbl
.append(ss
.str());
1210 ss
<< "prefix '" << prefix
<< "' not implemented";
1214 if (ret
>= 0 && outbl
.length() == 0) {
1217 on_finish(ret
, ss
.str(), outbl
);
1221 // ==========================================================
1223 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1225 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1226 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1227 dout(10) << "do_pg_op " << *m
<< dendl
;
1232 string cname
, mname
;
1234 snapid_t snapid
= m
->get_snapid();
1236 vector
<OSDOp
> ops
= m
->ops
;
1238 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1239 std::unique_ptr
<const PGLSFilter
> filter
;
1241 auto bp
= p
->indata
.cbegin();
1243 case CEPH_OSD_OP_PGNLS_FILTER
:
1248 catch (const ceph::buffer::error
& e
) {
1249 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1253 std::tie(result
, filter
) = get_pgls_filter(bp
);
1257 ceph_assert(filter
);
1261 case CEPH_OSD_OP_PGNLS
:
1262 if (snapid
!= CEPH_NOSNAP
) {
1266 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1267 dout(10) << " pgnls pg=" << m
->get_pg()
1268 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1269 << " != " << info
.pgid
<< dendl
;
1272 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1275 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
1277 // read into a buffer
1278 vector
<hobject_t
> sentries
;
1279 pg_nls_response_t response
;
1281 decode(response
.handle
, bp
);
1283 catch (const ceph::buffer::error
& e
) {
1284 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1290 hobject_t lower_bound
= response
.handle
;
1291 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1292 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1293 dout(10) << " pgnls lower_bound " << lower_bound
1294 << " pg_end " << pg_end
<< dendl
;
1295 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1296 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1297 // this should only happen with a buggy client.
1298 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1304 hobject_t current
= lower_bound
;
1305 int r
= pgbackend
->objects_list_partial(
1316 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1317 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1318 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1319 hobject_t _max
= hobject_t::get_max();
1321 const hobject_t
&mcand
=
1322 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1324 missing_iter
->first
;
1325 const hobject_t
&lcand
=
1326 ls_iter
== sentries
.end() ?
1330 hobject_t candidate
;
1331 if (mcand
== lcand
) {
1333 if (!mcand
.is_max()) {
1337 } else if (mcand
< lcand
) {
1339 ceph_assert(!mcand
.is_max());
1343 ceph_assert(!lcand
.is_max());
1347 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1348 << " vs lower bound 0x" << lower_bound
.get_hash()
1349 << std::dec
<< dendl
;
1351 if (candidate
>= next
) {
1355 if (response
.entries
.size() == list_size
) {
1360 if (candidate
.snap
!= CEPH_NOSNAP
)
1363 // skip internal namespace
1364 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1367 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1370 // skip wrong namespace
1371 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1372 candidate
.get_namespace() != m
->get_hobj().nspace
)
1375 if (filter
&& !pgls_filter(*filter
, candidate
))
1378 dout(20) << "pgnls item 0x" << std::hex
1379 << candidate
.get_hash()
1380 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1382 << candidate
.oid
.name
<< dendl
;
1384 librados::ListObjectImpl item
;
1385 item
.nspace
= candidate
.get_namespace();
1386 item
.oid
= candidate
.oid
.name
;
1387 item
.locator
= candidate
.get_key();
1388 response
.entries
.push_back(item
);
1391 if (next
.is_max() &&
1392 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1393 ls_iter
== sentries
.end()) {
1396 // Set response.handle to the start of the next PG according
1397 // to the object sort order.
1398 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1400 response
.handle
= next
;
1402 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1403 encode(response
, osd_op
.outdata
);
1404 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1405 << osd_op
.outdata
.length() << dendl
;
1409 case CEPH_OSD_OP_PGLS_FILTER
:
1414 catch (const ceph::buffer::error
& e
) {
1415 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1419 std::tie(result
, filter
) = get_pgls_filter(bp
);
1423 ceph_assert(filter
);
1427 case CEPH_OSD_OP_PGLS
:
1428 if (snapid
!= CEPH_NOSNAP
) {
1432 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1433 dout(10) << " pgls pg=" << m
->get_pg()
1434 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1435 << " != " << info
.pgid
<< dendl
;
1438 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1441 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1442 // read into a buffer
1443 vector
<hobject_t
> sentries
;
1444 pg_ls_response_t response
;
1446 decode(response
.handle
, bp
);
1448 catch (const ceph::buffer::error
& e
) {
1449 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1455 hobject_t current
= response
.handle
;
1456 int r
= pgbackend
->objects_list_partial(
1467 ceph_assert(snapid
== CEPH_NOSNAP
|| recovery_state
.get_pg_log().get_missing().get_items().empty());
1469 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1470 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1471 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1472 hobject_t _max
= hobject_t::get_max();
1474 const hobject_t
&mcand
=
1475 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1477 missing_iter
->first
;
1478 const hobject_t
&lcand
=
1479 ls_iter
== sentries
.end() ?
1483 hobject_t candidate
;
1484 if (mcand
== lcand
) {
1486 if (!mcand
.is_max()) {
1490 } else if (mcand
< lcand
) {
1492 ceph_assert(!mcand
.is_max());
1496 ceph_assert(!lcand
.is_max());
1500 if (candidate
>= next
) {
1504 if (response
.entries
.size() == list_size
) {
1509 if (candidate
.snap
!= CEPH_NOSNAP
)
1512 // skip wrong namespace
1513 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1516 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1519 if (filter
&& !pgls_filter(*filter
, candidate
))
1522 response
.entries
.push_back(make_pair(candidate
.oid
,
1523 candidate
.get_key()));
1525 if (next
.is_max() &&
1526 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1527 ls_iter
== sentries
.end()) {
1530 response
.handle
= next
;
1531 encode(response
, osd_op
.outdata
);
1532 dout(10) << " pgls result=" << result
<< " outdata.length()="
1533 << osd_op
.outdata
.length() << dendl
;
1537 case CEPH_OSD_OP_PG_HITSET_LS
:
1539 list
< pair
<utime_t
,utime_t
> > ls
;
1540 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1541 p
!= info
.hit_set
.history
.end();
1543 ls
.push_back(make_pair(p
->begin
, p
->end
));
1545 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1546 encode(ls
, osd_op
.outdata
);
1550 case CEPH_OSD_OP_PG_HITSET_GET
:
1552 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1553 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1554 // read the current in-memory HitSet, not the version we've
1560 encode(*hit_set
, osd_op
.outdata
);
1561 result
= osd_op
.outdata
.length();
1563 // read an archived HitSet.
1565 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1566 p
!= info
.hit_set
.history
.end();
1568 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1569 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1573 if (oid
== hobject_t()) {
1577 if (!pool
.info
.is_replicated()) {
1578 // FIXME: EC not supported yet
1579 result
= -EOPNOTSUPP
;
1582 if (is_unreadable_object(oid
)) {
1583 wait_for_unreadable_object(oid
, op
);
1586 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1591 case CEPH_OSD_OP_SCRUBLS
:
1592 result
= do_scrub_ls(m
, &osd_op
);
1605 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(),
1606 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1608 reply
->claim_op_out_data(ops
);
1609 reply
->set_result(result
);
1610 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1611 osd
->send_message_osd_client(reply
, m
->get_connection());
1614 int PrimaryLogPG::do_scrub_ls(const MOSDOp
*m
, OSDOp
*osd_op
)
1616 if (m
->get_pg() != info
.pgid
.pgid
) {
1617 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1618 return -EINVAL
; // hmm?
1620 auto bp
= osd_op
->indata
.cbegin();
1624 } catch (ceph::buffer::error
&) {
1625 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1630 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1632 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1635 bool store_queried
= m_scrubber
&& m_scrubber
->get_store_errors(arg
, result
);
1636 if (store_queried
) {
1637 encode(result
, osd_op
->outdata
);
1639 // the scrubber's store is not initialized
1648 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
1650 * @param ctx [in,out] ctx to get locks for
1651 * @return true on success, false if we are queued
1653 bool PrimaryLogPG::get_rw_locks(bool write_ordered
, OpContext
*ctx
)
1655 /* If head_obc, !obc->obs->exists and we will always take the
1656 * snapdir lock *before* the head lock. Since all callers will do
1657 * this (read or write) if we get the first we will be guaranteed
1658 * to get the second.
1660 if (write_ordered
&& ctx
->op
->may_read()) {
1661 ctx
->lock_type
= RWState::RWEXCL
;
1662 } else if (write_ordered
) {
1663 ctx
->lock_type
= RWState::RWWRITE
;
1665 ceph_assert(ctx
->op
->may_read());
1666 ctx
->lock_type
= RWState::RWREAD
;
1669 if (ctx
->head_obc
) {
1670 ceph_assert(!ctx
->obc
->obs
.exists
);
1671 if (!ctx
->lock_manager
.get_lock_type(
1673 ctx
->head_obc
->obs
.oi
.soid
,
1676 ctx
->lock_type
= RWState::RWNONE
;
1680 if (ctx
->lock_manager
.get_lock_type(
1682 ctx
->obc
->obs
.oi
.soid
,
1687 ceph_assert(!ctx
->head_obc
);
1688 ctx
->lock_type
= RWState::RWNONE
;
1696 * @param manager [in] manager with locks to release
1698 void PrimaryLogPG::release_object_locks(
1699 ObcLockManager
&lock_manager
) {
1700 std::list
<std::pair
<ObjectContextRef
, std::list
<OpRequestRef
> > > to_req
;
1701 bool requeue_recovery
= false;
1702 bool requeue_snaptrim
= false;
1703 lock_manager
.put_locks(
1707 if (requeue_recovery
)
1709 if (requeue_snaptrim
)
1710 snap_trimmer_machine
.process_event(TrimWriteUnblocked());
1712 if (!to_req
.empty()) {
1713 // requeue at front of scrub blocking queue if we are blocked by scrub
1714 for (auto &&p
: to_req
) {
1715 if (m_scrubber
->write_blocked_by_scrub(p
.first
->obs
.oi
.soid
.get_head())) {
1716 for (auto& op
: p
.second
) {
1717 op
->mark_delayed("waiting for scrub");
1720 waiting_for_scrub
.splice(
1721 waiting_for_scrub
.begin(),
1725 } else if (is_laggy()) {
1726 for (auto& op
: p
.second
) {
1727 op
->mark_delayed("waiting for readable");
1729 waiting_for_readable
.splice(
1730 waiting_for_readable
.begin(),
1735 requeue_ops(p
.second
);
1741 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1742 const PGPool
&_pool
,
1743 const map
<string
,string
>& ec_profile
, spg_t p
) :
1744 PG(o
, curmap
, _pool
, p
),
1746 PGBackend::build_pg_backend(
1747 _pool
.info
, ec_profile
, this, coll_t(p
), ch
, o
->store
, cct
)),
1748 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1749 new_backfill(false),
1751 snap_trimmer_machine(this)
1753 recovery_state
.set_backend_predicates(
1754 pgbackend
->get_is_readable_predicate(),
1755 pgbackend
->get_is_recoverable_predicate());
1756 snap_trimmer_machine
.initiate();
1758 m_scrubber
= make_unique
<PrimaryLogScrub
>(this);
1761 PrimaryLogPG::~PrimaryLogPG()
1766 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1769 if (oloc
.key
.empty())
1770 src_oloc
.key
= oid
.name
;
1773 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1775 auto m
= op
->get_req
<MOSDBackoff
>();
1776 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1779 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1780 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1781 if (begin
< m
->begin
) {
1787 dout(10) << __func__
<< " backoff ack id " << m
->id
1788 << " [" << begin
<< "," << end
<< ")" << dendl
;
1789 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1792 void PrimaryLogPG::do_request(
1794 ThreadPool::TPHandle
&handle
)
1796 if (op
->osd_trace
) {
1797 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1798 op
->pg_trace
.event("do request");
1801 [[maybe_unused
]] auto span
= tracing::osd::tracer
.add_span(__func__
, op
->osd_parent_span
);
1803 // make sure we have a new enough map
1804 auto p
= waiting_for_map
.find(op
->get_source());
1805 if (p
!= waiting_for_map
.end()) {
1806 // preserve ordering
1807 dout(20) << __func__
<< " waiting_for_map "
1808 << p
->first
<< " not empty, queueing" << dendl
;
1809 p
->second
.push_back(op
);
1810 op
->mark_delayed("waiting_for_map not empty");
1813 if (!have_same_or_newer_map(op
->min_epoch
)) {
1814 dout(20) << __func__
<< " min " << op
->min_epoch
1815 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1816 waiting_for_map
[op
->get_source()].push_back(op
);
1817 op
->mark_delayed("op must wait for map");
1818 osd
->request_osdmap_update(op
->min_epoch
);
1822 if (can_discard_request(op
)) {
1827 const Message
*m
= op
->get_req();
1828 int msg_type
= m
->get_type();
1829 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1830 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1833 if (msg_type
== CEPH_MSG_OSD_OP
) {
1834 if (session
->check_backoff(cct
, info
.pgid
,
1835 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1842 (!is_active() && is_peered());
1843 if (g_conf()->osd_backoff_on_peering
&& !backoff
) {
1849 add_pg_backoff(session
);
1853 // pg backoff acks at pg-level
1854 if (msg_type
== CEPH_MSG_OSD_BACKOFF
) {
1855 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1856 if (ba
->begin
!= ba
->end
) {
1864 // Delay unless PGBackend says it's ok
1865 if (pgbackend
->can_handle_while_inactive(op
)) {
1866 bool handled
= pgbackend
->handle_message(op
);
1867 ceph_assert(handled
);
1870 waiting_for_peered
.push_back(op
);
1871 op
->mark_delayed("waiting for peered");
1876 if (recovery_state
.needs_flush()) {
1877 dout(20) << "waiting for flush on " << op
<< dendl
;
1878 waiting_for_flush
.push_back(op
);
1879 op
->mark_delayed("waiting for flush");
1883 ceph_assert(is_peered() && !recovery_state
.needs_flush());
1884 if (pgbackend
->handle_message(op
))
1888 case CEPH_MSG_OSD_OP
:
1889 case CEPH_MSG_OSD_BACKOFF
:
1891 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1892 waiting_for_active
.push_back(op
);
1893 op
->mark_delayed("waiting for active");
1897 case CEPH_MSG_OSD_OP
:
1898 // verify client features
1899 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1900 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1901 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1906 case CEPH_MSG_OSD_BACKOFF
:
1907 // object-level backoff acks handled in osdop context
1913 case MSG_OSD_PG_SCAN
:
1914 do_scan(op
, handle
);
1917 case MSG_OSD_PG_BACKFILL
:
1921 case MSG_OSD_PG_BACKFILL_REMOVE
:
1922 do_backfill_remove(op
);
1925 case MSG_OSD_SCRUB_RESERVE
:
1928 osd
->reply_op_error(op
, -EAGAIN
);
1931 auto m
= op
->get_req
<MOSDScrubReserve
>();
1933 case MOSDScrubReserve::REQUEST
:
1934 m_scrubber
->handle_scrub_reserve_request(op
);
1936 case MOSDScrubReserve::GRANT
:
1937 m_scrubber
->handle_scrub_reserve_grant(op
, m
->from
);
1939 case MOSDScrubReserve::REJECT
:
1940 m_scrubber
->handle_scrub_reserve_reject(op
, m
->from
);
1942 case MOSDScrubReserve::RELEASE
:
1943 m_scrubber
->handle_scrub_reserve_release(op
);
1949 case MSG_OSD_REP_SCRUB
:
1950 replica_scrub(op
, handle
);
1953 case MSG_OSD_REP_SCRUBMAP
:
1954 do_replica_scrub_map(op
);
1957 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1958 do_update_log_missing(op
);
1961 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1962 do_update_log_missing_reply(op
);
1966 ceph_abort_msg("bad message type in do_request");
1970 /** do_op - do an op
1971 * pg lock will be held (if multithreaded)
1972 * osd_lock NOT held.
1974 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1977 // NOTE: take a non-const pointer here; we must be careful not to
1978 // change anything that will break other reads on m (operator<<).
1979 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1980 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1981 if (m
->finish_decode()) {
1982 op
->reset_desc(); // for TrackedOp
1986 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1988 const hobject_t head
= m
->get_hobj().get_head();
1990 if (!info
.pgid
.pgid
.contains(
1991 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1992 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1993 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1994 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1995 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1997 ceph_assert(!cct
->_conf
->osd_debug_misdirected_ops
);
2002 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
2003 ceph::ref_t
<Session
> session
;
2005 session
= static_cast<Session
*>(m
->get_connection()->get_priv().get());
2006 if (!session
.get()) {
2007 dout(10) << __func__
<< " no session" << dendl
;
2011 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
2016 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
2018 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
2019 osd
->reply_op_error(op
, -EINVAL
);
2024 int r
= op
->maybe_init_op_info(*get_osdmap());
2026 osd
->reply_op_error(op
, r
);
2031 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
2032 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
2034 !(op
->may_write() || op
->may_cache())) {
2035 // balanced reads; any replica will do
2036 if (!(is_primary() || is_nonprimary())) {
2037 osd
->handle_misdirected_op(this, op
);
2041 // normal case; must be primary
2042 if (!is_primary()) {
2043 osd
->handle_misdirected_op(this, op
);
2048 if (!check_laggy(op
)) {
2052 if (!op_has_sufficient_caps(op
)) {
2053 osd
->reply_op_error(op
, -EPERM
);
2057 if (op
->includes_pg_op()) {
2058 return do_pg_op(op
);
2061 // object name too long?
2062 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
2063 dout(4) << "do_op name is longer than "
2064 << cct
->_conf
->osd_max_object_name_len
2065 << " bytes" << dendl
;
2066 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2069 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
2070 dout(4) << "do_op locator is longer than "
2071 << cct
->_conf
->osd_max_object_name_len
2072 << " bytes" << dendl
;
2073 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2076 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
2077 dout(4) << "do_op namespace is longer than "
2078 << cct
->_conf
->osd_max_object_namespace_len
2079 << " bytes" << dendl
;
2080 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2083 if (m
->get_hobj().oid
.name
.empty()) {
2084 dout(4) << "do_op empty oid name is not allowed" << dendl
;
2085 osd
->reply_op_error(op
, -EINVAL
);
2089 if (int r
= osd
->store
->validate_hobject_key(head
)) {
2090 dout(4) << "do_op object " << head
<< " invalid for backing store: "
2092 osd
->reply_op_error(op
, r
);
2097 if (get_osdmap()->is_blocklisted(m
->get_source_addr())) {
2098 dout(10) << "do_op " << m
->get_source_addr() << " is blocklisted" << dendl
;
2099 osd
->reply_op_error(op
, -EBLOCKLISTED
);
2103 // order this op as a write?
2104 bool write_ordered
= op
->rwordered();
2106 // discard due to cluster full transition? (we discard any op that
2107 // originates before the cluster or pool is marked full; the client
2108 // will resend after the full flag is removed or if they expect the
2109 // op to succeed despite being full). The except is FULL_FORCE and
2110 // FULL_TRY ops, which there is no reason to discard because they
2111 // bypass all full checks anyway. If this op isn't write or
2112 // read-ordered, we skip.
2113 // FIXME: we exclude mds writes for now.
2114 if (write_ordered
&& !(m
->get_source().is_mds() ||
2115 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
2116 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
2117 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
2118 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
2122 // mds should have stopped writing before this point.
2123 // We can't allow OSD to become non-startable even if mds
2124 // could be writing as part of file removals.
2125 if (write_ordered
&& osd
->check_failsafe_full(get_dpp()) &&
2126 !m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
2127 dout(10) << __func__
<< " fail-safe full check failed, dropping request." << dendl
;
2130 int64_t poolid
= get_pgid().pool();
2131 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
2135 if (pi
->has_flag(pg_pool_t::FLAG_EIO
)) {
2136 // drop op on the floor; the client will handle returning EIO
2137 if (m
->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO
)) {
2138 dout(10) << __func__
<< " discarding op due to pool EIO flag" << dendl
;
2140 dout(10) << __func__
<< " replying EIO due to pool EIO flag" << dendl
;
2141 osd
->reply_op_error(op
, -EIO
);
2145 if (op
->may_write()) {
2148 if (m
->get_snapid() != CEPH_NOSNAP
) {
2149 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
2150 osd
->reply_op_error(op
, -EINVAL
);
2155 if (cct
->_conf
->osd_max_write_size
&&
2156 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
2157 // journal can't hold commit!
2158 derr
<< "do_op msg data len " << m
->get_data_len()
2159 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
2160 << " on " << *m
<< dendl
;
2161 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
2166 dout(10) << "do_op " << *m
2167 << (op
->may_write() ? " may_write" : "")
2168 << (op
->may_read() ? " may_read" : "")
2169 << (op
->may_cache() ? " may_cache" : "")
2170 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2171 << " flags " << ceph_osd_flag_string(m
->get_flags())
2174 [[maybe_unused
]] auto span
= tracing::osd::tracer
.add_span(__func__
, op
->osd_parent_span
);
2177 if (is_unreadable_object(head
)) {
2178 if (!is_primary()) {
2179 osd
->reply_op_error(op
, -EAGAIN
);
2183 (g_conf()->osd_backoff_on_degraded
||
2184 (g_conf()->osd_backoff_on_unfound
&&
2185 recovery_state
.get_missing_loc().is_unfound(head
)))) {
2186 add_backoff(session
, head
, head
);
2187 maybe_kick_recovery(head
);
2189 wait_for_unreadable_object(head
, op
);
2194 if (write_ordered
) {
2196 if (is_degraded_or_backfilling_object(head
)) {
2197 if (can_backoff
&& g_conf()->osd_backoff_on_degraded
) {
2198 add_backoff(session
, head
, head
);
2199 maybe_kick_recovery(head
);
2201 wait_for_degraded_object(head
, op
);
2206 if (m_scrubber
->is_scrub_active() && m_scrubber
->write_blocked_by_scrub(head
)) {
2207 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2208 waiting_for_scrub
.push_back(op
);
2209 op
->mark_delayed("waiting for scrub");
2212 if (!check_laggy_requeue(op
)) {
2217 if (auto blocked_iter
= objects_blocked_on_degraded_snap
.find(head
);
2218 blocked_iter
!= std::end(objects_blocked_on_degraded_snap
)) {
2219 hobject_t
to_wait_on(head
);
2220 to_wait_on
.snap
= blocked_iter
->second
;
2221 wait_for_degraded_object(to_wait_on
, op
);
2224 if (auto blocked_snap_promote_iter
= objects_blocked_on_snap_promotion
.find(head
);
2225 blocked_snap_promote_iter
!= std::end(objects_blocked_on_snap_promotion
)) {
2226 wait_for_blocked_object(blocked_snap_promote_iter
->second
->obs
.oi
.soid
, op
);
2229 if (objects_blocked_on_cache_full
.count(head
)) {
2230 block_write_on_full_cache(head
, op
);
2236 if (op
->may_write() || op
->may_cache()) {
2237 // warning: we will get back *a* request for this reqid, but not
2238 // necessarily the most recent. this happens with flush and
2239 // promote ops, but we can't possible have both in our log where
2240 // the original request is still not stable on disk, so for our
2241 // purposes here it doesn't matter which one we get.
2243 version_t user_version
;
2244 int return_code
= 0;
2245 vector
<pg_log_op_return_item_t
> op_returns
;
2246 bool got
= check_in_progress_op(
2247 m
->get_reqid(), &version
, &user_version
, &return_code
, &op_returns
);
2249 dout(3) << __func__
<< " dup " << m
->get_reqid()
2250 << " version " << version
<< dendl
;
2251 if (already_complete(version
)) {
2252 osd
->reply_op_error(op
, return_code
, version
, user_version
, op_returns
);
2254 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2255 // always queue ondisk waiters, so that we can requeue if needed
2256 waiting_for_ondisk
[version
].emplace_back(op
, user_version
, return_code
,
2258 op
->mark_delayed("waiting for ondisk");
2264 ObjectContextRef obc
;
2265 bool can_create
= op
->may_write();
2266 hobject_t missing_oid
;
2268 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2269 const hobject_t
& oid
=
2270 m
->get_snapid() == CEPH_SNAPDIR
? head
: m
->get_hobj();
2272 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2273 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2276 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
) {
2277 if (m
->get_snapid() != CEPH_SNAPDIR
) {
2278 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2279 osd
->reply_op_error(op
, -EINVAL
);
2283 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2284 dout(10) << "non-LIST_SNAPS on snapdir" << dendl
;
2285 osd
->reply_op_error(op
, -EINVAL
);
2291 // io blocked on obc?
2292 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2293 maybe_await_blocked_head(oid
, op
)) {
2297 if (!is_primary()) {
2298 if (!recovery_state
.can_serve_replica_read(oid
)) {
2299 dout(20) << __func__
2300 << ": unstable write on replica, bouncing to primary "
2302 osd
->reply_op_error(op
, -EAGAIN
);
2305 dout(20) << __func__
<< ": serving replica read on oid " << oid
2309 int r
= find_object_context(
2310 oid
, &obc
, can_create
,
2311 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2314 // LIST_SNAPS needs the ssc too
2316 m
->get_snapid() == CEPH_SNAPDIR
&&
2318 obc
->ssc
= get_snapset_context(oid
, true);
2322 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2323 // we have to wait for the object.
2325 // missing the specific snap we need; requeue and wait.
2326 ceph_assert(!op
->may_write()); // only happens on a read/cache
2327 wait_for_unreadable_object(missing_oid
, op
);
2330 } else if (r
== 0) {
2331 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2332 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2333 << " is unreadable, waiting" << dendl
;
2334 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2338 // degraded object? (the check above was for head; this could be a clone)
2339 if (write_ordered
&&
2340 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2341 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2342 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2343 << " is degraded, waiting" << dendl
;
2344 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2349 bool in_hit_set
= false;
2352 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2355 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2358 if (!op
->hitset_inserted
) {
2359 hit_set
->insert(oid
);
2360 op
->hitset_inserted
= true;
2361 if (hit_set
->is_full() ||
2362 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2369 if (agent_choose_mode(false, op
))
2373 if (obc
.get() && obc
->obs
.exists
) {
2374 if (recover_adjacent_clones(obc
, op
)) {
2377 if (maybe_handle_manifest(op
,
2383 if (maybe_handle_cache(op
,
2392 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2393 // copy the reqids for copy get on ENOENT
2395 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2396 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2399 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2400 if (op
->may_write() &&
2401 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2402 record_write_error(op
, oid
, nullptr, r
);
2404 osd
->reply_op_error(op
, r
);
2409 // make sure locator is consistent
2410 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2411 if (m
->get_object_locator() != oloc
) {
2412 dout(10) << " provided locator " << m
->get_object_locator()
2413 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2414 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2415 << " on object " << oloc
2419 // io blocked on obc?
2420 if (obc
->is_blocked() &&
2421 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2422 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2426 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2428 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2430 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2431 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2432 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2433 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2435 // verify there is in fact a flush in progress
2436 // FIXME: we could make this a stronger test.
2437 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2438 if (p
== flush_ops
.end()) {
2439 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2440 reply_ctx(ctx
, -EINVAL
);
2443 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2444 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2445 op
->mark_delayed("waiting for rw locks");
2449 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2452 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2453 if (op
->may_write() &&
2454 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2455 record_write_error(op
, oid
, nullptr, r
,
2456 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
2458 osd
->reply_op_error(op
, r
);
2464 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2465 ctx
->ignore_cache
= true;
2468 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2469 // This object is lost. Reading from it returns an error.
2470 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2471 << " is lost" << dendl
;
2472 reply_ctx(ctx
, -ENFILE
);
2475 if (!op
->may_write() &&
2477 (!obc
->obs
.exists
||
2478 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2479 obc
->obs
.oi
.is_whiteout()))) {
2480 // copy the reqids for copy get on ENOENT
2481 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2482 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2486 reply_ctx(ctx
, -ENOENT
);
2493 utime_t prepare_latency
= ceph_clock_now();
2494 prepare_latency
-= op
->get_dequeued_time();
2495 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2496 if (op
->may_read() && op
->may_write()) {
2497 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2498 } else if (op
->may_read()) {
2499 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2500 } else if (op
->may_write() || op
->may_cache()) {
2501 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2504 // force recovery of the oldest missing object if too many logs
2505 maybe_force_recovery();
2508 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2511 ObjectContextRef obc
)
2514 dout(20) << __func__
<< ": no obc " << dendl
;
2515 return cache_result_t::NOOP
;
2518 if (!obc
->obs
.oi
.has_manifest()) {
2519 dout(20) << __func__
<< ": " << obc
->obs
.oi
.soid
2520 << " is not manifest object " << dendl
;
2521 return cache_result_t::NOOP
;
2523 if (op
->get_req
<MOSDOp
>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2524 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2525 return cache_result_t::NOOP
;
2528 // if it is write-ordered and blocked, stop now
2529 if (obc
->is_blocked() && write_ordered
) {
2530 // we're already doing something with this object
2531 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2532 return cache_result_t::NOOP
;
2535 vector
<OSDOp
> ops
= op
->get_req
<MOSDOp
>()->ops
;
2536 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2538 ceph_osd_op
& op
= osd_op
.op
;
2539 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
||
2540 op
.op
== CEPH_OSD_OP_SET_CHUNK
||
2541 op
.op
== CEPH_OSD_OP_UNSET_MANIFEST
||
2542 op
.op
== CEPH_OSD_OP_TIER_PROMOTE
||
2543 op
.op
== CEPH_OSD_OP_TIER_FLUSH
||
2544 op
.op
== CEPH_OSD_OP_TIER_EVICT
||
2545 op
.op
== CEPH_OSD_OP_ISDIRTY
) {
2546 return cache_result_t::NOOP
;
2550 switch (obc
->obs
.oi
.manifest
.type
) {
2551 case object_manifest_t::TYPE_REDIRECT
:
2552 if (op
->may_write() || write_ordered
) {
2553 do_proxy_write(op
, obc
);
2556 if (obc
->obs
.oi
.size
!= 0) {
2557 return cache_result_t::NOOP
;
2559 do_proxy_read(op
, obc
);
2561 return cache_result_t::HANDLED_PROXY
;
2562 case object_manifest_t::TYPE_CHUNKED
:
2564 if (can_proxy_chunked_read(op
, obc
)) {
2565 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2566 if (p
!= flush_ops
.end()) {
2567 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, true);
2568 return cache_result_t::HANDLED_PROXY
;
2570 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, write_ordered
);
2571 return cache_result_t::HANDLED_PROXY
;
2574 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2575 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
2576 hobject_t head
= m
->get_hobj();
2578 if (is_degraded_or_backfilling_object(head
)) {
2579 dout(20) << __func__
<< ": " << head
<< " is degraded, waiting" << dendl
;
2580 wait_for_degraded_object(head
, op
);
2581 return cache_result_t::BLOCKED_RECOVERY
;
2584 if (m_scrubber
->write_blocked_by_scrub(head
)) {
2585 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2586 waiting_for_scrub
.push_back(op
);
2587 op
->mark_delayed("waiting for scrub");
2588 return cache_result_t::BLOCKED_RECOVERY
;
2590 if (!check_laggy_requeue(op
)) {
2591 return cache_result_t::BLOCKED_RECOVERY
;
2594 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2595 if (p
.second
.is_missing()) {
2596 auto m
= op
->get_req
<MOSDOp
>();
2597 const object_locator_t oloc
= m
->get_object_locator();
2598 promote_object(obc
, obc
->obs
.oi
.soid
, oloc
, op
, NULL
);
2599 return cache_result_t::BLOCKED_PROMOTE
;
2602 return cache_result_t::NOOP
;
2605 ceph_abort_msg("unrecognized manifest type");
2608 return cache_result_t::NOOP
;
2611 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2612 MOSDOpReply
*orig_reply
, int r
,
2613 OpContext
*ctx_for_op_returns
)
2615 dout(20) << __func__
<< " r=" << r
<< dendl
;
2616 ceph_assert(op
->may_write());
2617 const osd_reqid_t
&reqid
= op
->get_req
<MOSDOp
>()->get_reqid();
2618 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2619 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2620 get_next_version(), eversion_t(), 0,
2621 reqid
, utime_t(), r
));
2622 if (ctx_for_op_returns
) {
2623 entries
.back().set_op_returns(*ctx_for_op_returns
->ops
);
2624 dout(20) << __func__
<< " op_returns=" << entries
.back().op_returns
<< dendl
;
2630 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2635 MOSDOpReply
*orig_reply
,
2638 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2641 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2642 auto m
= op
->get_req
<MOSDOp
>();
2643 MOSDOpReply
*reply
= orig_reply
.detach();
2644 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2645 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2649 ObcLockManager lock_manager
;
2652 std::move(lock_manager
),
2653 std::optional
<std::function
<void(void)> >(
2654 OnComplete(this, op
, orig_reply
, r
)),
2659 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2662 ObjectContextRef obc
,
2663 int r
, hobject_t missing_oid
,
2666 ObjectContextRef
*promote_obc
)
2668 // return quickly if caching is not enabled
2669 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2670 return cache_result_t::NOOP
;
2674 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2675 (op
->get_req
<MOSDOp
>()->get_flags() &
2676 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2677 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2678 return cache_result_t::NOOP
;
2681 must_promote
= must_promote
|| op
->need_promote();
2684 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2685 << (obc
->obs
.exists
? "exists" : "DNE")
2686 << " missing_oid " << missing_oid
2687 << " must_promote " << (int)must_promote
2688 << " in_hit_set " << (int)in_hit_set
2691 dout(25) << __func__
<< " (no obc)"
2692 << " missing_oid " << missing_oid
2693 << " must_promote " << (int)must_promote
2694 << " in_hit_set " << (int)in_hit_set
2697 // if it is write-ordered and blocked, stop now
2698 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2699 // we're already doing something with this object
2700 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2701 return cache_result_t::NOOP
;
2704 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2705 // we know this object is logically absent (e.g., an undefined clone)
2706 return cache_result_t::NOOP
;
2709 if (obc
.get() && obc
->obs
.exists
) {
2710 osd
->logger
->inc(l_osd_op_cache_hit
);
2711 return cache_result_t::NOOP
;
2713 if (!is_primary()) {
2714 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2715 osd
->reply_op_error(op
, -EAGAIN
);
2716 return cache_result_t::REPLIED_WITH_EAGAIN
;
2719 if (missing_oid
== hobject_t() && obc
.get()) {
2720 missing_oid
= obc
->obs
.oi
.soid
;
2723 auto m
= op
->get_req
<MOSDOp
>();
2724 const object_locator_t oloc
= m
->get_object_locator();
2726 if (op
->need_skip_handle_cache()) {
2727 return cache_result_t::NOOP
;
2730 OpRequestRef promote_op
;
2732 switch (pool
.info
.cache_mode
) {
2733 case pg_pool_t::CACHEMODE_WRITEBACK
:
2735 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2736 if (!op
->may_write() && !op
->may_cache() &&
2737 !write_ordered
&& !must_promote
) {
2738 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2740 return cache_result_t::HANDLED_PROXY
;
2742 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2743 block_write_on_full_cache(missing_oid
, op
);
2744 return cache_result_t::BLOCKED_FULL
;
2747 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2748 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2749 return cache_result_t::BLOCKED_PROMOTE
;
2752 if (op
->may_write() || op
->may_cache()) {
2756 if (!op
->need_skip_promote() &&
2757 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2758 pool
.info
.min_write_recency_for_promote
,
2761 return cache_result_t::BLOCKED_PROMOTE
;
2763 return cache_result_t::HANDLED_PROXY
;
2767 // Avoid duplicate promotion
2768 if (obc
.get() && obc
->is_blocked()) {
2771 return cache_result_t::BLOCKED_PROMOTE
;
2775 if (!op
->need_skip_promote()) {
2776 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2777 pool
.info
.min_read_recency_for_promote
,
2778 promote_op
, promote_obc
);
2781 return cache_result_t::HANDLED_PROXY
;
2783 ceph_abort_msg("unreachable");
2784 return cache_result_t::NOOP
;
2786 case pg_pool_t::CACHEMODE_READONLY
:
2787 // TODO: clean this case up
2788 if (!obc
.get() && r
== -ENOENT
) {
2789 // we don't have the object and op's a read
2790 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2791 return cache_result_t::BLOCKED_PROMOTE
;
2793 if (!r
) { // it must be a write
2794 do_cache_redirect(op
);
2795 return cache_result_t::HANDLED_REDIRECT
;
2797 // crap, there was a failure of some kind
2798 return cache_result_t::NOOP
;
2800 case pg_pool_t::CACHEMODE_FORWARD
:
2801 // this mode is deprecated; proxy instead
2802 case pg_pool_t::CACHEMODE_PROXY
:
2803 if (!must_promote
) {
2804 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2806 return cache_result_t::HANDLED_PROXY
;
2809 return cache_result_t::HANDLED_PROXY
;
2812 // ugh, we're forced to promote.
2814 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2815 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2816 block_write_on_full_cache(missing_oid
, op
);
2817 return cache_result_t::BLOCKED_FULL
;
2819 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2820 return cache_result_t::BLOCKED_PROMOTE
;
2822 case pg_pool_t::CACHEMODE_READFORWARD
:
2823 // this mode is deprecated; proxy instead
2824 case pg_pool_t::CACHEMODE_READPROXY
:
2825 // Do writeback to the cache tier for writes
2826 if (op
->may_write() || write_ordered
|| must_promote
) {
2828 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2829 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2830 block_write_on_full_cache(missing_oid
, op
);
2831 return cache_result_t::BLOCKED_FULL
;
2833 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2834 return cache_result_t::BLOCKED_PROMOTE
;
2837 // If it is a read, we can read, we need to proxy it
2839 return cache_result_t::HANDLED_PROXY
;
2842 ceph_abort_msg("unrecognized cache_mode");
2844 return cache_result_t::NOOP
;
2847 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2848 const hobject_t
& missing_oid
,
2849 const object_locator_t
& oloc
,
2852 OpRequestRef promote_op
,
2853 ObjectContextRef
*promote_obc
)
2855 dout(20) << __func__
<< " missing_oid " << missing_oid
2856 << " in_hit_set " << in_hit_set
<< dendl
;
2862 // Check if in the current hit set
2872 unsigned count
= (int)in_hit_set
;
2874 // Check if in other hit sets
2875 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
2876 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2877 agent_state
->hit_set_map
.rbegin();
2878 itor
!= agent_state
->hit_set_map
.rend();
2880 if (!itor
->second
->contains(oid
)) {
2884 if (count
>= recency
) {
2889 if (count
>= recency
) {
2892 return false; // not promoting
2897 if (osd
->promote_throttle()) {
2898 dout(10) << __func__
<< " promote throttled" << dendl
;
2901 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
2905 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2907 auto m
= op
->get_req
<MOSDOp
>();
2908 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
2909 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
, get_osdmap_epoch(),
2911 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2912 reply
->set_redirect(redir
);
2913 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2915 m
->get_connection()->send_message(reply
);
2919 struct C_ProxyRead
: public Context
{
2922 epoch_t last_peering_reset
;
2924 PrimaryLogPG::ProxyReadOpRef prdop
;
2926 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2927 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2928 : pg(p
), oid(o
), last_peering_reset(lpr
),
2929 tid(0), prdop(prd
), start(ceph_clock_now())
2931 void finish(int r
) override
{
2932 if (prdop
->canceled
)
2934 std::scoped_lock locker
{*pg
};
2935 if (prdop
->canceled
) {
2938 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2939 pg
->finish_proxy_read(oid
, tid
, r
);
2940 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2945 struct C_ProxyChunkRead
: public Context
{
2948 epoch_t last_peering_reset
;
2950 PrimaryLogPG::ProxyReadOpRef prdop
;
2952 ObjectOperation
*obj_op
;
2954 uint64_t req_offset
= 0;
2955 ObjectContextRef obc
;
2956 uint64_t req_total_len
= 0;
2957 C_ProxyChunkRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2958 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2959 : pg(p
), oid(o
), last_peering_reset(lpr
),
2960 tid(0), prdop(prd
), start(ceph_clock_now()), obj_op(NULL
)
2962 void finish(int r
) override
{
2963 if (prdop
->canceled
)
2965 std::scoped_lock locker
{*pg
};
2966 if (prdop
->canceled
) {
2969 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2971 if (!prdop
->ops
[op_index
].outdata
.length()) {
2972 ceph_assert(req_total_len
);
2974 bufferptr
bptr(req_total_len
);
2975 list
.push_back(std::move(bptr
));
2976 prdop
->ops
[op_index
].outdata
.append(list
);
2978 ceph_assert(obj_op
);
2979 uint64_t copy_offset
;
2980 if (req_offset
>= prdop
->ops
[op_index
].op
.extent
.offset
) {
2981 copy_offset
= req_offset
- prdop
->ops
[op_index
].op
.extent
.offset
;
2985 prdop
->ops
[op_index
].outdata
.begin(copy_offset
).copy_in(
2986 obj_op
->ops
[0].outdata
.length(),
2987 obj_op
->ops
[0].outdata
.c_str());
2990 pg
->finish_proxy_read(oid
, tid
, r
);
2991 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2999 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
3001 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3002 // stash the result in the request's OSDOp vector
3003 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3004 object_locator_t oloc
;
3006 /* extensible tier */
3007 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3008 switch (obc
->obs
.oi
.manifest
.type
) {
3009 case object_manifest_t::TYPE_REDIRECT
:
3010 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3011 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3014 ceph_abort_msg("unrecognized manifest type");
3018 soid
= m
->get_hobj();
3019 oloc
= object_locator_t(m
->get_object_locator());
3020 oloc
.pool
= pool
.info
.tier_of
;
3022 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3024 // pass through some original flags that make sense.
3025 // - leave out redirection and balancing flags since we are
3026 // already proxying through the primary
3027 // - leave off read/write/exec flags that are derived from the op
3028 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3029 CEPH_OSD_FLAG_ORDERSNAP
|
3030 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3031 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3033 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
3035 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
3037 ObjectOperation obj_op
;
3038 obj_op
.dup(prdop
->ops
);
3040 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
3041 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
3042 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
3043 ceph_osd_op op
= obj_op
.ops
[i
].op
;
3045 case CEPH_OSD_OP_READ
:
3046 case CEPH_OSD_OP_SYNC_READ
:
3047 case CEPH_OSD_OP_SPARSE_READ
:
3048 case CEPH_OSD_OP_CHECKSUM
:
3049 case CEPH_OSD_OP_CMPEXT
:
3050 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
3051 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
3056 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
3058 ceph_tid_t tid
= osd
->objecter
->read(
3059 soid
.oid
, oloc
, obj_op
,
3060 m
->get_snapid(), NULL
,
3061 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3062 &prdop
->user_version
,
3063 &prdop
->data_offset
,
3066 prdop
->objecter_tid
= tid
;
3067 proxyread_ops
[tid
] = prdop
;
3068 in_progress_proxy_ops
[soid
].push_back(op
);
3071 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
3073 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3074 << " " << cpp_strerror(r
) << dendl
;
3076 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
3077 if (p
== proxyread_ops
.end()) {
3078 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
3081 ProxyReadOpRef prdop
= p
->second
;
3082 if (tid
!= prdop
->objecter_tid
) {
3083 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
3084 << " tid " << prdop
->objecter_tid
<< dendl
;
3087 if (oid
!= prdop
->soid
) {
3088 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
3089 << " soid " << prdop
->soid
<< dendl
;
3092 proxyread_ops
.erase(tid
);
3094 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
3095 if (q
== in_progress_proxy_ops
.end()) {
3096 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3099 ceph_assert(q
->second
.size());
3100 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
3103 ceph_assert(it
!= q
->second
.end());
3104 OpRequestRef op
= *it
;
3105 q
->second
.erase(it
);
3106 if (q
->second
.size() == 0) {
3107 in_progress_proxy_ops
.erase(oid
);
3108 } else if (std::find(q
->second
.begin(),
3110 prdop
->op
) != q
->second
.end()) {
3111 /* multiple read case */
3112 dout(20) << __func__
<< " " << oid
<< " is not completed " << dendl
;
3116 osd
->logger
->inc(l_osd_tier_proxy_read
);
3118 auto m
= op
->get_req
<MOSDOp
>();
3119 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
3120 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3121 ctx
->user_at_version
= prdop
->user_version
;
3122 ctx
->data_off
= prdop
->data_offset
;
3123 ctx
->ignore_log_op_stats
= true;
3124 complete_read_ctx(r
, ctx
);
3127 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
3129 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
3130 if (p
== in_progress_proxy_ops
.end())
3133 list
<OpRequestRef
>& ls
= p
->second
;
3134 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
3136 in_progress_proxy_ops
.erase(p
);
3139 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
3140 vector
<ceph_tid_t
> *tids
)
3142 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
3143 prdop
->canceled
= true;
3145 // cancel objecter op, if we can
3146 if (prdop
->objecter_tid
) {
3147 tids
->push_back(prdop
->objecter_tid
);
3148 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
3149 prdop
->ops
[i
].outdata
.clear();
3151 proxyread_ops
.erase(prdop
->objecter_tid
);
3152 prdop
->objecter_tid
= 0;
3156 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3158 dout(10) << __func__
<< dendl
;
3160 // cancel proxy reads
3161 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
3162 while (p
!= proxyread_ops
.end()) {
3163 cancel_proxy_read((p
++)->second
, tids
);
3166 // cancel proxy writes
3167 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
3168 while (q
!= proxywrite_ops
.end()) {
3169 cancel_proxy_write((q
++)->second
, tids
);
3173 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
3174 in_progress_proxy_ops
.begin();
3175 while (p
!= in_progress_proxy_ops
.end()) {
3176 list
<OpRequestRef
>& ls
= p
->second
;
3177 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
3178 << " requests" << dendl
;
3180 in_progress_proxy_ops
.erase(p
++);
3183 in_progress_proxy_ops
.clear();
3187 struct C_ProxyWrite_Commit
: public Context
{
3190 epoch_t last_peering_reset
;
3192 PrimaryLogPG::ProxyWriteOpRef pwop
;
3193 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3194 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
3195 : pg(p
), oid(o
), last_peering_reset(lpr
),
3198 void finish(int r
) override
{
3201 std::scoped_lock locker
{*pg
};
3202 if (pwop
->canceled
) {
3205 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3206 pg
->finish_proxy_write(oid
, tid
, r
);
3211 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, ObjectContextRef obc
)
3213 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3214 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3215 object_locator_t oloc
;
3216 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
3218 /* extensible tier */
3219 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3220 switch (obc
->obs
.oi
.manifest
.type
) {
3221 case object_manifest_t::TYPE_REDIRECT
:
3222 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3223 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3226 ceph_abort_msg("unrecognized manifest type");
3230 soid
= m
->get_hobj();
3231 oloc
= object_locator_t(m
->get_object_locator());
3232 oloc
.pool
= pool
.info
.tier_of
;
3235 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3236 if (!(op
->may_write() || op
->may_cache())) {
3237 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3239 if (op
->allows_returnvec()) {
3240 flags
|= CEPH_OSD_FLAG_RETURNVEC
;
3243 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3245 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3246 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3247 pwop
->mtime
= m
->get_mtime();
3249 ObjectOperation obj_op
;
3250 obj_op
.dup(pwop
->ops
);
3252 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3253 this, soid
, get_last_peering_reset(), pwop
);
3254 ceph_tid_t tid
= osd
->objecter
->mutate(
3255 soid
.oid
, oloc
, obj_op
, snapc
,
3256 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3257 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3258 &pwop
->user_version
, pwop
->reqid
);
3260 pwop
->objecter_tid
= tid
;
3261 proxywrite_ops
[tid
] = pwop
;
3262 in_progress_proxy_ops
[soid
].push_back(op
);
3265 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op
, const hobject_t
& missing_oid
,
3266 ObjectContextRef obc
, bool write_ordered
)
3268 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3269 OSDOp
*osd_op
= NULL
;
3270 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3271 osd_op
= &m
->ops
[i
];
3272 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3273 uint64_t op_length
= osd_op
->op
.extent
.offset
+ osd_op
->op
.extent
.length
;
3274 uint64_t chunk_length
= 0, chunk_index
= 0, req_len
= 0;
3275 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3276 map
<uint64_t, map
<uint64_t, uint64_t>> chunk_read
;
3278 while (cursor
< op_length
) {
3281 /* find the right chunk position for cursor */
3282 for (auto &p
: manifest
->chunk_map
) {
3283 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3284 chunk_length
= p
.second
.length
;
3285 chunk_index
= p
.first
;
3290 if (!chunk_index
&& !chunk_length
) {
3291 if (cursor
== osd_op
->op
.extent
.offset
) {
3292 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, this);
3293 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3294 ctx
->data_off
= osd_op
->op
.extent
.offset
;
3295 ctx
->ignore_log_op_stats
= true;
3296 complete_read_ctx(0, ctx
);
3300 uint64_t next_length
= chunk_length
;
3301 /* the size to read -> | op length | */
3303 if (cursor
+ next_length
> op_length
) {
3304 next_length
= op_length
- cursor
;
3306 /* the size to read -> | op length | */
3308 if (cursor
+ next_length
> chunk_index
+ chunk_length
) {
3309 next_length
= chunk_index
+ chunk_length
- cursor
;
3312 chunk_read
[cursor
] = {{chunk_index
, next_length
}};
3313 cursor
+= next_length
;
3316 req_len
= cursor
- osd_op
->op
.extent
.offset
;
3317 for (auto &p
: chunk_read
) {
3318 auto chunks
= p
.second
.begin();
3319 dout(20) << __func__
<< " chunk_index: " << chunks
->first
3320 << " next_length: " << chunks
->second
<< " cursor: "
3321 << p
.first
<< dendl
;
3322 do_proxy_chunked_read(op
, obc
, i
, chunks
->first
, p
.first
, chunks
->second
, req_len
, write_ordered
);
3327 struct RefCountCallback
: public Context
{
3329 PrimaryLogPG::OpContext
*ctx
;
3331 bool requeue
= false;
3333 RefCountCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
3334 : ctx(ctx
), osd_op(osd_op
) {}
3335 void finish(int r
) override
{
3336 // NB: caller must already have pg->lock held
3337 ctx
->obc
->stop_block();
3338 ctx
->pg
->kick_object_context_blocked(ctx
->obc
);
3341 ctx
->pg
->execute_ctx(ctx
);
3343 // on cancel simply toss op out,
3344 // or requeue as requested
3345 if (r
!= -ECANCELED
) {
3347 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
3348 } else if (requeue
) {
3350 ctx
->pg
->requeue_op(ctx
->op
);
3352 ctx
->pg
->close_op_ctx(ctx
);
3355 void set_requeue(bool rq
) {
3360 struct SetManifestFinisher
: public PrimaryLogPG::OpFinisher
{
3363 explicit SetManifestFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
3366 int execute() override
{
3371 struct C_SetManifestRefCountDone
: public Context
{
3376 C_SetManifestRefCountDone(PrimaryLogPG
*p
,
3377 hobject_t soid
, uint64_t offset
) :
3378 pg(p
), soid(soid
), offset(offset
) {}
3379 void finish(int r
) override
{
3380 if (r
== -ECANCELED
)
3382 std::scoped_lock locker
{*pg
};
3383 pg
->finish_set_manifest_refcount(soid
, r
, tid
, offset
);
3387 struct C_SetDedupChunks
: public Context
{
3390 epoch_t last_peering_reset
;
3394 C_SetDedupChunks(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
, uint64_t offset
)
3395 : pg(p
), oid(o
), last_peering_reset(lpr
),
3396 tid(0), offset(offset
)
3398 void finish(int r
) override
{
3399 if (r
== -ECANCELED
)
3401 std::scoped_lock locker
{*pg
};
3402 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
3405 pg
->finish_set_dedup(oid
, r
, tid
, offset
);
3409 void PrimaryLogPG::cancel_manifest_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3411 dout(10) << __func__
<< dendl
;
3412 auto p
= manifest_ops
.begin();
3413 while (p
!= manifest_ops
.end()) {
3414 auto mop
= p
->second
;
3415 // cancel objecter op, if we can
3416 if (mop
->objecter_tid
) {
3417 tids
->push_back(mop
->objecter_tid
);
3418 mop
->objecter_tid
= 0;
3419 } else if (!mop
->tids
.empty()) {
3420 for (auto &p
: mop
->tids
) {
3421 tids
->push_back(p
.second
);
3425 mop
->cb
->set_requeue(requeue
);
3426 mop
->cb
->complete(-ECANCELED
);
3428 manifest_ops
.erase(p
++);
3432 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc
, std::string
& fp_oid
, OpRequestRef op
)
3436 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3437 if (p
.second
.oid
.oid
.name
== fp_oid
) {
3442 SnapSet
& ss
= obc
->ssc
->snapset
;
3443 const OSDMapRef
& osdmap
= get_osdmap();
3444 for (vector
<snapid_t
>::const_reverse_iterator p
= ss
.clones
.rbegin();
3445 p
!= ss
.clones
.rend();
3447 object_ref_delta_t refs
;
3448 ObjectContextRef obc_l
= nullptr;
3449 ObjectContextRef obc_g
= nullptr;
3450 hobject_t clone_oid
= obc
->obs
.oi
.soid
;
3451 clone_oid
.snap
= *p
;
3452 if (osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
3455 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
3459 if (recover_adjacent_clones(clone_obc
, op
)) {
3462 get_adjacent_clones(clone_obc
, obc_l
, obc_g
);
3463 clone_obc
->obs
.oi
.manifest
.calc_refs_to_inc_on_set(
3464 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr ,
3467 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3468 if (p
->first
.oid
.name
== fp_oid
&& p
->second
> 0) {
3477 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc
, OpRequestRef op
)
3479 if (!obc
->ssc
|| !obc
->ssc
->snapset
.clones
.size()) {
3482 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3483 bool has_manifest_op
= std::any_of(
3486 [](const auto& osd_op
) {
3487 return osd_op
.op
.op
== CEPH_OSD_OP_SET_CHUNK
;
3489 if (!obc
->obs
.oi
.manifest
.is_chunked() && !has_manifest_op
) {
3494 const SnapSet
& snapset
= obc
->ssc
->snapset
;
3495 auto s
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), obc
->obs
.oi
.soid
.snap
);
3496 auto is_unreadable_snap
= [this, obc
, &snapset
, op
](auto iter
) -> bool {
3497 hobject_t cid
= obc
->obs
.oi
.soid
;
3498 cid
.snap
= (iter
== snapset
.clones
.end()) ? snapid_t(CEPH_NOSNAP
) : *iter
;
3499 if (is_unreadable_object(cid
)) {
3500 dout(10) << __func__
<< ": clone " << cid
3501 << " is unreadable, waiting" << dendl
;
3502 wait_for_unreadable_object(cid
, op
);
3507 if (s
!= snapset
.clones
.begin()) {
3508 if (is_unreadable_snap(s
- 1)) {
3512 if (s
!= snapset
.clones
.end()) {
3513 if (is_unreadable_snap(s
+ 1)) {
3520 ObjectContextRef
PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc
)
3522 auto s
= std::find(obc
->ssc
->snapset
.clones
.begin(), obc
->ssc
->snapset
.clones
.end(),
3523 obc
->obs
.oi
.soid
.snap
);
3524 if (s
!= obc
->ssc
->snapset
.clones
.begin()) {
3525 auto s_iter
= s
- 1;
3526 hobject_t cid
= obc
->obs
.oi
.soid
;
3527 object_ref_delta_t refs
;
3529 ObjectContextRef cobc
= get_object_context(cid
, false, NULL
);
3536 void PrimaryLogPG::dec_refcount(const hobject_t
& soid
, const object_ref_delta_t
& refs
)
3538 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3539 int dec_ref_count
= p
->second
;
3540 ceph_assert(dec_ref_count
< 0);
3541 while (dec_ref_count
< 0) {
3542 dout(10) << __func__
<< ": decrement reference on offset oid: " << p
->first
<< dendl
;
3543 refcount_manifest(soid
, p
->first
,
3544 refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
3551 void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc
,
3552 ObjectContextRef
& _l
, ObjectContextRef
& _g
)
3554 const SnapSet
& snapset
= src_obc
->ssc
->snapset
;
3555 const object_info_t
& oi
= src_obc
->obs
.oi
;
3557 auto get_context
= [this, &oi
, &snapset
](auto iter
)
3558 -> ObjectContextRef
{
3559 hobject_t cid
= oi
.soid
;
3560 cid
.snap
= (iter
== snapset
.clones
.end()) ? snapid_t(CEPH_NOSNAP
) : *iter
;
3561 ObjectContextRef obc
= get_object_context(cid
, false, NULL
);
3566 // check adjacent clones
3567 auto s
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), oi
.soid
.snap
);
3569 // We *must* find the clone iff it's not head,
3570 // let s == snapset.clones.end() mean head
3571 ceph_assert((s
== snapset
.clones
.end()) == oi
.soid
.is_head());
3573 if (s
!= snapset
.clones
.begin()) {
3574 _l
= get_context(s
- 1);
3577 if (s
!= snapset
.clones
.end()) {
3578 _g
= get_context(s
+ 1);
3582 bool PrimaryLogPG::inc_refcount_by_set(OpContext
* ctx
, object_manifest_t
& set_chunk
,
3585 object_ref_delta_t refs
;
3586 ObjectContextRef obc_l
, obc_g
;
3587 get_adjacent_clones(ctx
->obc
, obc_l
, obc_g
);
3588 set_chunk
.calc_refs_to_inc_on_set(
3589 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
3590 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
3592 bool need_inc_ref
= false;
3593 if (!refs
.is_empty()) {
3594 ManifestOpRef
mop(std::make_shared
<ManifestOp
>());
3595 for (auto c
: set_chunk
.chunk_map
) {
3596 auto p
= refs
.find(c
.second
.oid
);
3597 if (p
== refs
.end()) {
3601 int inc_ref_count
= p
->second
;
3602 if (inc_ref_count
> 0) {
3604 * In set-chunk case, the first thing we should do is to increment
3605 * the reference the target object has prior to updating object_manifest in object_info_t.
3606 * So, call directly refcount_manifest.
3608 auto target_oid
= p
->first
;
3609 auto offset
= c
.first
;
3610 auto length
= c
.second
.length
;
3611 auto* fin
= new C_SetManifestRefCountDone(this, ctx
->obs
->oi
.soid
, offset
);
3612 ceph_tid_t tid
= refcount_manifest(ctx
->obs
->oi
.soid
, target_oid
,
3613 refcount_t::INCREMENT_REF
, fin
, std::nullopt
);
3615 mop
->chunks
[target_oid
] = make_pair(offset
, length
);
3617 mop
->tids
[offset
] = tid
;
3619 if (!ctx
->obc
->is_blocked()) {
3620 ctx
->obc
->start_block();
3622 need_inc_ref
= true;
3623 } else if (inc_ref_count
< 0) {
3624 hobject_t src
= ctx
->obs
->oi
.soid
;
3625 hobject_t tgt
= p
->first
;
3626 ctx
->register_on_commit(
3628 refcount_manifest(src
, tgt
, refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
3632 if (mop
->tids
.size()) {
3633 mop
->cb
= new RefCountCallback(ctx
, osd_op
);
3634 manifest_ops
[ctx
->obs
->oi
.soid
] = mop
;
3635 manifest_ops
[ctx
->obs
->oi
.soid
]->op
= ctx
->op
;
3639 return need_inc_ref
;
3642 void PrimaryLogPG::update_chunk_map_by_dirty(OpContext
* ctx
) {
3644 * We should consider two cases here:
3645 * 1) just modification: This created dirty regions, but didn't update chunk_map.
3646 * 2) rollback: In rollback, head will be converted to the clone the rollback targets.
3647 * Also, rollback already updated chunk_map.
3648 * So, we should do here is to check whether chunk_map is updated and the clean_region has dirty regions.
3649 * In case of the rollback, chunk_map doesn't need to be clear
3651 for (auto &p
: ctx
->obs
->oi
.manifest
.chunk_map
) {
3652 if (!ctx
->clean_regions
.is_clean_region(p
.first
, p
.second
.length
)) {
3653 ctx
->new_obs
.oi
.manifest
.chunk_map
.erase(p
.first
);
3654 if (ctx
->new_obs
.oi
.manifest
.chunk_map
.empty()) {
3655 ctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
3656 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
3657 ctx
->delta_stats
.num_objects_manifest
--;
3663 void PrimaryLogPG::dec_refcount_by_dirty(OpContext
* ctx
)
3665 object_ref_delta_t refs
;
3666 ObjectContextRef cobc
= nullptr;
3667 ObjectContextRef obc
= ctx
->obc
;
3668 // Look over previous snapshot, then figure out whether updated chunk needs to be deleted
3669 cobc
= get_prev_clone_obc(obc
);
3670 obc
->obs
.oi
.manifest
.calc_refs_to_drop_on_modify(
3671 cobc
? &cobc
->obs
.oi
.manifest
: nullptr,
3674 if (!refs
.is_empty()) {
3675 hobject_t soid
= obc
->obs
.oi
.soid
;
3676 ctx
->register_on_commit(
3677 [soid
, this, refs
](){
3678 dec_refcount(soid
, refs
);
3683 void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t
& oi
, OpContext
* ctx
)
3685 ceph_assert(oi
.has_manifest());
3686 ceph_assert(ctx
->obc
->ssc
);
3688 if (oi
.manifest
.is_chunked()) {
3689 object_ref_delta_t refs
;
3690 ObjectContextRef obc_l
, obc_g
, obc
;
3691 /* in trim_object, oi and ctx can have different oid */
3692 obc
= get_object_context(oi
.soid
, false, NULL
);
3694 get_adjacent_clones(obc
, obc_l
, obc_g
);
3695 oi
.manifest
.calc_refs_to_drop_on_removal(
3696 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
3697 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
3700 if (!refs
.is_empty()) {
3701 /* dec_refcount will use head object anyway */
3702 hobject_t soid
= ctx
->obc
->obs
.oi
.soid
;
3703 ctx
->register_on_commit(
3704 [soid
, this, refs
](){
3705 dec_refcount(soid
, refs
);
3708 } else if (oi
.manifest
.is_redirect() &&
3709 oi
.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
3710 ctx
->register_on_commit(
3712 refcount_manifest(oi
.soid
, oi
.manifest
.redirect_target
,
3713 refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
3718 ceph_tid_t
PrimaryLogPG::refcount_manifest(hobject_t src_soid
, hobject_t tgt_soid
, refcount_t type
,
3719 Context
*cb
, std::optional
<bufferlist
> chunk
)
3721 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
|
3722 CEPH_OSD_FLAG_RWORDERED
;
3724 dout(10) << __func__
<< " Start refcount from " << src_soid
3725 << " to " << tgt_soid
<< dendl
;
3727 ObjectOperation obj_op
;
3729 if (type
== refcount_t::INCREMENT_REF
) {
3730 cls_cas_chunk_get_ref_op call
;
3731 call
.source
= src_soid
.get_head();
3733 obj_op
.call("cas", "chunk_get_ref", in
);
3734 } else if (type
== refcount_t::DECREMENT_REF
) {
3735 cls_cas_chunk_put_ref_op call
;
3736 call
.source
= src_soid
.get_head();
3738 obj_op
.call("cas", "chunk_put_ref", in
);
3739 } else if (type
== refcount_t::CREATE_OR_GET_REF
) {
3740 cls_cas_chunk_create_or_get_ref_op get_call
;
3741 get_call
.source
= src_soid
.get_head();
3743 get_call
.data
= std::move(*chunk
);
3744 ::encode(get_call
, in
);
3745 obj_op
.call("cas", "chunk_create_or_get_ref", in
);
3747 ceph_assert(0 == "unrecognized type");
3750 Context
*c
= nullptr;
3752 c
= new C_OnFinisher(cb
, osd
->get_objecter_finisher(get_pg_shard()));
3755 object_locator_t
oloc(tgt_soid
);
3756 ObjectContextRef src_obc
= get_object_context(src_soid
, false, NULL
);
3757 ceph_assert(src_obc
);
3758 auto tid
= osd
->objecter
->mutate(
3759 tgt_soid
.oid
, oloc
, obj_op
, SnapContext(),
3760 ceph::real_clock::from_ceph_timespec(src_obc
->obs
.oi
.mtime
),
3765 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
, int op_index
,
3766 uint64_t chunk_index
, uint64_t req_offset
, uint64_t req_length
,
3767 uint64_t req_total_len
, bool write_ordered
)
3769 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3770 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3771 if (!manifest
->chunk_map
.count(chunk_index
)) {
3774 uint64_t chunk_length
= manifest
->chunk_map
[chunk_index
].length
;
3775 hobject_t soid
= manifest
->chunk_map
[chunk_index
].oid
;
3776 hobject_t ori_soid
= m
->get_hobj();
3777 object_locator_t
oloc(soid
);
3778 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3779 if (write_ordered
) {
3780 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3783 if (!chunk_length
|| soid
== hobject_t()) {
3787 /* same as do_proxy_read() */
3788 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3789 CEPH_OSD_FLAG_ORDERSNAP
|
3790 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3791 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3793 dout(10) << __func__
<< " Start do chunk proxy read for " << *m
3794 << " index: " << op_index
<< " oid: " << soid
.oid
.name
<< " req_offset: " << req_offset
3795 << " req_length: " << req_length
<< dendl
;
3797 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, ori_soid
, m
->ops
));
3799 ObjectOperation
*pobj_op
= new ObjectOperation
;
3800 OSDOp
&osd_op
= pobj_op
->add_op(m
->ops
[op_index
].op
.op
);
3802 if (chunk_index
<= req_offset
) {
3803 osd_op
.op
.extent
.offset
= manifest
->chunk_map
[chunk_index
].offset
+ req_offset
- chunk_index
;
3805 ceph_abort_msg("chunk_index > req_offset");
3807 osd_op
.op
.extent
.length
= req_length
;
3809 ObjectOperation obj_op
;
3810 obj_op
.dup(pobj_op
->ops
);
3812 C_ProxyChunkRead
*fin
= new C_ProxyChunkRead(this, ori_soid
, get_last_peering_reset(),
3814 fin
->obj_op
= pobj_op
;
3815 fin
->op_index
= op_index
;
3816 fin
->req_offset
= req_offset
;
3818 fin
->req_total_len
= req_total_len
;
3820 ceph_tid_t tid
= osd
->objecter
->read(
3821 soid
.oid
, oloc
, obj_op
,
3822 m
->get_snapid(), NULL
,
3823 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3824 &prdop
->user_version
,
3825 &prdop
->data_offset
,
3828 prdop
->objecter_tid
= tid
;
3829 proxyread_ops
[tid
] = prdop
;
3830 in_progress_proxy_ops
[ori_soid
].push_back(op
);
3833 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
)
3835 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3836 OSDOp
*osd_op
= NULL
;
3838 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3839 osd_op
= &m
->ops
[i
];
3840 ceph_osd_op op
= osd_op
->op
;
3842 case CEPH_OSD_OP_READ
:
3843 case CEPH_OSD_OP_SYNC_READ
: {
3844 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3845 uint64_t remain
= osd_op
->op
.extent
.length
;
3847 /* requested chunks exist in chunk_map ? */
3848 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3849 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3850 if (!p
.second
.is_missing()) {
3853 if (p
.second
.length
>= remain
) {
3857 remain
= remain
- p
.second
.length
;
3859 cursor
+= p
.second
.length
;
3864 dout(20) << __func__
<< " requested chunks don't exist in chunk_map " << dendl
;
3876 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
3878 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3879 << " " << cpp_strerror(r
) << dendl
;
3881 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
3882 if (p
== proxywrite_ops
.end()) {
3883 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
3886 ProxyWriteOpRef pwop
= p
->second
;
3887 ceph_assert(tid
== pwop
->objecter_tid
);
3888 ceph_assert(oid
== pwop
->soid
);
3890 proxywrite_ops
.erase(tid
);
3892 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3893 if (q
== in_progress_proxy_ops
.end()) {
3894 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3899 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3900 ceph_assert(in_progress_op
.size());
3901 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3902 in_progress_op
.end(),
3904 ceph_assert(it
!= in_progress_op
.end());
3905 in_progress_op
.erase(it
);
3906 if (in_progress_op
.size() == 0) {
3907 in_progress_proxy_ops
.erase(oid
);
3908 } else if (std::find(in_progress_op
.begin(),
3909 in_progress_op
.end(),
3910 pwop
->op
) != in_progress_op
.end()) {
3914 dout(20) << __func__
<< " " << oid
<< " tid " << tid
3915 << " in_progress_op size: "
3916 << in_progress_op
.size() << dendl
;
3920 osd
->logger
->inc(l_osd_tier_proxy_write
);
3922 auto m
= pwop
->op
->get_req
<MOSDOp
>();
3923 ceph_assert(m
!= NULL
);
3925 if (!pwop
->sent_reply
) {
3927 assert(pwop
->ctx
->reply
== nullptr);
3928 MOSDOpReply
*reply
= new MOSDOpReply(m
, r
, get_osdmap_epoch(), 0,
3929 true /* we claim it below */);
3930 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3931 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3932 reply
->claim_op_out_data(pwop
->ops
);
3933 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3934 osd
->send_message_osd_client(reply
, m
->get_connection());
3935 pwop
->sent_reply
= true;
3936 pwop
->ctx
->op
->mark_commit_sent();
3943 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
,
3944 vector
<ceph_tid_t
> *tids
)
3946 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3947 pwop
->canceled
= true;
3949 // cancel objecter op, if we can
3950 if (pwop
->objecter_tid
) {
3951 tids
->push_back(pwop
->objecter_tid
);
3954 proxywrite_ops
.erase(pwop
->objecter_tid
);
3955 pwop
->objecter_tid
= 0;
3959 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3960 ObjectContextRef obc
;
3964 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3967 start(ceph_clock_now()) {}
3969 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3970 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3971 int r
= results
.get
<0>();
3972 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
3973 pg
->finish_promote_manifest(r
, results_data
, obc
);
3975 pg
->finish_promote(r
, results_data
, obc
);
3977 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3981 class PromoteManifestCallback
: public PrimaryLogPG::CopyCallback
{
3982 ObjectContextRef obc
;
3985 PrimaryLogPG::OpContext
*ctx
;
3986 PrimaryLogPG::CopyCallbackResults promote_results
;
3988 PromoteManifestCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
, PrimaryLogPG::OpContext
*ctx
)
3991 start(ceph_clock_now()), ctx(ctx
) {}
3993 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3994 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3995 int r
= results
.get
<0>();
3996 promote_results
= results
;
3997 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_redirect()) {
3998 ctx
->user_at_version
= results_data
->user_version
;
4001 ctx
->pg
->execute_ctx(ctx
);
4003 if (r
!= -ECANCELED
) {
4005 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
4006 } else if (results_data
->should_requeue
) {
4008 ctx
->pg
->requeue_op(ctx
->op
);
4010 ctx
->pg
->close_op_ctx(ctx
);
4012 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
4014 friend struct PromoteFinisher
;
4017 struct PromoteFinisher
: public PrimaryLogPG::OpFinisher
{
4018 PromoteManifestCallback
*promote_callback
;
4020 explicit PromoteFinisher(PromoteManifestCallback
*promote_callback
)
4021 : promote_callback(promote_callback
) {
4024 int execute() override
{
4025 if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
4026 promote_callback
->ctx
->pg
->finish_promote(promote_callback
->promote_results
.get
<0>(),
4027 promote_callback
->promote_results
.get
<1>(),
4028 promote_callback
->obc
);
4029 } else if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
4030 promote_callback
->ctx
->pg
->finish_promote_manifest(promote_callback
->promote_results
.get
<0>(),
4031 promote_callback
->promote_results
.get
<1>(),
4032 promote_callback
->obc
);
4034 ceph_abort_msg("unrecognized manifest type");
4040 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
4041 const hobject_t
& missing_oid
,
4042 const object_locator_t
& oloc
,
4044 ObjectContextRef
*promote_obc
)
4046 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
4047 ceph_assert(hoid
!= hobject_t());
4048 if (m_scrubber
->write_blocked_by_scrub(hoid
)) {
4049 dout(10) << __func__
<< " " << hoid
4050 << " blocked by scrub" << dendl
;
4052 waiting_for_scrub
.push_back(op
);
4053 op
->mark_delayed("waiting for scrub");
4054 dout(10) << __func__
<< " " << hoid
4055 << " placing op in waiting_for_scrub" << dendl
;
4057 dout(10) << __func__
<< " " << hoid
4058 << " no op, dropping on the floor" << dendl
;
4062 if (op
&& !check_laggy_requeue(op
)) {
4065 if (!obc
) { // we need to create an ObjectContext
4066 ceph_assert(missing_oid
!= hobject_t());
4067 obc
= get_object_context(missing_oid
, true);
4073 * Before promote complete, if there are proxy-reads for the object,
4074 * for this case we don't use DONTNEED.
4076 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
4077 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
4078 if (q
== in_progress_proxy_ops
.end()) {
4079 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
4083 object_locator_t my_oloc
;
4085 if (!obc
->obs
.oi
.has_manifest()) {
4087 my_oloc
.pool
= pool
.info
.tier_of
;
4088 src_hoid
= obc
->obs
.oi
.soid
;
4089 cb
= new PromoteCallback(obc
, this);
4091 if (obc
->obs
.oi
.manifest
.is_chunked()) {
4092 src_hoid
= obc
->obs
.oi
.soid
;
4093 cb
= new PromoteCallback(obc
, this);
4094 } else if (obc
->obs
.oi
.manifest
.is_redirect()) {
4095 object_locator_t
src_oloc(obc
->obs
.oi
.manifest
.redirect_target
);
4097 src_hoid
= obc
->obs
.oi
.manifest
.redirect_target
;
4098 cb
= new PromoteCallback(obc
, this);
4100 ceph_abort_msg("unrecognized manifest type");
4104 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
4105 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
4106 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
4107 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
4108 start_copy(cb
, obc
, src_hoid
, my_oloc
, 0, flags
,
4109 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
4110 src_fadvise_flags
, 0);
4112 ceph_assert(obc
->is_blocked());
4115 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
4117 recovery_state
.update_stats(
4118 [](auto &history
, auto &stats
) {
4119 stats
.stats
.sum
.num_promote
++;
4124 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
4127 dout(10) << __func__
<< " " << ctx
<< dendl
;
4128 ctx
->reset_obs(ctx
->obc
);
4129 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
4130 OpRequestRef op
= ctx
->op
;
4131 auto m
= op
->get_req
<MOSDOp
>();
4132 ObjectContextRef obc
= ctx
->obc
;
4133 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
4135 // this method must be idempotent since we may call it several times
4136 // before we finally apply the resulting transaction.
4137 ctx
->op_t
.reset(new PGTransaction
);
4139 if (op
->may_write() || op
->may_cache()) {
4141 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
4142 pool
.info
.is_pool_snaps_mode()) {
4144 ctx
->snapc
= pool
.snapc
;
4146 // client specified snapc
4147 ctx
->snapc
.seq
= m
->get_snap_seq();
4148 ctx
->snapc
.snaps
= m
->get_snaps();
4149 filter_snapc(ctx
->snapc
.snaps
);
4151 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
4152 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
4153 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
4154 << " < snapset seq " << obc
->ssc
->snapset
.seq
4155 << " on " << obc
->obs
.oi
.soid
<< dendl
;
4156 reply_ctx(ctx
, -EOLDSNAPC
);
4161 ctx
->at_version
= get_next_version();
4162 ctx
->mtime
= m
->get_mtime();
4164 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
4165 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
4166 << " snapc " << ctx
->snapc
4167 << " snapset " << obc
->ssc
->snapset
4170 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
4171 << " ov " << obc
->obs
.oi
.version
4175 if (!ctx
->user_at_version
)
4176 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
4177 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
4181 osd_reqid_t reqid
= ctx
->op
->get_reqid();
4183 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
4184 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
4187 [[maybe_unused
]] auto span
= tracing::osd::tracer
.add_span(__func__
, ctx
->op
->osd_parent_span
);
4189 int result
= prepare_transaction(ctx
);
4193 osd_reqid_t reqid
= ctx
->op
->get_reqid();
4195 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
4196 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
4199 bool pending_async_reads
= !ctx
->pending_async_reads
.empty();
4200 if (result
== -EINPROGRESS
|| pending_async_reads
) {
4202 if (pending_async_reads
) {
4203 ceph_assert(pool
.info
.is_erasure());
4204 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
4205 ctx
->start_async_reads(this);
4210 if (result
== -EAGAIN
) {
4211 // clean up after the ctx
4216 bool ignore_out_data
= false;
4217 if (!ctx
->op_t
->empty() &&
4220 // successful update
4221 if (ctx
->op
->allows_returnvec()) {
4222 // enforce reasonable bound on the return buffer sizes
4223 for (auto& i
: *ctx
->ops
) {
4224 if (i
.outdata
.length() > cct
->_conf
->osd_max_write_op_reply_len
) {
4225 dout(10) << __func__
<< " op " << i
<< " outdata overflow" << dendl
;
4226 result
= -EOVERFLOW
; // overall result is overflow
4227 i
.rval
= -EOVERFLOW
;
4232 // legacy behavior -- zero result and return data etc.
4233 ignore_out_data
= true;
4238 // prepare the reply
4239 ctx
->reply
= new MOSDOpReply(m
, result
, get_osdmap_epoch(), 0,
4241 dout(20) << __func__
<< " alloc reply " << ctx
->reply
4242 << " result " << result
<< dendl
;
4245 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
4246 // finish side-effects
4248 do_osd_op_effects(ctx
, m
->get_connection());
4250 complete_read_ctx(result
, ctx
);
4254 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
4256 ceph_assert(op
->may_write() || op
->may_cache());
4259 recovery_state
.update_trim_to();
4261 // verify that we are doing this in order?
4262 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
4263 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
4264 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
4265 ceph_tid_t t
= m
->get_tid();
4266 client_t n
= m
->get_source().num();
4267 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
4268 if (p
== cm
.end()) {
4269 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
4272 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
4273 if (p
->second
> t
) {
4274 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
4275 ceph_abort_msg("out of order op");
4281 if (ctx
->update_log_only
) {
4283 do_osd_op_effects(ctx
, m
->get_connection());
4285 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
4286 // save just what we need from ctx
4287 MOSDOpReply
*reply
= ctx
->reply
;
4288 ctx
->reply
= nullptr;
4289 reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
4291 if (result
== -ENOENT
) {
4292 reply
->set_enoent_reply_versions(info
.last_update
,
4293 info
.last_user_version
);
4295 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4296 // append to pg log for dup detection - don't save buffers for now
4297 record_write_error(op
, soid
, reply
, result
,
4298 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
4303 // no need to capture PG ref, repop cancel will handle that
4304 // Can capture the ctx by pointer, it's owned by the repop
4305 ctx
->register_on_commit(
4308 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
4310 if (m
&& !ctx
->sent_reply
) {
4311 MOSDOpReply
*reply
= ctx
->reply
;
4312 ctx
->reply
= nullptr;
4313 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4314 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
4315 osd
->send_message_osd_client(reply
, m
->get_connection());
4316 ctx
->sent_reply
= true;
4317 ctx
->op
->mark_commit_sent();
4320 ctx
->register_on_success(
4324 ctx
->op
? ctx
->op
->get_req()->get_connection() :
4327 ctx
->register_on_finish(
4332 // issue replica writes
4333 ceph_tid_t rep_tid
= osd
->get_tid();
4335 RepGather
*repop
= new_repop(ctx
, rep_tid
);
4337 issue_repop(repop
, ctx
);
4342 void PrimaryLogPG::close_op_ctx(OpContext
*ctx
) {
4343 release_object_locks(ctx
->lock_manager
);
4347 for (auto p
= ctx
->on_finish
.begin(); p
!= ctx
->on_finish
.end();
4348 ctx
->on_finish
.erase(p
++)) {
4354 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
4357 osd
->reply_op_error(ctx
->op
, r
);
4361 void PrimaryLogPG::log_op_stats(const OpRequest
& op
,
4363 const uint64_t outb
)
4365 auto m
= op
.get_req
<MOSDOp
>();
4366 const utime_t now
= ceph_clock_now();
4368 const utime_t latency
= now
- m
->get_recv_stamp();
4369 const utime_t process_latency
= now
- op
.get_dequeued_time();
4371 osd
->logger
->inc(l_osd_op
);
4373 osd
->logger
->inc(l_osd_op_outb
, outb
);
4374 osd
->logger
->inc(l_osd_op_inb
, inb
);
4375 osd
->logger
->tinc(l_osd_op_lat
, latency
);
4376 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
4378 if (op
.may_read() && op
.may_write()) {
4379 osd
->logger
->inc(l_osd_op_rw
);
4380 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
4381 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
4382 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
4383 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
4384 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
4385 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
4386 } else if (op
.may_read()) {
4387 osd
->logger
->inc(l_osd_op_r
);
4388 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
4389 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
4390 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
4391 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
4392 } else if (op
.may_write() || op
.may_cache()) {
4393 osd
->logger
->inc(l_osd_op_w
);
4394 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
4395 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
4396 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
4397 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
4402 dout(15) << "log_op_stats " << *m
4405 << " lat " << latency
<< dendl
;
4407 if (m_dynamic_perf_stats
.is_enabled()) {
4408 m_dynamic_perf_stats
.add(osd
, info
, op
, inb
, outb
, latency
);
4412 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4413 const std::list
<OSDPerfMetricQuery
> &queries
)
4415 m_dynamic_perf_stats
.set_queries(queries
);
4418 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats
*stats
)
4420 std::swap(m_dynamic_perf_stats
, *stats
);
4423 void PrimaryLogPG::do_scan(
4425 ThreadPool::TPHandle
&handle
)
4427 auto m
= op
->get_req
<MOSDPGScan
>();
4428 ceph_assert(m
->get_type() == MSG_OSD_PG_SCAN
);
4429 dout(10) << "do_scan " << *m
<< dendl
;
4434 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
4436 auto dpp
= get_dpp();
4437 if (osd
->check_backfill_full(dpp
)) {
4438 dout(1) << __func__
<< ": Canceling backfill: Full." << dendl
;
4439 queue_peering_event(
4441 std::make_shared
<PGPeeringEvent
>(
4444 PeeringState::BackfillTooFull())));
4448 BackfillInterval bi
;
4449 bi
.begin
= m
->begin
;
4450 // No need to flush, there won't be any in-progress writes occurring
4453 cct
->_conf
->osd_backfill_scan_min
,
4454 cct
->_conf
->osd_backfill_scan_max
,
4457 MOSDPGScan
*reply
= new MOSDPGScan(
4458 MOSDPGScan::OP_SCAN_DIGEST
,
4460 get_osdmap_epoch(), m
->query_epoch
,
4461 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
4462 encode(bi
.objects
, reply
->get_data());
4463 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4467 case MOSDPGScan::OP_SCAN_DIGEST
:
4469 pg_shard_t from
= m
->from
;
4471 // Check that from is in backfill_targets vector
4472 ceph_assert(is_backfill_target(from
));
4474 BackfillInterval
& bi
= peer_backfill_info
[from
];
4475 bi
.begin
= m
->begin
;
4477 auto p
= m
->get_data().cbegin();
4479 // take care to preserve ordering!
4481 decode_noclear(bi
.objects
, p
);
4482 dout(10) << __func__
<< " bi.begin=" << bi
.begin
<< " bi.end=" << bi
.end
4483 << " bi.objects.size()=" << bi
.objects
.size() << dendl
;
4485 if (waiting_on_backfill
.erase(from
)) {
4486 if (waiting_on_backfill
.empty()) {
4488 peer_backfill_info
.size() ==
4489 get_backfill_targets().size());
4490 finish_recovery_op(hobject_t::get_max());
4493 // we canceled backfill for a while due to a too full, and this
4494 // is an extra response from a non-too-full peer
4495 dout(20) << __func__
<< " canceled backfill (too full?)" << dendl
;
4502 void PrimaryLogPG::do_backfill(OpRequestRef op
)
4504 auto m
= op
->get_req
<MOSDPGBackfill
>();
4505 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
4506 dout(10) << "do_backfill " << *m
<< dendl
;
4511 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
4513 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
4515 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
4516 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
4519 spg_t(info
.pgid
.pgid
, get_primary().shard
));
4520 reply
->set_priority(get_recovery_op_priority());
4521 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4522 queue_peering_event(
4524 std::make_shared
<PGPeeringEvent
>(
4531 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
4533 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
4535 ObjectStore::Transaction t
;
4536 recovery_state
.update_backfill_progress(
4539 m
->op
== MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
4542 int tr
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4543 ceph_assert(tr
== 0);
4547 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
4549 ceph_assert(is_primary());
4550 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
4551 finish_recovery_op(hobject_t::get_max());
4557 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
4559 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
4561 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
4562 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
4566 ObjectStore::Transaction t
;
4567 for (auto& p
: m
->ls
) {
4568 if (is_remote_backfilling()) {
4570 int r
= osd
->store
->stat(ch
, ghobject_t(p
.first
, ghobject_t::NO_GEN
,
4571 pg_whoami
.shard
) , &st
);
4573 sub_local_num_bytes(st
.st_size
);
4575 if (pool
.info
.is_erasure()) {
4577 int r
= osd
->store
->getattr(
4579 ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
4583 object_info_t
oi(bv
);
4584 usersize
= oi
.size
* pgbackend
->get_ec_data_chunk_count();
4586 dout(0) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4587 << " can't get object info" << dendl
;
4591 usersize
= st
.st_size
;
4593 sub_num_bytes(usersize
);
4594 dout(10) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4595 << " sub actual data by " << st
.st_size
4596 << " sub num_bytes by " << usersize
4600 remove_snap_mapped_object(t
, p
.first
);
4602 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4603 ceph_assert(r
== 0);
4606 int PrimaryLogPG::trim_object(
4607 bool first
, const hobject_t
&coid
, snapid_t snap_to_trim
,
4608 PrimaryLogPG::OpContextUPtr
*ctxp
)
4614 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
4615 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
4616 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4617 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
4621 hobject_t head_oid
= coid
.get_head();
4622 ObjectContextRef head_obc
= get_object_context(head_oid
, false);
4624 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4625 << " repair needed, no snapset obc for " << head_oid
;
4629 SnapSet
& snapset
= obc
->ssc
->snapset
;
4631 object_info_t
&coi
= obc
->obs
.oi
;
4632 auto citer
= snapset
.clone_snaps
.find(coid
.snap
);
4633 if (citer
== snapset
.clone_snaps
.end()) {
4634 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
4635 << " for object " << coid
<< "\n";
4638 set
<snapid_t
> old_snaps(citer
->second
.begin(), citer
->second
.end());
4639 if (old_snaps
.empty()) {
4640 osd
->clog
->error() << "No object info snaps for object " << coid
;
4644 dout(10) << coid
<< " old_snaps " << old_snaps
4645 << " old snapset " << snapset
<< dendl
;
4646 if (snapset
.seq
== 0) {
4647 osd
->clog
->error() << "No snapset.seq for object " << coid
;
4651 set
<snapid_t
> new_snaps
;
4652 const OSDMapRef
& osdmap
= get_osdmap();
4653 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
4654 i
!= old_snaps
.end();
4656 if (!osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *i
) &&
4657 *i
!= snap_to_trim
) {
4658 new_snaps
.insert(*i
);
4662 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
4664 if (new_snaps
.empty()) {
4665 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
4666 if (p
== snapset
.clones
.end()) {
4667 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
4672 OpContextUPtr ctx
= simple_opc_create(obc
);
4673 ctx
->head_obc
= head_obc
;
4675 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4679 close_op_ctx(ctx
.release());
4680 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
4684 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4688 close_op_ctx(ctx
.release());
4689 dout(10) << __func__
<< ": Unable to get a wlock on " << head_oid
<< dendl
;
4693 ctx
->at_version
= get_next_version();
4695 PGTransaction
*t
= ctx
->op_t
.get();
4697 if (new_snaps
.empty()) {
4699 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
4700 << new_snaps
<< " ... deleting" << dendl
;
4703 ceph_assert(p
!= snapset
.clones
.end());
4705 snapid_t last
= coid
.snap
;
4706 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
4708 if (p
!= snapset
.clones
.begin()) {
4709 // not the oldest... merge overlap into next older clone
4710 vector
<snapid_t
>::iterator n
= p
- 1;
4711 hobject_t prev_coid
= coid
;
4712 prev_coid
.snap
= *n
;
4713 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
4715 if (adjust_prev_bytes
)
4716 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
4718 snapset
.clone_overlap
[*n
].intersection_of(
4719 snapset
.clone_overlap
[*p
]);
4721 if (adjust_prev_bytes
)
4722 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
4724 ctx
->delta_stats
.num_objects
--;
4726 ctx
->delta_stats
.num_objects_dirty
--;
4728 ctx
->delta_stats
.num_objects_omap
--;
4729 if (coi
.is_whiteout()) {
4730 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
4731 ctx
->delta_stats
.num_whiteouts
--;
4733 ctx
->delta_stats
.num_object_clones
--;
4734 if (coi
.is_cache_pinned())
4735 ctx
->delta_stats
.num_objects_pinned
--;
4736 if (coi
.has_manifest()) {
4737 dec_all_refcount_manifest(coi
, ctx
.get());
4738 ctx
->delta_stats
.num_objects_manifest
--;
4740 obc
->obs
.exists
= false;
4742 snapset
.clones
.erase(p
);
4743 snapset
.clone_overlap
.erase(last
);
4744 snapset
.clone_size
.erase(last
);
4745 snapset
.clone_snaps
.erase(last
);
4749 pg_log_entry_t::DELETE
,
4752 ctx
->obs
->oi
.version
,
4764 coi
= object_info_t(coid
);
4766 ctx
->at_version
.version
++;
4768 // save adjusted snaps for this object
4769 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
4770 snapset
.clone_snaps
[coid
.snap
] =
4771 vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
4772 // we still do a 'modify' event on this object just to trigger a
4773 // snapmapper.update ... :(
4775 coi
.prior_version
= coi
.version
;
4776 coi
.version
= ctx
->at_version
;
4778 encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4779 t
->setattr(coid
, OI_ATTR
, bl
);
4783 pg_log_entry_t::MODIFY
,
4792 ctx
->at_version
.version
++;
4800 // save head snapset
4801 dout(10) << coid
<< " new snapset " << snapset
<< " on "
4802 << head_obc
->obs
.oi
<< dendl
;
4803 if (snapset
.clones
.empty() &&
4804 (head_obc
->obs
.oi
.is_whiteout() &&
4805 !(head_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
4806 !head_obc
->obs
.oi
.is_cache_pinned())) {
4807 // NOTE: this arguably constitutes minor interference with the
4808 // tiering agent if this is a cache tier since a snap trim event
4809 // is effectively evicting a whiteout we might otherwise want to
4811 dout(10) << coid
<< " removing " << head_oid
<< dendl
;
4814 pg_log_entry_t::DELETE
,
4817 head_obc
->obs
.oi
.version
,
4823 dout(10) << "removing snap head" << dendl
;
4824 object_info_t
& oi
= head_obc
->obs
.oi
;
4825 ctx
->delta_stats
.num_objects
--;
4826 if (oi
.is_dirty()) {
4827 ctx
->delta_stats
.num_objects_dirty
--;
4830 ctx
->delta_stats
.num_objects_omap
--;
4831 if (oi
.is_whiteout()) {
4832 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
4833 ctx
->delta_stats
.num_whiteouts
--;
4835 if (oi
.is_cache_pinned()) {
4836 ctx
->delta_stats
.num_objects_pinned
--;
4838 if (oi
.has_manifest()) {
4839 ctx
->delta_stats
.num_objects_manifest
--;
4840 dec_all_refcount_manifest(oi
, ctx
.get());
4842 head_obc
->obs
.exists
= false;
4843 head_obc
->obs
.oi
= object_info_t(head_oid
);
4844 t
->remove(head_oid
);
4846 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
4847 // filter SnapSet::snaps for the benefit of pre-octopus
4848 // peers. This is perhaps overly conservative in that I'm not
4849 // certain they need this, but let's be conservative here.
4850 dout(10) << coid
<< " filtering snapset on " << head_oid
<< dendl
;
4851 snapset
.filter(pool
.info
);
4853 snapset
.snaps
.clear();
4855 dout(10) << coid
<< " writing updated snapset on " << head_oid
4856 << ", snapset is " << snapset
<< dendl
;
4859 pg_log_entry_t::MODIFY
,
4862 head_obc
->obs
.oi
.version
,
4869 head_obc
->obs
.oi
.prior_version
= head_obc
->obs
.oi
.version
;
4870 head_obc
->obs
.oi
.version
= ctx
->at_version
;
4872 map
<string
, bufferlist
, less
<>> attrs
;
4874 encode(snapset
, bl
);
4875 attrs
[SS_ATTR
] = std::move(bl
);
4878 encode(head_obc
->obs
.oi
, bl
,
4879 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4880 attrs
[OI_ATTR
] = std::move(bl
);
4881 t
->setattrs(head_oid
, attrs
);
4884 *ctxp
= std::move(ctx
);
4888 void PrimaryLogPG::kick_snap_trim()
4890 ceph_assert(is_active());
4891 ceph_assert(is_primary());
4893 !state_test(PG_STATE_PREMERGE
) &&
4894 !snap_trimq
.empty()) {
4895 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM
)) {
4896 dout(10) << __func__
<< ": nosnaptrim set, not kicking" << dendl
;
4898 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
4899 snap_trimmer_machine
.process_event(KickTrim());
4904 void PrimaryLogPG::snap_trimmer_scrub_complete()
4906 if (is_primary() && is_active() && is_clean()) {
4907 ceph_assert(!snap_trimq
.empty());
4908 snap_trimmer_machine
.process_event(ScrubComplete());
4912 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
4914 if (recovery_state
.is_deleting() || pg_has_reset_since(queued
)) {
4918 ceph_assert(is_primary());
4920 dout(10) << "snap_trimmer posting" << dendl
;
4921 snap_trimmer_machine
.process_event(DoSnapWork());
4922 dout(10) << "snap_trimmer complete" << dendl
;
4928 template<typename U
, typename V
>
4929 int do_cmp_xattr(int op
, const U
& lhs
, const V
& rhs
)
4932 case CEPH_OSD_CMPXATTR_OP_EQ
:
4934 case CEPH_OSD_CMPXATTR_OP_NE
:
4936 case CEPH_OSD_CMPXATTR_OP_GT
:
4938 case CEPH_OSD_CMPXATTR_OP_GTE
:
4940 case CEPH_OSD_CMPXATTR_OP_LT
:
4942 case CEPH_OSD_CMPXATTR_OP_LTE
:
4949 } // anonymous namespace
4951 int PrimaryLogPG::do_xattr_cmp_u64(int op
, uint64_t v1
, bufferlist
& xattr
)
4955 if (xattr
.length()) {
4956 const char* first
= xattr
.c_str();
4957 if (auto [p
, ec
] = std::from_chars(first
, first
+ xattr
.length(), v2
);
4958 ec
!= std::errc()) {
4964 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
4965 return do_cmp_xattr(op
, v1
, v2
);
4968 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4970 string_view
v2s(xattr
.c_str(), xattr
.length());
4971 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
4972 return do_cmp_xattr(op
, v1s
, v2s
);
4975 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4977 ceph_osd_op
& op
= osd_op
.op
;
4978 vector
<OSDOp
> write_ops(1);
4979 OSDOp
& write_op
= write_ops
[0];
4980 uint64_t write_length
= op
.writesame
.length
;
4986 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4989 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4990 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4994 while (write_length
) {
4995 write_op
.indata
.append(osd_op
.indata
);
4996 write_length
-= op
.writesame
.data_length
;
4999 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
5000 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
5001 write_op
.op
.extent
.length
= op
.writesame
.length
;
5002 result
= do_osd_ops(ctx
, write_ops
);
5004 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
5009 // ========================================================================
5010 // low level osd ops
5012 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
5014 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
5015 bufferlist header
, vals
;
5016 int r
= _get_tmap(ctx
, &header
, &vals
);
5018 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
5023 vector
<OSDOp
> ops(3);
5025 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
5026 ops
[0].op
.extent
.offset
= 0;
5027 ops
[0].op
.extent
.length
= 0;
5029 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
5030 ops
[1].indata
= std::move(header
);
5032 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
5033 ops
[2].indata
= std::move(vals
);
5035 return do_osd_ops(ctx
, ops
);
5038 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
5039 OSDOp
& osd_op
, bufferlist
& bl
)
5043 map
<string
, bufferlist
> m
;
5045 auto p
= bl
.cbegin();
5048 ceph_assert(p
.end());
5058 case CEPH_OSD_TMAP_SET
: // insert key
5066 case CEPH_OSD_TMAP_RM
: // remove key
5068 if (!m
.count(key
)) {
5073 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
5077 case CEPH_OSD_TMAP_HDR
: // update header
5089 encode(header
, obl
);
5093 vector
<OSDOp
> nops(1);
5094 OSDOp
& newop
= nops
[0];
5095 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5096 newop
.op
.extent
.offset
= 0;
5097 newop
.op
.extent
.length
= obl
.length();
5099 do_osd_ops(ctx
, nops
);
5103 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::const_iterator
& bp
, OSDOp
& osd_op
)
5105 bufferlist::const_iterator orig_bp
= bp
;
5108 dout(10) << "tmapup is a no-op" << dendl
;
5110 // read the whole object
5111 vector
<OSDOp
> nops(1);
5112 OSDOp
& newop
= nops
[0];
5113 newop
.op
.op
= CEPH_OSD_OP_READ
;
5114 newop
.op
.extent
.offset
= 0;
5115 newop
.op
.extent
.length
= 0;
5116 result
= do_osd_ops(ctx
, nops
);
5118 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
5120 dout(30) << " starting is \n";
5121 newop
.outdata
.hexdump(*_dout
);
5124 auto ip
= newop
.outdata
.cbegin();
5127 dout(30) << "the update command is: \n";
5128 osd_op
.indata
.hexdump(*_dout
);
5134 if (newop
.outdata
.length()) {
5138 dout(10) << "tmapup header " << header
.length() << dendl
;
5140 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
5143 dout(10) << "tmapup new header " << header
.length() << dendl
;
5146 encode(header
, obl
);
5148 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
5151 bufferlist newkeydata
;
5152 string nextkey
, last_in_key
;
5154 bool have_next
= false;
5157 decode(nextkey
, ip
);
5158 decode(nextval
, ip
);
5160 while (!bp
.end() && !result
) {
5167 catch (ceph::buffer::error
& e
) {
5170 if (key
< last_in_key
) {
5171 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
5172 << "', falling back to an inefficient (unsorted) update" << dendl
;
5174 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
5178 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
5180 // skip existing intervening keys
5181 bool key_exists
= false;
5182 while (have_next
&& !key_exists
) {
5183 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
5186 if (nextkey
< key
) {
5188 encode(nextkey
, newkeydata
);
5189 encode(nextval
, newkeydata
);
5190 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5192 // don't copy; discard old value. and stop.
5193 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
5198 decode(nextkey
, ip
);
5199 decode(nextval
, ip
);
5205 if (op
== CEPH_OSD_TMAP_SET
) {
5210 catch (ceph::buffer::error
& e
) {
5213 encode(key
, newkeydata
);
5214 encode(val
, newkeydata
);
5215 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
5217 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
5225 catch (ceph::buffer::error
& e
) {
5228 encode(key
, newkeydata
);
5229 encode(val
, newkeydata
);
5230 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
5232 } else if (op
== CEPH_OSD_TMAP_RM
) {
5237 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
5240 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
5247 encode(nextkey
, newkeydata
);
5248 encode(nextval
, newkeydata
);
5249 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5253 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
5254 dout(20) << " keep trailing " << rest
.length()
5255 << " at " << newkeydata
.length() << dendl
;
5256 newkeydata
.claim_append(rest
);
5259 // encode final key count + key data
5260 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
5262 obl
.claim_append(newkeydata
);
5265 dout(30) << " final is \n";
5266 obl
.hexdump(*_dout
);
5270 auto tp
= obl
.cbegin();
5273 map
<string
,bufferlist
> d
;
5275 ceph_assert(tp
.end());
5276 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
5281 dout(20) << "tmapput write " << obl
.length() << dendl
;
5282 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5283 newop
.op
.extent
.offset
= 0;
5284 newop
.op
.extent
.length
= obl
.length();
5286 do_osd_ops(ctx
, nops
);
5292 static int check_offset_and_length(uint64_t offset
, uint64_t length
,
5293 uint64_t max
, DoutPrefixProvider
*dpp
)
5295 if (offset
>= max
||
5297 offset
+ length
> max
) {
5298 ldpp_dout(dpp
, 10) << __func__
<< " "
5299 << "osd_max_object_size: " << max
5300 << "; Hard limit of object size is 4GB." << dendl
;
5307 struct FillInVerifyExtent
: public Context
{
5310 bufferlist
*outdatap
;
5311 std::optional
<uint32_t> maybe_crc
;
5316 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
5317 std::optional
<uint32_t> mc
, uint64_t size
,
5318 OSDService
*osd
, hobject_t soid
, uint32_t flags
) :
5319 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
5320 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
5321 void finish(int len
) override
{
5329 // whole object? can we verify the checksum?
5330 if (maybe_crc
&& *r
== size
) {
5331 uint32_t crc
= outdatap
->crc32c(-1);
5332 if (maybe_crc
!= crc
) {
5333 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
5334 << " != expected 0x" << *maybe_crc
5335 << std::dec
<< " on " << soid
;
5336 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
5345 struct ToSparseReadResult
: public Context
{
5347 bufferlist
* data_bl
;
5348 uint64_t data_offset
;
5350 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
5352 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
5353 void finish(int r
) override
{
5361 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
5362 encode(extents
, outdata
);
5363 encode_destructively(*data_bl
, outdata
);
5364 data_bl
->swap(outdata
);
5368 template<typename V
>
5369 static string
list_keys(const map
<string
, V
>& m
) {
5371 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5375 s
.append(itr
->first
);
5380 template<typename T
>
5381 static string
list_entries(const T
& m
) {
5383 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5392 void PrimaryLogPG::maybe_create_new_object(
5394 bool ignore_transaction
)
5396 ObjectState
& obs
= ctx
->new_obs
;
5398 ctx
->delta_stats
.num_objects
++;
5400 ceph_assert(!obs
.oi
.is_whiteout());
5401 obs
.oi
.new_object();
5402 if (!ignore_transaction
)
5403 ctx
->op_t
->create(obs
.oi
.soid
);
5404 } else if (obs
.oi
.is_whiteout()) {
5405 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
5406 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
5407 --ctx
->delta_stats
.num_whiteouts
;
5411 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
5414 explicit ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
5417 int execute() override
{
5422 struct C_ChecksumRead
: public Context
{
5423 PrimaryLogPG
*primary_log_pg
;
5425 Checksummer::CSumType csum_type
;
5426 bufferlist init_value_bl
;
5427 ceph_le64 read_length
;
5429 Context
*fill_extent_ctx
;
5431 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5432 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
5433 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5434 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5435 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5436 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
5437 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5438 &read_bl
, maybe_crc
, size
,
5439 osd
, soid
, flags
)) {
5441 ~C_ChecksumRead() override
{
5442 delete fill_extent_ctx
;
5445 void finish(int r
) override
{
5446 fill_extent_ctx
->complete(r
);
5447 fill_extent_ctx
= nullptr;
5449 if (osd_op
.rval
>= 0) {
5450 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5451 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
5452 &init_value_bl_it
, read_bl
);
5457 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
5458 bufferlist::const_iterator
*bl_it
)
5460 dout(20) << __func__
<< dendl
;
5462 auto& op
= osd_op
.op
;
5463 if (op
.checksum
.chunk_size
> 0) {
5464 if (op
.checksum
.length
== 0) {
5465 dout(10) << __func__
<< ": length required when chunk size provided"
5469 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5470 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
5475 auto& oi
= ctx
->new_obs
.oi
;
5476 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
5477 // zeroed offset+length implies checksum whole object
5478 op
.checksum
.length
= oi
.size
;
5479 } else if (op
.checksum
.offset
>= oi
.size
) {
5480 // read size was trimmed to zero, do nothing
5481 // see PrimaryLogPG::do_read
5483 } else if (op
.extent
.offset
+ op
.extent
.length
> oi
.size
) {
5484 op
.extent
.length
= oi
.size
- op
.extent
.offset
;
5485 if (op
.checksum
.chunk_size
> 0 &&
5486 op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5487 dout(10) << __func__
<< ": length (trimmed to 0x"
5488 << std::hex
<< op
.checksum
.length
5489 << ") not aligned to chunk size 0x"
5490 << op
.checksum
.chunk_size
<< std::dec
5496 Checksummer::CSumType csum_type
;
5497 switch (op
.checksum
.type
) {
5498 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
5499 csum_type
= Checksummer::CSUM_XXHASH32
;
5501 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
5502 csum_type
= Checksummer::CSUM_XXHASH64
;
5504 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
5505 csum_type
= Checksummer::CSUM_CRC32C
;
5508 dout(10) << __func__
<< ": unknown crc type ("
5509 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
5513 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
5514 if (bl_it
->get_remaining() < csum_init_value_size
) {
5515 dout(10) << __func__
<< ": init value not provided" << dendl
;
5519 bufferlist init_value_bl
;
5520 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
5521 csum_init_value_size
);
5522 *bl_it
+= csum_init_value_size
;
5524 if (pool
.info
.is_erasure() && op
.checksum
.length
> 0) {
5525 // If there is a data digest and it is possible we are reading
5526 // entire object, pass the digest.
5527 std::optional
<uint32_t> maybe_crc
;
5528 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5529 op
.checksum
.length
>= oi
.size
) {
5530 maybe_crc
= oi
.data_digest
;
5534 auto& soid
= oi
.soid
;
5535 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
5536 std::move(init_value_bl
), maybe_crc
,
5537 oi
.size
, osd
, soid
, op
.flags
);
5539 ctx
->pending_async_reads
.push_back({
5540 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
5541 {&checksum_ctx
->read_bl
, checksum_ctx
}});
5543 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5544 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5545 new ReadFinisher(osd_op
));
5546 return -EINPROGRESS
;
5550 std::vector
<OSDOp
> read_ops(1);
5551 auto& read_op
= read_ops
[0];
5552 if (op
.checksum
.length
> 0) {
5553 read_op
.op
.op
= CEPH_OSD_OP_READ
;
5554 read_op
.op
.flags
= op
.flags
;
5555 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
5556 read_op
.op
.extent
.length
= op
.checksum
.length
;
5557 read_op
.op
.extent
.truncate_size
= 0;
5558 read_op
.op
.extent
.truncate_seq
= 0;
5560 int r
= do_osd_ops(ctx
, read_ops
);
5562 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
5567 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5568 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
5572 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
5573 Checksummer::CSumType csum_type
,
5574 bufferlist::const_iterator
*init_value_bl_it
,
5575 const bufferlist
&read_bl
) {
5576 dout(20) << __func__
<< dendl
;
5578 auto& op
= osd_op
.op
;
5580 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
5581 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
5582 << op
.checksum
.length
<< dendl
;
5586 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
5587 op
.checksum
.chunk_size
: read_bl
.length());
5588 uint32_t csum_count
= (csum_chunk_size
> 0 ?
5589 read_bl
.length() / csum_chunk_size
: 0);
5592 bufferptr csum_data
;
5593 if (csum_count
> 0) {
5594 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
5595 csum_data
= ceph::buffer::create(csum_value_size
* csum_count
);
5597 csum
.append(csum_data
);
5599 switch (csum_type
) {
5600 case Checksummer::CSUM_XXHASH32
:
5602 Checksummer::xxhash32::init_value_t init_value
;
5603 decode(init_value
, *init_value_bl_it
);
5604 Checksummer::calculate
<Checksummer::xxhash32
>(
5605 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5609 case Checksummer::CSUM_XXHASH64
:
5611 Checksummer::xxhash64::init_value_t init_value
;
5612 decode(init_value
, *init_value_bl_it
);
5613 Checksummer::calculate
<Checksummer::xxhash64
>(
5614 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5618 case Checksummer::CSUM_CRC32C
:
5620 Checksummer::crc32c::init_value_t init_value
;
5621 decode(init_value
, *init_value_bl_it
);
5622 Checksummer::calculate
<Checksummer::crc32c
>(
5623 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5632 encode(csum_count
, osd_op
.outdata
);
5633 osd_op
.outdata
.claim_append(csum
);
5637 struct C_ExtentCmpRead
: public Context
{
5638 PrimaryLogPG
*primary_log_pg
;
5640 ceph_le64 read_length
{};
5642 Context
*fill_extent_ctx
;
5644 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5645 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5646 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5647 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5648 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5649 &read_bl
, maybe_crc
, size
,
5650 osd
, soid
, flags
)) {
5652 ~C_ExtentCmpRead() override
{
5653 delete fill_extent_ctx
;
5656 void finish(int r
) override
{
5660 delete fill_extent_ctx
;
5662 fill_extent_ctx
->complete(r
);
5664 fill_extent_ctx
= nullptr;
5666 if (osd_op
.rval
>= 0) {
5667 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
5672 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
5674 dout(20) << __func__
<< dendl
;
5675 ceph_osd_op
& op
= osd_op
.op
;
5677 auto& oi
= ctx
->new_obs
.oi
;
5678 uint64_t size
= oi
.size
;
5679 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5680 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
5681 size
= op
.extent
.truncate_size
;
5684 if (op
.extent
.offset
>= size
) {
5685 op
.extent
.length
= 0;
5686 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5687 op
.extent
.length
= size
- op
.extent
.offset
;
5690 if (op
.extent
.length
== 0) {
5691 dout(20) << __func__
<< " zero length extent" << dendl
;
5692 return finish_extent_cmp(osd_op
, bufferlist
{});
5693 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
5694 dout(20) << __func__
<< " object DNE" << dendl
;
5695 return finish_extent_cmp(osd_op
, {});
5696 } else if (pool
.info
.is_erasure()) {
5697 // If there is a data digest and it is possible we are reading
5698 // entire object, pass the digest.
5699 std::optional
<uint32_t> maybe_crc
;
5700 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5701 op
.checksum
.length
>= oi
.size
) {
5702 maybe_crc
= oi
.data_digest
;
5706 auto& soid
= oi
.soid
;
5707 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
5708 osd
, soid
, op
.flags
);
5709 ctx
->pending_async_reads
.push_back({
5710 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
5711 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
5713 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5715 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5716 new ReadFinisher(osd_op
));
5717 return -EINPROGRESS
;
5721 vector
<OSDOp
> read_ops(1);
5722 OSDOp
& read_op
= read_ops
[0];
5724 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
5725 read_op
.op
.extent
.offset
= op
.extent
.offset
;
5726 read_op
.op
.extent
.length
= op
.extent
.length
;
5727 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
5728 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
5730 int result
= do_osd_ops(ctx
, read_ops
);
5732 derr
<< __func__
<< " failed " << result
<< dendl
;
5735 return finish_extent_cmp(osd_op
, read_op
.outdata
);
5738 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
5740 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
5741 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
5742 if (osd_op
.indata
[idx
] != read_byte
) {
5743 return (-MAX_ERRNO
- idx
);
5750 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5751 dout(20) << __func__
<< dendl
;
5752 auto& op
= osd_op
.op
;
5753 auto& oi
= ctx
->new_obs
.oi
;
5754 auto& soid
= oi
.soid
;
5755 __u32 seq
= oi
.truncate_seq
;
5756 uint64_t size
= oi
.size
;
5757 bool trimmed_read
= false;
5759 dout(30) << __func__
<< " oi.size: " << oi
.size
<< dendl
;
5760 dout(30) << __func__
<< " oi.truncate_seq: " << oi
.truncate_seq
<< dendl
;
5761 dout(30) << __func__
<< " op.extent.truncate_seq: " << op
.extent
.truncate_seq
<< dendl
;
5762 dout(30) << __func__
<< " op.extent.truncate_size: " << op
.extent
.truncate_size
<< dendl
;
5764 // are we beyond truncate_size?
5765 if ( (seq
< op
.extent
.truncate_seq
) &&
5766 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5767 (size
> op
.extent
.truncate_size
) )
5768 size
= op
.extent
.truncate_size
;
5770 if (op
.extent
.length
== 0) //length is zero mean read the whole object
5771 op
.extent
.length
= size
;
5773 if (op
.extent
.offset
>= size
) {
5774 op
.extent
.length
= 0;
5775 trimmed_read
= true;
5776 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5777 op
.extent
.length
= size
- op
.extent
.offset
;
5778 trimmed_read
= true;
5781 dout(30) << __func__
<< "op.extent.length is now " << op
.extent
.length
<< dendl
;
5783 // read into a buffer
5785 if (trimmed_read
&& op
.extent
.length
== 0) {
5786 // read size was trimmed to zero and it is expected to do nothing
5787 // a read operation of 0 bytes does *not* do nothing, this is why
5788 // the trimmed_read boolean is needed
5789 } else if (pool
.info
.is_erasure()) {
5790 // The initialisation below is required to silence a false positive
5791 // -Wmaybe-uninitialized warning
5792 std::optional
<uint32_t> maybe_crc
;
5793 // If there is a data digest and it is possible we are reading
5794 // entire object, pass the digest. FillInVerifyExtent will
5795 // will check the oi.size again.
5796 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
5797 op
.extent
.length
>= oi
.size
)
5798 maybe_crc
= oi
.data_digest
;
5799 ctx
->pending_async_reads
.push_back(
5801 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
5802 make_pair(&osd_op
.outdata
,
5803 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
5804 &osd_op
.outdata
, maybe_crc
, oi
.size
,
5805 osd
, soid
, op
.flags
))));
5806 dout(10) << " async_read noted for " << soid
<< dendl
;
5808 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5809 new ReadFinisher(osd_op
));
5811 int r
= pgbackend
->objects_read_sync(
5812 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
5813 // whole object? can we verify the checksum?
5814 if (r
>= 0 && op
.extent
.offset
== 0 &&
5815 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5816 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
5817 if (oi
.data_digest
!= crc
) {
5818 osd
->clog
->error() << info
.pgid
<< std::hex
5819 << " full-object read crc 0x" << crc
5820 << " != expected 0x" << oi
.data_digest
5821 << std::dec
<< " on " << soid
;
5822 r
= -EIO
; // try repair later
5826 r
= rep_repair_primary_object(soid
, ctx
);
5829 op
.extent
.length
= r
;
5830 else if (r
== -EAGAIN
) {
5834 op
.extent
.length
= 0;
5836 dout(10) << " read got " << r
<< " / " << op
.extent
.length
5837 << " bytes from obj " << soid
<< dendl
;
5840 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5841 ctx
->delta_stats
.num_rd
++;
5846 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5847 dout(20) << __func__
<< dendl
;
5848 auto& op
= osd_op
.op
;
5849 auto& oi
= ctx
->new_obs
.oi
;
5850 auto& soid
= oi
.soid
;
5852 if (op
.extent
.truncate_seq
) {
5853 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
5858 if (pool
.info
.is_erasure()) {
5859 // translate sparse read to a normal one if not supported
5860 uint64_t offset
= op
.extent
.offset
;
5861 uint64_t length
= op
.extent
.length
;
5862 if (offset
> oi
.size
) {
5864 } else if (offset
+ length
> oi
.size
) {
5865 length
= oi
.size
- offset
;
5869 ctx
->pending_async_reads
.push_back(
5871 boost::make_tuple(offset
, length
, op
.flags
),
5874 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
5875 &op
.extent
.length
))));
5876 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
5878 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5879 new ReadFinisher(osd_op
));
5881 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
5882 map
<uint64_t, uint64_t> extents
;
5883 encode(extents
, osd_op
.outdata
);
5886 // read into a buffer
5887 map
<uint64_t, uint64_t> m
;
5888 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5890 op
.extent
.offset
, op
.extent
.length
, m
);
5896 r
= pgbackend
->objects_readv_sync(soid
, std::move(m
), op
.flags
, &data_bl
);
5898 r
= rep_repair_primary_object(soid
, ctx
);
5904 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5905 // Maybe at first, there is no much whole objects. With continued use, more
5906 // and more whole object exist. So from this point, for spare-read add
5907 // checksum make sense.
5908 if ((uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5909 uint32_t crc
= data_bl
.crc32c(-1);
5910 if (oi
.data_digest
!= crc
) {
5911 osd
->clog
->error() << info
.pgid
<< std::hex
5912 << " full-object read crc 0x" << crc
5913 << " != expected 0x" << oi
.data_digest
5914 << std::dec
<< " on " << soid
;
5915 r
= rep_repair_primary_object(soid
, ctx
);
5922 op
.extent
.length
= r
;
5924 encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5925 ::encode_destructively(data_bl
, osd_op
.outdata
);
5927 dout(10) << " sparse_read got " << r
<< " bytes from object "
5931 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5932 ctx
->delta_stats
.num_rd
++;
5936 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5939 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5940 ObjectState
& obs
= ctx
->new_obs
;
5941 object_info_t
& oi
= obs
.oi
;
5942 const hobject_t
& soid
= oi
.soid
;
5943 const bool skip_data_digest
= osd
->store
->has_builtin_csum() &&
5944 osd
->osd_skip_data_digest
;
5946 PGTransaction
* t
= ctx
->op_t
.get();
5948 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5952 span
= tracing::osd::tracer
.add_span(__func__
, ctx
->op
->osd_parent_span
);
5954 ctx
->current_osd_subop_num
= 0;
5955 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5957 ceph_osd_op
& op
= osd_op
.op
;
5959 OpFinisher
* op_finisher
= nullptr;
5961 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5962 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5963 op_finisher
= op_finisher_it
->second
.get();
5967 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5968 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5969 // but the code in this function seems to treat them as native-endian. What should the
5971 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5973 dout(10) << "do_osd_op " << osd_op
<< dendl
;
5975 auto bp
= osd_op
.indata
.cbegin();
5977 // user-visible modification?
5979 // non user-visible modifications
5980 case CEPH_OSD_OP_WATCH
:
5981 case CEPH_OSD_OP_CACHE_EVICT
:
5982 case CEPH_OSD_OP_CACHE_FLUSH
:
5983 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5984 case CEPH_OSD_OP_UNDIRTY
:
5985 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
5986 case CEPH_OSD_OP_COPY_FROM2
:
5987 case CEPH_OSD_OP_CACHE_PIN
:
5988 case CEPH_OSD_OP_CACHE_UNPIN
:
5989 case CEPH_OSD_OP_SET_REDIRECT
:
5990 case CEPH_OSD_OP_SET_CHUNK
:
5991 case CEPH_OSD_OP_TIER_PROMOTE
:
5992 case CEPH_OSD_OP_TIER_FLUSH
:
5993 case CEPH_OSD_OP_TIER_EVICT
:
5996 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
5997 ctx
->user_modify
= true;
6000 // munge -1 truncate to 0 truncate
6001 if (ceph_osd_op_uses_extent(op
.op
) &&
6002 op
.extent
.truncate_seq
== 1 &&
6003 op
.extent
.truncate_size
== (-1ULL)) {
6004 op
.extent
.truncate_size
= 0;
6005 op
.extent
.truncate_seq
= 0;
6008 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
6009 if (op
.op
== CEPH_OSD_OP_ZERO
&&
6011 op
.extent
.offset
< static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
6012 op
.extent
.length
>= 1 &&
6013 op
.extent
.length
<= static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
6014 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
6015 if (op
.extent
.offset
>= oi
.size
) {
6019 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
6020 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
6021 op
.op
= CEPH_OSD_OP_TRUNCATE
;
6028 case CEPH_OSD_OP_CMPEXT
:
6030 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
6031 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6032 op
.extent
.length
, op
.extent
.truncate_size
,
6033 op
.extent
.truncate_seq
);
6035 if (op_finisher
== nullptr) {
6036 result
= do_extent_cmp(ctx
, osd_op
);
6038 result
= op_finisher
->execute();
6042 case CEPH_OSD_OP_SYNC_READ
:
6043 if (pool
.info
.is_erasure()) {
6044 result
= -EOPNOTSUPP
;
6048 case CEPH_OSD_OP_READ
:
6050 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
6051 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6052 op
.extent
.length
, op
.extent
.truncate_size
,
6053 op
.extent
.truncate_seq
);
6054 if (op_finisher
== nullptr) {
6055 if (!ctx
->data_off
) {
6056 ctx
->data_off
= op
.extent
.offset
;
6058 result
= do_read(ctx
, osd_op
);
6060 result
= op_finisher
->execute();
6064 case CEPH_OSD_OP_CHECKSUM
:
6067 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
6068 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
6069 op
.checksum
.offset
, op
.checksum
.length
,
6070 op
.checksum
.chunk_size
);
6072 if (op_finisher
== nullptr) {
6073 result
= do_checksum(ctx
, osd_op
, &bp
);
6075 result
= op_finisher
->execute();
6081 case CEPH_OSD_OP_MAPEXT
:
6082 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6083 if (pool
.info
.is_erasure()) {
6084 result
= -EOPNOTSUPP
;
6089 // read into a buffer
6091 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
6093 op
.extent
.offset
, op
.extent
.length
, bl
);
6094 osd_op
.outdata
= std::move(bl
);
6098 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6099 ctx
->delta_stats
.num_rd
++;
6100 dout(10) << " map_extents done on object " << soid
<< dendl
;
6105 case CEPH_OSD_OP_SPARSE_READ
:
6106 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
6107 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6108 op
.extent
.length
, op
.extent
.truncate_size
,
6109 op
.extent
.truncate_seq
);
6110 if (op_finisher
== nullptr) {
6111 result
= do_sparse_read(ctx
, osd_op
);
6113 result
= op_finisher
->execute();
6117 case CEPH_OSD_OP_CALL
:
6119 string cname
, mname
;
6122 bp
.copy(op
.cls
.class_len
, cname
);
6123 bp
.copy(op
.cls
.method_len
, mname
);
6124 bp
.copy(op
.cls
.indata_len
, indata
);
6125 } catch (ceph::buffer::error
& e
) {
6126 dout(10) << "call unable to decode class + method + indata" << dendl
;
6127 dout(30) << "in dump: ";
6128 osd_op
.indata
.hexdump(*_dout
);
6131 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
6134 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
6136 ClassHandler::ClassData
*cls
;
6137 result
= ClassHandler::get_instance().open_class(cname
, &cls
);
6138 ceph_assert(result
== 0); // init_op_flags() already verified this works.
6140 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
);
6142 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
6143 result
= -EOPNOTSUPP
;
6147 int flags
= method
->get_flags();
6148 if (flags
& CLS_METHOD_WR
)
6149 ctx
->user_modify
= true;
6152 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
6153 int prev_rd
= ctx
->num_read
;
6154 int prev_wr
= ctx
->num_write
;
6155 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
6157 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
6158 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
6162 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
6163 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
6168 dout(10) << "method called response length=" << outdata
.length() << dendl
;
6169 op
.extent
.length
= outdata
.length();
6170 osd_op
.outdata
.claim_append(outdata
);
6171 dout(30) << "out dump: ";
6172 osd_op
.outdata
.hexdump(*_dout
);
6177 case CEPH_OSD_OP_STAT
:
6178 // note: stat does not require RD
6180 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6182 if (obs
.exists
&& !oi
.is_whiteout()) {
6183 encode(oi
.size
, osd_op
.outdata
);
6184 encode(oi
.mtime
, osd_op
.outdata
);
6185 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
6188 dout(10) << "stat oi object does not exist" << dendl
;
6191 ctx
->delta_stats
.num_rd
++;
6195 case CEPH_OSD_OP_ISDIRTY
:
6198 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6199 bool is_dirty
= obs
.oi
.is_dirty();
6200 encode(is_dirty
, osd_op
.outdata
);
6201 ctx
->delta_stats
.num_rd
++;
6206 case CEPH_OSD_OP_UNDIRTY
:
6210 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6211 if (oi
.is_dirty()) {
6212 ctx
->undirty
= true; // see make_writeable()
6214 ctx
->delta_stats
.num_wr
++;
6219 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6223 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6224 if (ctx
->lock_type
!= RWState::RWNONE
) {
6225 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
6229 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6237 if (oi
.is_cache_pinned()) {
6238 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
6242 if (oi
.is_dirty()) {
6243 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, std::nullopt
);
6244 if (result
== -EINPROGRESS
)
6252 case CEPH_OSD_OP_CACHE_FLUSH
:
6256 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6257 if (ctx
->lock_type
== RWState::RWNONE
) {
6258 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
6262 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6270 if (oi
.is_cache_pinned()) {
6271 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
6276 if (oi
.is_dirty()) {
6277 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, std::nullopt
);
6278 if (result
== -EINPROGRESS
)
6283 // Check special return value which has set missing_return
6284 if (result
== -ENOENT
) {
6285 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
6286 ceph_assert(!missing
.is_min());
6287 wait_for_unreadable_object(missing
, ctx
->op
);
6288 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6294 case CEPH_OSD_OP_CACHE_EVICT
:
6298 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6299 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6307 if (oi
.is_cache_pinned()) {
6308 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
6312 if (oi
.is_dirty()) {
6316 if (!oi
.watchers
.empty()) {
6320 if (soid
.snap
== CEPH_NOSNAP
) {
6321 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
6325 result
= _delete_oid(ctx
, true, false);
6327 // mark that this is a cache eviction to avoid triggering normal
6328 // make_writeable() clone creation in finish_ctx()
6329 ctx
->cache_operation
= true;
6331 osd
->logger
->inc(l_osd_tier_evict
);
6335 case CEPH_OSD_OP_GETXATTR
:
6339 bp
.copy(op
.xattr
.name_len
, aname
);
6340 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6341 string name
= "_" + aname
;
6342 int r
= getattr_maybe_cache(
6347 op
.xattr
.value_len
= osd_op
.outdata
.length();
6349 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
6353 ctx
->delta_stats
.num_rd
++;
6357 case CEPH_OSD_OP_GETXATTRS
:
6360 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6361 map
<string
, bufferlist
,less
<>> out
;
6362 result
= getattrs_maybe_cache(
6368 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6369 ctx
->delta_stats
.num_rd
++;
6370 osd_op
.outdata
.claim_append(bl
);
6374 case CEPH_OSD_OP_CMPXATTR
:
6378 bp
.copy(op
.xattr
.name_len
, aname
);
6379 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6380 string name
= "_" + aname
;
6381 name
[op
.xattr
.name_len
+ 1] = 0;
6384 result
= getattr_maybe_cache(
6388 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
6391 ctx
->delta_stats
.num_rd
++;
6392 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(xattr
.length(), 10);
6394 switch (op
.xattr
.cmp_mode
) {
6395 case CEPH_OSD_CMPXATTR_MODE_STRING
:
6398 bp
.copy(op
.xattr
.value_len
, val
);
6399 val
[op
.xattr
.value_len
] = 0;
6400 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
6401 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6402 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
6406 case CEPH_OSD_CMPXATTR_MODE_U64
:
6412 catch (ceph::buffer::error
& e
) {
6416 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
6417 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6418 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
6423 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
6428 dout(10) << "comparison returned false" << dendl
;
6429 result
= -ECANCELED
;
6433 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
6437 dout(10) << "comparison returned true" << dendl
;
6441 case CEPH_OSD_OP_ASSERT_VER
:
6444 uint64_t ver
= op
.assert_ver
.ver
;
6445 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
6448 else if (ver
< oi
.user_version
)
6450 else if (ver
> oi
.user_version
)
6451 result
= -EOVERFLOW
;
6455 case CEPH_OSD_OP_LIST_WATCHERS
:
6458 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6459 obj_list_watch_response_t resp
;
6461 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
6462 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
6464 dout(20) << "key cookie=" << oi_iter
->first
.first
6465 << " entity=" << oi_iter
->first
.second
<< " "
6466 << oi_iter
->second
<< dendl
;
6467 ceph_assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
6468 ceph_assert(oi_iter
->first
.second
.is_client());
6470 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
6471 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
6472 resp
.entries
.push_back(wi
);
6475 resp
.encode(osd_op
.outdata
, ctx
->get_features());
6478 ctx
->delta_stats
.num_rd
++;
6482 case CEPH_OSD_OP_LIST_SNAPS
:
6485 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6486 obj_list_snap_response_t resp
;
6489 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
6492 dout(20) << " snapset " << ssc
->snapset
<< dendl
;
6494 int clonecount
= ssc
->snapset
.clones
.size();
6495 clonecount
++; // for head
6496 resp
.clones
.reserve(clonecount
);
6497 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
6498 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
6500 ci
.cloneid
= *clone_iter
;
6502 hobject_t clone_oid
= soid
;
6503 clone_oid
.snap
= *clone_iter
;
6505 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
6506 if (p
== ssc
->snapset
.clone_snaps
.end()) {
6507 osd
->clog
->error() << "osd." << osd
->whoami
6508 << ": inconsistent clone_snaps found for oid "
6509 << soid
<< " clone " << *clone_iter
6510 << " snapset " << ssc
->snapset
;
6514 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
6515 ci
.snaps
.push_back(*q
);
6518 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
6520 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
6521 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
6522 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
6523 osd
->clog
->error() << "osd." << osd
->whoami
6524 << ": inconsistent clone_overlap found for oid "
6525 << soid
<< " clone " << *clone_iter
;
6529 const interval_set
<uint64_t> &o
= coi
->second
;
6530 ci
.overlap
.reserve(o
.num_intervals());
6531 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
6532 r
!= o
.end(); ++r
) {
6533 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
6537 map
<snapid_t
, uint64_t>::const_iterator si
;
6538 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
6539 if (si
== ssc
->snapset
.clone_size
.end()) {
6540 osd
->clog
->error() << "osd." << osd
->whoami
6541 << ": inconsistent clone_size found for oid "
6542 << soid
<< " clone " << *clone_iter
;
6546 ci
.size
= si
->second
;
6548 resp
.clones
.push_back(ci
);
6553 if (!ctx
->obc
->obs
.oi
.is_whiteout()) {
6554 ceph_assert(obs
.exists
);
6556 ci
.cloneid
= CEPH_NOSNAP
;
6558 //Size for HEAD is oi.size
6561 resp
.clones
.push_back(ci
);
6563 resp
.seq
= ssc
->snapset
.seq
;
6565 resp
.encode(osd_op
.outdata
);
6568 ctx
->delta_stats
.num_rd
++;
6572 case CEPH_OSD_OP_NOTIFY
:
6579 uint32_t ver
; // obsolete
6581 decode(timeout
, bp
);
6583 } catch (const ceph::buffer::error
&e
) {
6586 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
6588 timeout
= cct
->_conf
->osd_default_notify_timeout
;
6591 n
.timeout
= timeout
;
6592 n
.notify_id
= osd
->get_next_id(get_osdmap_epoch());
6593 n
.cookie
= op
.notify
.cookie
;
6595 ctx
->notifies
.push_back(n
);
6597 // return our unique notify id to the client
6598 encode(n
.notify_id
, osd_op
.outdata
);
6602 case CEPH_OSD_OP_NOTIFY_ACK
:
6606 uint64_t notify_id
= 0;
6607 uint64_t watch_cookie
= 0;
6608 decode(notify_id
, bp
);
6609 decode(watch_cookie
, bp
);
6610 bufferlist reply_bl
;
6612 decode(reply_bl
, bp
);
6614 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
6615 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
6616 ctx
->notify_acks
.push_back(ack
);
6617 } catch (const ceph::buffer::error
&e
) {
6618 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
6619 OpContext::NotifyAck
ack(
6620 // op.watch.cookie is actually the notify_id for historical reasons
6623 ctx
->notify_acks
.push_back(ack
);
6628 case CEPH_OSD_OP_SETALLOCHINT
:
6632 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
6633 maybe_create_new_object(ctx
);
6634 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
6635 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
6636 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
6637 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
6638 op
.alloc_hint
.expected_write_size
,
6639 op
.alloc_hint
.flags
);
6646 // -- object data --
6648 case CEPH_OSD_OP_WRITE
:
6652 __u32 seq
= oi
.truncate_seq
;
6653 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6654 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6659 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6660 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6662 if (pool
.info
.requires_aligned_append() &&
6663 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
6664 result
= -EOPNOTSUPP
;
6669 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
6670 result
= -EOPNOTSUPP
;
6673 } else if (op
.extent
.offset
!= oi
.size
&&
6674 pool
.info
.requires_aligned_append()) {
6675 result
= -EOPNOTSUPP
;
6679 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
6680 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
6681 // old write, arrived after trimtrunc
6682 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
6683 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
6684 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
6686 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
6687 osd_op
.indata
.swap(t
);
6689 if (op
.extent
.truncate_seq
> seq
) {
6690 // write arrives before trimtrunc
6691 if (obs
.exists
&& !oi
.is_whiteout()) {
6692 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6693 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
6694 t
->truncate(soid
, op
.extent
.truncate_size
);
6695 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6696 oi
.truncate_size
= op
.extent
.truncate_size
;
6697 if (oi
.size
> op
.extent
.truncate_size
) {
6698 interval_set
<uint64_t> trim
;
6699 trim
.insert(op
.extent
.truncate_size
,
6700 oi
.size
- op
.extent
.truncate_size
);
6701 ctx
->modified_ranges
.union_of(trim
);
6702 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.truncate_size
, oi
.size
- op
.extent
.truncate_size
);
6703 oi
.clear_data_digest();
6705 if (op
.extent
.truncate_size
!= oi
.size
) {
6706 truncate_update_size_and_usage(ctx
->delta_stats
,
6708 op
.extent
.truncate_size
);
6711 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6712 << ", but object is new" << dendl
;
6713 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6714 oi
.truncate_size
= op
.extent
.truncate_size
;
6717 result
= check_offset_and_length(
6718 op
.extent
.offset
, op
.extent
.length
,
6719 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6723 maybe_create_new_object(ctx
);
6725 if (op
.extent
.length
== 0) {
6726 if (op
.extent
.offset
> oi
.size
) {
6728 soid
, op
.extent
.offset
);
6729 truncate_update_size_and_usage(ctx
->delta_stats
, oi
,
6736 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6739 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
6740 && !skip_data_digest
) {
6741 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6742 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
6743 if (skip_data_digest
) {
6744 obs
.oi
.clear_data_digest();
6746 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
6749 obs
.oi
.clear_data_digest();
6751 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6752 op
.extent
.offset
, op
.extent
.length
);
6753 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6754 dout(10) << "clean_regions modified" << ctx
->clean_regions
<< dendl
;
6758 case CEPH_OSD_OP_WRITEFULL
:
6761 { // write full object
6762 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
6764 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6768 result
= check_offset_and_length(
6769 0, op
.extent
.length
,
6770 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6774 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6775 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6777 maybe_create_new_object(ctx
);
6778 if (pool
.info
.is_erasure()) {
6779 t
->truncate(soid
, 0);
6780 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
6781 t
->truncate(soid
, op
.extent
.length
);
6783 if (op
.extent
.length
) {
6784 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6786 if (!skip_data_digest
) {
6787 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6789 obs
.oi
.clear_data_digest();
6791 ctx
->clean_regions
.mark_data_region_dirty(0,
6792 std::max((uint64_t)op
.extent
.length
, oi
.size
));
6793 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6794 0, op
.extent
.length
, true);
6798 case CEPH_OSD_OP_WRITESAME
:
6800 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
6801 result
= do_writesame(ctx
, osd_op
);
6804 case CEPH_OSD_OP_ROLLBACK
:
6806 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6807 result
= _rollback_to(ctx
, osd_op
);
6810 case CEPH_OSD_OP_ZERO
:
6811 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6812 if (pool
.info
.requires_aligned_append()) {
6813 result
= -EOPNOTSUPP
;
6818 result
= check_offset_and_length(
6819 op
.extent
.offset
, op
.extent
.length
,
6820 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6824 if (op
.extent
.length
&& obs
.exists
&& !oi
.is_whiteout()) {
6825 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
6826 interval_set
<uint64_t> ch
;
6827 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
6828 ctx
->modified_ranges
.union_of(ch
);
6829 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6830 ctx
->delta_stats
.num_wr
++;
6831 oi
.clear_data_digest();
6837 case CEPH_OSD_OP_CREATE
:
6841 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6842 if (obs
.exists
&& !oi
.is_whiteout() &&
6843 (op
.flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
6844 result
= -EEXIST
; /* this is an exclusive create */
6846 if (osd_op
.indata
.length()) {
6847 auto p
= osd_op
.indata
.cbegin();
6850 decode(category
, p
);
6852 catch (ceph::buffer::error
& e
) {
6856 // category is no longer implemented.
6858 maybe_create_new_object(ctx
);
6864 case CEPH_OSD_OP_TRIMTRUNC
:
6865 op
.extent
.offset
= op
.extent
.truncate_size
;
6868 case CEPH_OSD_OP_TRUNCATE
:
6869 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6870 if (pool
.info
.requires_aligned_append()) {
6871 result
= -EOPNOTSUPP
;
6878 if (!obs
.exists
|| oi
.is_whiteout()) {
6879 dout(10) << " object dne, truncate is a no-op" << dendl
;
6883 result
= check_offset_and_length(
6884 op
.extent
.offset
, op
.extent
.length
,
6885 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6889 if (op
.extent
.truncate_seq
) {
6890 ceph_assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6891 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6892 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6893 << ", no-op" << dendl
;
6896 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6897 << ", truncating" << dendl
;
6898 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6899 oi
.truncate_size
= op
.extent
.truncate_size
;
6902 maybe_create_new_object(ctx
);
6903 t
->truncate(soid
, op
.extent
.offset
);
6904 if (oi
.size
> op
.extent
.offset
) {
6905 interval_set
<uint64_t> trim
;
6906 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6907 ctx
->modified_ranges
.union_of(trim
);
6908 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, oi
.size
- op
.extent
.offset
);
6909 } else if (oi
.size
< op
.extent
.offset
) {
6910 ctx
->clean_regions
.mark_data_region_dirty(oi
.size
, op
.extent
.offset
- oi
.size
);
6912 if (op
.extent
.offset
!= oi
.size
) {
6913 truncate_update_size_and_usage(ctx
->delta_stats
,
6917 ctx
->delta_stats
.num_wr
++;
6918 // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
6920 oi
.clear_data_digest();
6924 case CEPH_OSD_OP_DELETE
:
6927 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6929 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6933 case CEPH_OSD_OP_WATCH
:
6937 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6938 op
.watch
.cookie
, op
.watch
.op
);
6944 uint64_t cookie
= op
.watch
.cookie
;
6945 entity_name_t entity
= ctx
->reqid
.name
;
6946 ObjectContextRef obc
= ctx
->obc
;
6948 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6949 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6950 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6951 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6952 dout(10) << "watch: peer_addr="
6953 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6955 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6956 if (op
.watch
.timeout
!= 0) {
6957 timeout
= op
.watch
.timeout
;
6960 watch_info_t
w(cookie
, timeout
,
6961 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6962 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6963 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6964 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6965 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6967 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6968 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6969 t
->nop(soid
); // make sure update the object_info on disk!
6971 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6972 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6973 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
6974 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6978 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6979 ctx
->watch_connects
.push_back(make_pair(w
, true));
6980 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
6981 /* Note: WATCH with PING doesn't cause may_write() to return true,
6982 * so if there is nothing else in the transaction, this is going
6983 * to run do_osd_op_effects, but not write out a log entry */
6984 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6988 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
6989 obc
->watchers
.find(make_pair(cookie
, entity
));
6990 if (p
== obc
->watchers
.end() ||
6991 !p
->second
->is_connected()) {
6992 // client needs to reconnect
6993 result
= -ETIMEDOUT
;
6996 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6997 p
->second
->got_ping(ceph_clock_now());
6999 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
7000 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
7001 oi
.watchers
.find(make_pair(cookie
, entity
));
7002 if (oi_iter
!= oi
.watchers
.end()) {
7003 dout(10) << " removed watch " << oi_iter
->second
<< " by "
7005 oi
.watchers
.erase(oi_iter
);
7006 t
->nop(soid
); // update oi on disk
7007 ctx
->watch_disconnects
.push_back(
7008 watch_disconnect_t(cookie
, entity
, false));
7010 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
7016 case CEPH_OSD_OP_CACHE_PIN
:
7017 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7018 if ((!pool
.info
.is_tier() ||
7019 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
7021 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
7027 if (!obs
.exists
|| oi
.is_whiteout()) {
7032 if (!oi
.is_cache_pinned()) {
7033 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
7035 ctx
->delta_stats
.num_objects_pinned
++;
7036 ctx
->delta_stats
.num_wr
++;
7041 case CEPH_OSD_OP_CACHE_UNPIN
:
7042 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7043 if ((!pool
.info
.is_tier() ||
7044 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
7046 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
7052 if (!obs
.exists
|| oi
.is_whiteout()) {
7057 if (oi
.is_cache_pinned()) {
7058 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
7060 ctx
->delta_stats
.num_objects_pinned
--;
7061 ctx
->delta_stats
.num_wr
++;
7066 case CEPH_OSD_OP_SET_REDIRECT
:
7070 if (pool
.info
.is_tier()) {
7078 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7079 result
= -EOPNOTSUPP
;
7083 object_t target_name
;
7084 object_locator_t target_oloc
;
7085 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
7086 version_t target_version
= op
.copy_from
.src_version
;
7088 decode(target_name
, bp
);
7089 decode(target_oloc
, bp
);
7091 catch (ceph::buffer::error
& e
) {
7096 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
7097 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
7098 raw_pg
.ps(), raw_pg
.pool(),
7099 target_oloc
.nspace
);
7100 if (target
== soid
) {
7101 dout(20) << " set-redirect self is invalid" << dendl
;
7106 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
7107 bool has_reference
= (oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7108 if (has_reference
) {
7110 dout(5) << " the object is already a manifest " << dendl
;
7113 if (op_finisher
== nullptr && need_reference
) {
7115 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7116 new SetManifestFinisher(osd_op
));
7117 ManifestOpRef mop
= std::make_shared
<ManifestOp
>(new RefCountCallback(ctx
, osd_op
));
7118 auto* fin
= new C_SetManifestRefCountDone(this, soid
, 0);
7119 ceph_tid_t tid
= refcount_manifest(soid
, target
,
7120 refcount_t::INCREMENT_REF
, fin
, std::nullopt
);
7124 manifest_ops
[soid
] = mop
;
7125 ctx
->obc
->start_block();
7126 result
= -EINPROGRESS
;
7130 result
= op_finisher
->execute();
7131 ceph_assert(result
== 0);
7134 if (!oi
.has_manifest() && !oi
.manifest
.is_redirect())
7135 ctx
->delta_stats
.num_objects_manifest
++;
7137 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7138 oi
.manifest
.redirect_target
= target
;
7139 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
7140 t
->truncate(soid
, 0);
7141 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
7142 if (oi
.is_omap() && pool
.info
.supports_omap()) {
7143 t
->omap_clear(soid
);
7144 obs
.oi
.clear_omap_digest();
7145 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7146 ctx
->clean_regions
.mark_omap_dirty();
7148 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
7150 ctx
->delta_stats
.num_bytes
-= oi
.size
;
7153 oi
.user_version
= target_version
;
7154 ctx
->user_at_version
= target_version
;
7156 map
<string
,bufferlist
,less
<>> rmattrs
;
7157 result
= getattrs_maybe_cache(ctx
->obc
, &rmattrs
);
7159 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
7162 map
<string
, bufferlist
>::iterator iter
;
7163 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
7164 const string
& name
= iter
->first
;
7165 t
->rmattr(soid
, name
);
7167 if (!has_reference
&& need_reference
) {
7168 oi
.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7170 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
7172 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7179 case CEPH_OSD_OP_SET_CHUNK
:
7183 if (pool
.info
.is_tier()) {
7191 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7192 result
= -EOPNOTSUPP
;
7195 if (oi
.manifest
.is_redirect()) {
7200 object_locator_t tgt_oloc
;
7201 uint64_t src_offset
, src_length
, tgt_offset
;
7204 decode(src_offset
, bp
);
7205 decode(src_length
, bp
);
7206 decode(tgt_oloc
, bp
);
7207 decode(tgt_name
, bp
);
7208 decode(tgt_offset
, bp
);
7210 catch (ceph::buffer::error
& e
) {
7219 if (src_offset
+ src_length
> oi
.size
) {
7223 if (!(osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
)) {
7224 result
= -EOPNOTSUPP
;
7227 if (pool
.info
.is_erasure()) {
7228 result
= -EOPNOTSUPP
;
7232 for (auto &p
: oi
.manifest
.chunk_map
) {
7233 interval_set
<uint64_t> chunk
;
7234 chunk
.insert(p
.first
, p
.second
.length
);
7235 if (chunk
.intersects(src_offset
, src_length
)) {
7236 dout(20) << __func__
<< " overlapped !! offset: " << src_offset
<< " length: " << src_length
7237 << " chunk_info: " << p
<< dendl
;
7238 result
= -EOPNOTSUPP
;
7244 chunk_info_t chunk_info
;
7245 get_osdmap()->object_locator_to_pg(tgt_name
, tgt_oloc
, raw_pg
);
7246 hobject_t
target(tgt_name
, tgt_oloc
.key
, snapid_t(),
7247 raw_pg
.ps(), raw_pg
.pool(),
7249 bool has_reference
= (oi
.manifest
.chunk_map
.find(src_offset
) != oi
.manifest
.chunk_map
.end()) &&
7250 (oi
.manifest
.chunk_map
[src_offset
].test_flag(chunk_info_t::FLAG_HAS_REFERENCE
));
7251 if (has_reference
) {
7253 dout(5) << " the object is already a manifest " << dendl
;
7256 chunk_info
.oid
= target
;
7257 chunk_info
.offset
= tgt_offset
;
7258 chunk_info
.length
= src_length
;
7259 if (op_finisher
== nullptr) {
7261 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7262 new SetManifestFinisher(osd_op
));
7263 object_manifest_t set_chunk
;
7264 bool need_inc_ref
= false;
7265 set_chunk
.chunk_map
[src_offset
] = chunk_info
;
7266 need_inc_ref
= inc_refcount_by_set(ctx
, set_chunk
, osd_op
);
7268 result
= -EINPROGRESS
;
7273 result
= op_finisher
->execute();
7274 ceph_assert(result
== 0);
7277 oi
.manifest
.chunk_map
[src_offset
] = chunk_info
;
7278 if (!oi
.has_manifest() && !oi
.manifest
.is_chunked())
7279 ctx
->delta_stats
.num_objects_manifest
++;
7280 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7281 oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
7282 if (!has_reference
) {
7283 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_REFERENCE
);
7286 ctx
->cache_operation
= true;
7288 dout(10) << "set-chunked oid:" << oi
.soid
<< " user_version: " << oi
.user_version
7289 << " chunk_info: " << chunk_info
<< dendl
;
7291 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7297 case CEPH_OSD_OP_TIER_PROMOTE
:
7301 if (pool
.info
.is_tier()) {
7309 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7310 result
= -EOPNOTSUPP
;
7313 if (!obs
.oi
.has_manifest()) {
7318 if (op_finisher
== nullptr) {
7319 PromoteManifestCallback
*cb
;
7320 object_locator_t my_oloc
;
7323 if (obs
.oi
.manifest
.is_chunked()) {
7324 src_hoid
= obs
.oi
.soid
;
7325 } else if (obs
.oi
.manifest
.is_redirect()) {
7326 object_locator_t
src_oloc(obs
.oi
.manifest
.redirect_target
);
7328 src_hoid
= obs
.oi
.manifest
.redirect_target
;
7330 ceph_abort_msg("unrecognized manifest type");
7332 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7333 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7334 new PromoteFinisher(cb
));
7335 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
7336 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
7337 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
7338 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
7339 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
7340 start_copy(cb
, ctx
->obc
, src_hoid
, my_oloc
, 0, flags
,
7341 obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
7342 src_fadvise_flags
, 0);
7344 dout(10) << "tier-promote oid:" << oi
.soid
<< " manifest: " << obs
.oi
.manifest
<< dendl
;
7345 result
= -EINPROGRESS
;
7347 result
= op_finisher
->execute();
7348 ceph_assert(result
== 0);
7349 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7355 case CEPH_OSD_OP_TIER_FLUSH
:
7359 if (pool
.info
.is_tier()) {
7367 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7368 result
= -EOPNOTSUPP
;
7371 if (!obs
.oi
.has_manifest()) {
7376 if (oi
.is_dirty()) {
7377 result
= start_flush(ctx
->op
, ctx
->obc
, true, NULL
, std::nullopt
);
7378 if (result
== -EINPROGRESS
)
7387 case CEPH_OSD_OP_TIER_EVICT
:
7391 if (pool
.info
.is_tier()) {
7399 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7400 result
= -EOPNOTSUPP
;
7403 if (!obs
.oi
.has_manifest()) {
7408 // The chunks already has a reference, so it is just enough to invoke truncate if necessary
7409 for (auto &p
: obs
.oi
.manifest
.chunk_map
) {
7410 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
7412 t
->zero(soid
, p
.first
, p
.second
.length
);
7414 oi
.clear_data_digest();
7415 ctx
->delta_stats
.num_wr
++;
7416 ctx
->cache_operation
= true;
7417 osd
->logger
->inc(l_osd_tier_evict
);
7422 case CEPH_OSD_OP_UNSET_MANIFEST
:
7426 if (pool
.info
.is_tier()) {
7434 if (!oi
.has_manifest()) {
7435 result
= -EOPNOTSUPP
;
7438 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7439 result
= -EOPNOTSUPP
;
7443 dec_all_refcount_manifest(oi
, ctx
);
7445 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
7446 oi
.manifest
= object_manifest_t();
7447 ctx
->delta_stats
.num_objects_manifest
--;
7448 ctx
->delta_stats
.num_wr
++;
7454 // -- object attrs --
7456 case CEPH_OSD_OP_SETXATTR
:
7460 if (cct
->_conf
->osd_max_attr_size
> 0 &&
7461 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
7462 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7466 unsigned max_name_len
=
7467 std::min
<uint64_t>(osd
->store
->get_max_attr_name_length(),
7468 cct
->_conf
->osd_max_attr_name_len
);
7469 if (op
.xattr
.name_len
> max_name_len
) {
7470 result
= -ENAMETOOLONG
;
7473 maybe_create_new_object(ctx
);
7475 bp
.copy(op
.xattr
.name_len
, aname
);
7476 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7477 string name
= "_" + aname
;
7479 bp
.copy(op
.xattr
.value_len
, bl
);
7480 t
->setattr(soid
, name
, bl
);
7481 ctx
->delta_stats
.num_wr
++;
7485 case CEPH_OSD_OP_RMXATTR
:
7490 bp
.copy(op
.xattr
.name_len
, aname
);
7491 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7492 if (!obs
.exists
|| oi
.is_whiteout()) {
7496 string name
= "_" + aname
;
7497 t
->rmattr(soid
, name
);
7498 ctx
->delta_stats
.num_wr
++;
7503 // -- fancy writers --
7504 case CEPH_OSD_OP_APPEND
:
7506 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
7507 // just do it inline; this works because we are happy to execute
7508 // fancy op on replicas as well.
7509 vector
<OSDOp
> nops(1);
7510 OSDOp
& newop
= nops
[0];
7511 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
7512 newop
.op
.extent
.offset
= oi
.size
;
7513 newop
.op
.extent
.length
= op
.extent
.length
;
7514 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
7515 newop
.indata
= osd_op
.indata
;
7516 result
= do_osd_ops(ctx
, nops
);
7517 osd_op
.outdata
= std::move(newop
.outdata
);
7521 case CEPH_OSD_OP_STARTSYNC
:
7526 // -- trivial map --
7527 case CEPH_OSD_OP_TMAPGET
:
7528 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7529 if (pool
.info
.is_erasure()) {
7530 result
= -EOPNOTSUPP
;
7534 vector
<OSDOp
> nops(1);
7535 OSDOp
& newop
= nops
[0];
7536 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
7537 newop
.op
.extent
.offset
= 0;
7538 newop
.op
.extent
.length
= 0;
7539 result
= do_osd_ops(ctx
, nops
);
7540 osd_op
.outdata
= std::move(newop
.outdata
);
7544 case CEPH_OSD_OP_TMAPPUT
:
7545 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7546 if (pool
.info
.is_erasure()) {
7547 result
= -EOPNOTSUPP
;
7551 //_dout_lock.Lock();
7552 //osd_op.data.hexdump(*_dout);
7553 //_dout_lock.Unlock();
7555 // verify sort order
7556 bool unsorted
= false;
7566 dout(10) << "tmapput key " << key
<< dendl
;
7569 if (key
< last_key
) {
7570 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
7579 vector
<OSDOp
> nops(1);
7580 OSDOp
& newop
= nops
[0];
7581 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
7582 newop
.op
.extent
.offset
= 0;
7583 newop
.op
.extent
.length
= osd_op
.indata
.length();
7584 newop
.indata
= osd_op
.indata
;
7587 bp
= osd_op
.indata
.begin();
7589 map
<string
, bufferlist
> m
;
7592 ceph_assert(bp
.end());
7594 encode(header
, newbl
);
7596 newop
.indata
= newbl
;
7598 result
= do_osd_ops(ctx
, nops
);
7599 ceph_assert(result
== 0);
7603 case CEPH_OSD_OP_TMAPUP
:
7604 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7605 if (pool
.info
.is_erasure()) {
7606 result
= -EOPNOTSUPP
;
7610 result
= do_tmapup(ctx
, bp
, osd_op
);
7613 case CEPH_OSD_OP_TMAP2OMAP
:
7615 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7616 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
7620 case CEPH_OSD_OP_OMAPGETKEYS
:
7624 uint64_t max_return
;
7626 decode(start_after
, bp
);
7627 decode(max_return
, bp
);
7629 catch (ceph::buffer::error
& e
) {
7631 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
7634 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7635 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7637 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
7641 bool truncated
= false;
7643 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7644 ch
, ghobject_t(soid
)
7647 iter
->upper_bound(start_after
);
7648 for (num
= 0; iter
->valid(); ++num
, iter
->next()) {
7649 if (num
>= max_return
||
7650 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7654 encode(iter
->key(), bl
);
7656 } // else return empty out_set
7657 encode(num
, osd_op
.outdata
);
7658 osd_op
.outdata
.claim_append(bl
);
7659 encode(truncated
, osd_op
.outdata
);
7660 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7661 ctx
->delta_stats
.num_rd
++;
7665 case CEPH_OSD_OP_OMAPGETVALS
:
7669 uint64_t max_return
;
7670 string filter_prefix
;
7672 decode(start_after
, bp
);
7673 decode(max_return
, bp
);
7674 decode(filter_prefix
, bp
);
7676 catch (ceph::buffer::error
& e
) {
7678 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
7681 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7682 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7684 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
7687 bool truncated
= false;
7690 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7691 ch
, ghobject_t(soid
)
7697 iter
->upper_bound(start_after
);
7698 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
7701 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
7702 ++num
, iter
->next()) {
7703 dout(20) << "Found key " << iter
->key() << dendl
;
7704 if (num
>= max_return
||
7705 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7709 encode(iter
->key(), bl
);
7710 encode(iter
->value(), bl
);
7712 } // else return empty out_set
7713 encode(num
, osd_op
.outdata
);
7714 osd_op
.outdata
.claim_append(bl
);
7715 encode(truncated
, osd_op
.outdata
);
7716 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7717 ctx
->delta_stats
.num_rd
++;
7721 case CEPH_OSD_OP_OMAPGETHEADER
:
7722 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7723 if (!oi
.is_omap()) {
7724 // return empty header
7729 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
7730 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7731 ctx
->delta_stats
.num_rd
++;
7735 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
7738 set
<string
> keys_to_get
;
7740 decode(keys_to_get
, bp
);
7742 catch (ceph::buffer::error
& e
) {
7744 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7747 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
7748 map
<string
, bufferlist
> out
;
7750 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
7751 } // else return empty omap entries
7752 encode(out
, osd_op
.outdata
);
7753 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7754 ctx
->delta_stats
.num_rd
++;
7758 case CEPH_OSD_OP_OMAP_CMP
:
7761 if (!obs
.exists
|| oi
.is_whiteout()) {
7763 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7766 map
<string
, pair
<bufferlist
, int> > assertions
;
7768 decode(assertions
, bp
);
7770 catch (ceph::buffer::error
& e
) {
7772 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7775 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
7777 map
<string
, bufferlist
> out
;
7781 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7782 i
!= assertions
.end();
7784 to_get
.insert(i
->first
);
7785 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
7791 } // else leave out empty
7793 //Should set num_rd_kb based on encode length of map
7794 ctx
->delta_stats
.num_rd
++;
7798 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7799 i
!= assertions
.end();
7801 auto out_entry
= out
.find(i
->first
);
7802 bufferlist
&bl
= (out_entry
!= out
.end()) ?
7803 out_entry
->second
: empty
;
7804 switch (i
->second
.second
) {
7805 case CEPH_OSD_CMPXATTR_OP_EQ
:
7806 if (!(bl
== i
->second
.first
)) {
7810 case CEPH_OSD_CMPXATTR_OP_LT
:
7811 if (!(bl
< i
->second
.first
)) {
7815 case CEPH_OSD_CMPXATTR_OP_GT
:
7816 if (!(bl
> i
->second
.first
)) {
7834 case CEPH_OSD_OP_OMAPSETVALS
:
7835 if (!pool
.info
.supports_omap()) {
7836 result
= -EOPNOTSUPP
;
7837 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7843 maybe_create_new_object(ctx
);
7844 bufferlist to_set_bl
;
7846 decode_str_str_map_to_bl(bp
, &to_set_bl
);
7848 catch (ceph::buffer::error
& e
) {
7850 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7853 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7854 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 20>()) {
7855 dout(20) << "setting vals: " << dendl
;
7856 map
<string
,bufferlist
> to_set
;
7857 bufferlist::const_iterator pt
= to_set_bl
.begin();
7859 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
7862 dout(20) << "\t" << i
->first
<< dendl
;
7865 t
->omap_setkeys(soid
, to_set_bl
);
7866 ctx
->clean_regions
.mark_omap_dirty();
7867 ctx
->delta_stats
.num_wr
++;
7868 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(to_set_bl
.length(), 10);
7870 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7871 obs
.oi
.clear_omap_digest();
7874 case CEPH_OSD_OP_OMAPSETHEADER
:
7875 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7876 if (!pool
.info
.supports_omap()) {
7877 result
= -EOPNOTSUPP
;
7883 maybe_create_new_object(ctx
);
7884 t
->omap_setheader(soid
, osd_op
.indata
);
7885 ctx
->clean_regions
.mark_omap_dirty();
7886 ctx
->delta_stats
.num_wr
++;
7888 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7889 obs
.oi
.clear_omap_digest();
7892 case CEPH_OSD_OP_OMAPCLEAR
:
7893 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7894 if (!pool
.info
.supports_omap()) {
7895 result
= -EOPNOTSUPP
;
7901 if (!obs
.exists
|| oi
.is_whiteout()) {
7906 t
->omap_clear(soid
);
7907 ctx
->clean_regions
.mark_omap_dirty();
7908 ctx
->delta_stats
.num_wr
++;
7909 obs
.oi
.clear_omap_digest();
7910 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7915 case CEPH_OSD_OP_OMAPRMKEYS
:
7916 if (!pool
.info
.supports_omap()) {
7917 result
= -EOPNOTSUPP
;
7918 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7924 if (!obs
.exists
|| oi
.is_whiteout()) {
7926 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7929 bufferlist to_rm_bl
;
7931 decode_str_set_to_bl(bp
, &to_rm_bl
);
7933 catch (ceph::buffer::error
& e
) {
7935 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7938 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7939 t
->omap_rmkeys(soid
, to_rm_bl
);
7940 ctx
->clean_regions
.mark_omap_dirty();
7941 ctx
->delta_stats
.num_wr
++;
7943 obs
.oi
.clear_omap_digest();
7946 case CEPH_OSD_OP_OMAPRMKEYRANGE
:
7947 tracepoint(osd
, do_osd_op_pre_omaprmkeyrange
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7948 if (!pool
.info
.supports_omap()) {
7949 result
= -EOPNOTSUPP
;
7955 if (!obs
.exists
|| oi
.is_whiteout()) {
7959 std::string key_begin
, key_end
;
7961 decode(key_begin
, bp
);
7962 decode(key_end
, bp
);
7963 } catch (ceph::buffer::error
& e
) {
7967 t
->omap_rmkeyrange(soid
, key_begin
, key_end
);
7968 ctx
->delta_stats
.num_wr
++;
7970 obs
.oi
.clear_omap_digest();
7973 case CEPH_OSD_OP_COPY_GET
:
7975 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
7977 if (op_finisher
== nullptr) {
7978 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
7980 result
= op_finisher
->execute();
7984 case CEPH_OSD_OP_COPY_FROM
:
7985 case CEPH_OSD_OP_COPY_FROM2
:
7990 object_locator_t src_oloc
;
7991 uint32_t truncate_seq
= 0;
7992 uint64_t truncate_size
= 0;
7993 bool have_truncate
= false;
7994 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
7995 version_t src_version
= op
.copy_from
.src_version
;
7997 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
7998 (op
.copy_from
.flags
& ~CEPH_OSD_COPY_FROM_FLAGS
)) {
7999 dout(20) << "invalid copy-from2 flags 0x"
8000 << std::hex
<< (int)op
.copy_from
.flags
<< std::dec
<< dendl
;
8005 decode(src_name
, bp
);
8006 decode(src_oloc
, bp
);
8007 // check if client sent us truncate_seq and truncate_size
8008 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
8009 (op
.copy_from
.flags
& CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ
)) {
8010 decode(truncate_seq
, bp
);
8011 decode(truncate_size
, bp
);
8012 have_truncate
= true;
8015 catch (ceph::buffer::error
& e
) {
8018 do_osd_op_pre_copy_from
,
8019 soid
.oid
.name
.c_str(),
8031 do_osd_op_pre_copy_from
,
8032 soid
.oid
.name
.c_str(),
8034 src_name
.name
.c_str(),
8036 src_oloc
.key
.c_str(),
8037 src_oloc
.nspace
.c_str(),
8041 if (op_finisher
== nullptr) {
8044 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
8045 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
8046 raw_pg
.ps(), raw_pg
.pool(),
8049 dout(20) << " copy from self is invalid" << dendl
;
8053 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
8055 cb
->set_truncate(truncate_seq
, truncate_size
);
8056 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8057 new CopyFromFinisher(cb
));
8058 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
8061 op
.copy_from
.src_fadvise_flags
,
8063 result
= -EINPROGRESS
;
8066 result
= op_finisher
->execute();
8067 ceph_assert(result
== 0);
8069 // COPY_FROM cannot be executed multiple times -- it must restart
8070 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
8076 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
8077 dout(1) << "unrecognized osd op " << op
.op
8078 << " " << ceph_osd_op_name(op
.op
)
8080 result
= -EOPNOTSUPP
;
8084 osd_op
.rval
= result
;
8085 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
8086 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
) &&
8087 result
!= -EAGAIN
&& result
!= -EINPROGRESS
)
8094 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
8099 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
8101 if (ctx
->new_obs
.oi
.size
== 0) {
8102 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
8105 vector
<OSDOp
> nops(1);
8106 OSDOp
&newop
= nops
[0];
8107 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
8108 do_osd_ops(ctx
, nops
);
8110 bufferlist::const_iterator i
= newop
.outdata
.begin();
8112 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
8114 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
8118 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
8123 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
8126 // verify that all clones have been evicted
8127 dout(20) << __func__
<< " verifying clones are absent "
8129 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
8130 p
!= ss
.clones
.end();
8132 hobject_t clone_oid
= soid
;
8133 clone_oid
.snap
= *p
;
8134 if (is_missing_object(clone_oid
))
8136 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
8137 if (clone_obc
&& clone_obc
->obs
.exists
) {
8138 dout(10) << __func__
<< " cannot evict head before clone "
8139 << clone_oid
<< dendl
;
8142 if (copy_ops
.count(clone_oid
)) {
8143 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
8144 << clone_oid
<< dendl
;
8151 inline int PrimaryLogPG::_delete_oid(
8153 bool no_whiteout
, // no whiteouts, no matter what.
8154 bool try_no_whiteout
) // try not to whiteout
8156 SnapSet
& snapset
= ctx
->new_snapset
;
8157 ObjectState
& obs
= ctx
->new_obs
;
8158 object_info_t
& oi
= obs
.oi
;
8159 const hobject_t
& soid
= oi
.soid
;
8160 PGTransaction
* t
= ctx
->op_t
.get();
8162 // cache: cache: set whiteout on delete?
8163 bool whiteout
= false;
8164 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
8166 && !try_no_whiteout
) {
8170 // in luminous or later, we can't delete the head if there are
8171 // clones. we trust the caller passing no_whiteout has already
8172 // verified they don't exist.
8173 if (!snapset
.clones
.empty() ||
8174 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
8176 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
8179 dout(20) << __func__
<< " has or will have clones; will whiteout"
8184 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
8185 << " no_whiteout=" << (int)no_whiteout
8186 << " try_no_whiteout=" << (int)try_no_whiteout
8188 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
8194 interval_set
<uint64_t> ch
;
8195 ch
.insert(0, oi
.size
);
8196 ctx
->modified_ranges
.union_of(ch
);
8197 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
8200 ctx
->clean_regions
.mark_omap_dirty();
8201 ctx
->delta_stats
.num_wr
++;
8202 if (soid
.is_snap()) {
8203 ceph_assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
8204 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
8206 ctx
->delta_stats
.num_bytes
-= oi
.size
;
8211 // disconnect all watchers
8212 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
8213 oi
.watchers
.begin();
8214 p
!= oi
.watchers
.end();
8216 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
8217 ctx
->watch_disconnects
.push_back(
8218 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
8220 oi
.watchers
.clear();
8223 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
8224 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
8225 ctx
->delta_stats
.num_whiteouts
++;
8227 osd
->logger
->inc(l_osd_tier_whiteout
);
8231 if (oi
.has_manifest()) {
8232 ctx
->delta_stats
.num_objects_manifest
--;
8233 dec_all_refcount_manifest(oi
, ctx
);
8237 ctx
->delta_stats
.num_objects
--;
8239 ctx
->delta_stats
.num_object_clones
--;
8240 if (oi
.is_whiteout()) {
8241 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
8242 ctx
->delta_stats
.num_whiteouts
--;
8243 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
8245 if (oi
.is_cache_pinned()) {
8246 ctx
->delta_stats
.num_objects_pinned
--;
8252 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, OSDOp
& op
)
8254 ObjectState
& obs
= ctx
->new_obs
;
8255 object_info_t
& oi
= obs
.oi
;
8256 const hobject_t
& soid
= oi
.soid
;
8257 snapid_t snapid
= (uint64_t)op
.op
.snap
.snapid
;
8258 hobject_t missing_oid
;
8260 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
8262 ObjectContextRef rollback_to
;
8264 int ret
= find_object_context(
8265 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
8266 soid
.get_namespace()),
8267 &rollback_to
, false, false, &missing_oid
);
8268 if (ret
== -EAGAIN
) {
8269 /* clone must be missing */
8270 ceph_assert(is_degraded_or_backfilling_object(missing_oid
) || is_degraded_on_async_recovery_target(missing_oid
));
8271 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8272 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
8273 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
8277 ObjectContextRef promote_obc
;
8278 cache_result_t tier_mode_result
;
8279 if (obs
.exists
&& obs
.oi
.has_manifest()) {
8281 * In the case of manifest object, the object_info exists on the base tier at all time,
8282 * so promote_obc should be equal to rollback_to
8284 promote_obc
= rollback_to
;
8286 maybe_handle_manifest_detail(
8292 maybe_handle_cache_detail(
8302 switch (tier_mode_result
) {
8303 case cache_result_t::NOOP
:
8305 case cache_result_t::BLOCKED_PROMOTE
:
8306 ceph_assert(promote_obc
);
8307 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
8309 case cache_result_t::BLOCKED_FULL
:
8310 block_write_on_full_cache(soid
, ctx
->op
);
8312 case cache_result_t::REPLIED_WITH_EAGAIN
:
8313 ceph_abort_msg("this can't happen, no rollback on replica");
8315 ceph_abort_msg("must promote was set, other values are not valid");
8320 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
8321 // there's no snapshot here, or there's no object.
8322 // if there's no snapshot, we delete the object; otherwise, do nothing.
8323 dout(20) << "_rollback_to deleting head on " << soid
.oid
8324 << " because got ENOENT|whiteout on find_object_context" << dendl
;
8325 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
8326 // Cannot delete an object with watchers
8329 _delete_oid(ctx
, false, false);
8333 // ummm....huh? It *can't* return anything else at time of writing.
8334 ceph_abort_msg("unexpected error code in _rollback_to");
8335 } else { //we got our context, let's use it to do the rollback!
8336 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
8337 if (is_degraded_or_backfilling_object(rollback_to_sobject
) ||
8338 is_degraded_on_async_recovery_target(rollback_to_sobject
)) {
8339 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8340 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
8341 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
8343 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
8344 // rolling back to the head; we just need to clone it.
8347 if (rollback_to
->obs
.oi
.has_manifest() && rollback_to
->obs
.oi
.manifest
.is_chunked()) {
8349 * looking at the following case, the foo head needs the reference of chunk4 and chunk5
8350 * in case snap[1] is removed.
8352 * Before rollback to snap[1]:
8354 * foo snap[1]: [chunk4] [chunk5]
8355 * foo snap[0]: [ chunk2 ]
8356 * foo head : [chunk1] [chunk3]
8360 * foo snap[1]: [chunk4] [chunk5]
8361 * foo snap[0]: [ chunk2 ]
8362 * foo head : [chunk4] [chunk5]
8365 OpFinisher
* op_finisher
= nullptr;
8366 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
8367 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
8368 op_finisher
= op_finisher_it
->second
.get();
8371 bool need_inc_ref
= inc_refcount_by_set(ctx
, rollback_to
->obs
.oi
.manifest
, op
);
8373 ceph_assert(op_finisher_it
== ctx
->op_finishers
.end());
8374 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8375 new SetManifestFinisher(op
));
8376 return -EINPROGRESS
;
8379 op_finisher
->execute();
8380 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
8383 _do_rollback_to(ctx
, rollback_to
, op
);
// Roll the head object back to an older clone.
//
// Strategy (as documented inline below): delete the current head, clone the
// chosen snapshot's object into the head position, then recompute
// clone_overlaps by walking overlaps forward from the rollback snapshot.
//
// @param ctx          op context whose new_obs/new_snapset are mutated in place
// @param rollback_to  object context of the clone we are rolling back to
// @param op           the OSD sub-op carrying the target snapid
//
// NOTE(review): recovered from a line-mangled extraction; dropped structural
// lines (braces, the `op` parameter line, loop headers) were reconstructed to
// match the visible statements — verify against project history before merging.
void PrimaryLogPG::_do_rollback_to(OpContext *ctx, ObjectContextRef rollback_to,
				   OSDOp& op)
{
  SnapSet& snapset = ctx->new_snapset;
  ObjectState& obs = ctx->new_obs;
  object_info_t& oi = obs.oi;
  const hobject_t& soid = oi.soid;
  PGTransaction* t = ctx->op_t.get();
  snapid_t snapid = (uint64_t)op.op.snap.snapid;
  hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;

  /* 1) Delete current head
   * 2) Clone correct snapshot into head
   * 3) Calculate clone_overlaps by following overlaps
   *    forward from rollback snapshot */
  dout(10) << "_do_rollback_to deleting " << soid.oid
	   << " and rolling back to old snap" << dendl;

  // If the head carries a manifest, drop all chunk/redirect references before
  // the rollback wipes it; the restored head gets its own manifest below.
  if (obs.oi.has_manifest()) {
    dec_all_refcount_manifest(obs.oi, ctx);
    oi.manifest.clear();
    oi.manifest.type = object_manifest_t::TYPE_NONE;
    oi.clear_flag(object_info_t::FLAG_MANIFEST);
    ctx->delta_stats.num_objects_manifest--;
    ctx->cache_operation = true; // do not trigger to call ref function to calculate refcount
  }

  // Clone the rollback target over the head in the transaction, and keep the
  // source obc alive for the duration of the transaction.
  t->clone(soid, rollback_to_sobject);
  t->add_obc(rollback_to);

  // Intersect the overlaps of every clone at or newer than the rollback
  // snapid: the result is the byte ranges the restored head still shares.
  map<snapid_t, interval_set<uint64_t> >::iterator iter =
    snapset.clone_overlap.lower_bound(snapid);
  ceph_assert(iter != snapset.clone_overlap.end());
  interval_set<uint64_t> overlaps = iter->second;
  // NOTE(review): the for-loop header here was lost in extraction; upstream
  // advances iter past the first entry and intersects the rest — confirm.
  for (iter++;
       iter != snapset.clone_overlap.end();
       ++iter)
    overlaps.intersection_of(iter->second);

  // Everything in the old head that is NOT shared with newer clones counts as
  // modified by this rollback.
  if (obs.oi.size > 0) {
    interval_set<uint64_t> modified;
    modified.insert(0, obs.oi.size);
    overlaps.intersection_of(modified);
    modified.subtract(overlaps);
    ctx->modified_ranges.union_of(modified);
  }

  // Adjust the cached objectcontext
  maybe_create_new_object(ctx, true);
  ctx->delta_stats.num_bytes -= obs.oi.size;
  ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
  // Mark the union of old/new extents dirty so replication re-ships them.
  ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
  ctx->clean_regions.mark_omap_dirty();
  obs.oi.size = rollback_to->obs.oi.size;

  // Digests are only valid if the rollback source had them.
  if (rollback_to->obs.oi.is_data_digest())
    obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
  else
    obs.oi.clear_data_digest();
  if (rollback_to->obs.oi.is_omap_digest())
    obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
  else
    obs.oi.clear_omap_digest();

  // Restore a chunked manifest from the rollback source, if it had one.
  if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
    obs.oi.set_flag(object_info_t::FLAG_MANIFEST);
    obs.oi.manifest.type = rollback_to->obs.oi.manifest.type;
    obs.oi.manifest.chunk_map = rollback_to->obs.oi.manifest.chunk_map;
    ctx->cache_operation = true;
    ctx->delta_stats.num_objects_manifest++;
  }

  // Propagate the omap flag from the rollback source.
  if (rollback_to->obs.oi.is_omap()) {
    dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
    obs.oi.set_flag(object_info_t::FLAG_OMAP);
  } else {
    dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_OMAP);
  }
}
// Record a clone of `head` at `coid` in the transaction and persist the
// clone's object_info (OI_ATTR); any stale snapset attr on the clone is
// removed (clones do not carry SS_ATTR — the head does).
//
// NOTE(review): the parameter lines for `ctx`, `t` and `poi` and the local
// bufferlist declaration were lost in extraction and reconstructed here —
// verify against project history before merging.
void PrimaryLogPG::_make_clone(
  OpContext *ctx,
  PGTransaction* t,
  ObjectContextRef obc,
  const hobject_t& head, const hobject_t& coid,
  object_info_t *poi)
{
  bufferlist bv;
  // Encode with the cluster's negotiated OSD feature set so older peers can
  // still decode the attr.
  encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));

  t->clone(coid, head);
  setattr_maybe_cache(obc, t, OI_ATTR, bv);
  rmattr_maybe_cache(obc, t, SS_ATTR);
}
// Prepare a head object for mutation: maintain dirty/omap accounting, create
// a clone of the pre-write head if the snap context requires one, and bring
// the snapset (clones, clone_size, clone_snaps, clone_overlap, seq) up to
// date.  Must only be called on head objects.
//
// NOTE(review): recovered from a line-mangled extraction; dropped structural
// lines (braces, `else` branches, loop headers, the `is_primary()` guard and
// lock-manager argument lines) were reconstructed from the visible statements
// — verify against project history before merging.
void PrimaryLogPG::make_writeable(OpContext *ctx)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  SnapContext& snapc = ctx->snapc;

  // clone?
  ceph_assert(soid.snap == CEPH_NOSNAP);
  dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
	   << "  snapc=" << snapc << dendl;

  // --- dirty-object accounting (cache tiering) ---
  bool was_dirty = ctx->obc->obs.oi.is_dirty();
  if (ctx->new_obs.exists) {
    // we will mark the object dirty
    if (ctx->undirty && was_dirty) {
      dout(20) << " clearing DIRTY flag" << dendl;
      ceph_assert(ctx->new_obs.oi.is_dirty());
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
      --ctx->delta_stats.num_objects_dirty;
      osd->logger->inc(l_osd_tier_clean);
    } else if (!was_dirty && !ctx->undirty) {
      dout(20) << " setting DIRTY flag" << dendl;
      ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
      ++ctx->delta_stats.num_objects_dirty;
      osd->logger->inc(l_osd_tier_dirty);
    }
  } else {
    // object is being deleted
    if (was_dirty) {
      dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
      --ctx->delta_stats.num_objects_dirty;
    }
  }

  // --- omap-object accounting: count transitions in either direction ---
  if ((ctx->new_obs.exists &&
       ctx->new_obs.oi.is_omap()) &&
      (!ctx->obc->obs.exists ||
       !ctx->obc->obs.oi.is_omap())) {
    ++ctx->delta_stats.num_objects_omap;
  }
  if ((!ctx->new_obs.exists ||
       !ctx->new_obs.oi.is_omap()) &&
      (ctx->obc->obs.exists &&
       ctx->obc->obs.oi.is_omap())) {
    --ctx->delta_stats.num_objects_omap;
  }

  // use newer snapc?
  if (ctx->new_snapset.seq > snapc.seq) {
    dout(10) << " op snapset is old" << dendl;
  }

  if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
      snapc.snaps.size() &&                 // there are snaps
      !ctx->cache_operation &&
      snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
    // clone object, the snap field is set to the seq of the SnapContext
    // at its creation.
    hobject_t coid = soid;
    coid.snap = snapc.seq;

    // Collect the leading snaps newer than the snapset's seq — these are the
    // snaps the new clone belongs to.
    // NOTE(review): the loop header line was lost in extraction; upstream
    // counts the prefix into `l` — confirm against project history.
    unsigned l;
    for (l = 1;
	 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
	 l++) ;

    vector<snapid_t> snaps(l);
    for (unsigned i=0; i<l; i++)
      snaps[i] = snapc.snaps[i];

    // prepare clone object_info; on the primary we also materialize an obc.
    object_info_t static_snap_oi(coid);
    object_info_t *snap_oi;
    if (is_primary()) {
      ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
      ctx->clone_obc->destructor_callback =
	new C_PG_ObjectContext(this, ctx->clone_obc.get());
      ctx->clone_obc->obs.oi = static_snap_oi;
      ctx->clone_obc->obs.exists = true;
      // clone shares the head's snapset context
      ctx->clone_obc->ssc = ctx->obc->ssc;
      ctx->clone_obc->ssc->ref++;
      if (pool.info.is_erasure())
	ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
      snap_oi = &ctx->clone_obc->obs.oi;
      // Carry the head's manifest (redirect or chunked) onto the clone so its
      // references stay accounted.
      if (ctx->obc->obs.oi.has_manifest()) {
	if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
	    ctx->obc->obs.oi.manifest.is_redirect()) {
	  snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
	  snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
	  snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
	} else if (ctx->obc->obs.oi.manifest.is_chunked()) {
	  snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
	  snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
	  snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
	} else {
	  ceph_abort_msg("unrecognized manifest type");
	}
      }
      // The clone's write lock is taken greedily on behalf of this op.
      // NOTE(review): the argument lines of get_write_greedy and the
      // assert on `got` were lost in extraction — reconstructed.
      bool got = ctx->lock_manager.get_write_greedy(
	coid,
	ctx->clone_obc,
	ctx->op);
      ceph_assert(got);
      dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
    } else {
      snap_oi = &static_snap_oi;
    }
    snap_oi->version = ctx->at_version;
    snap_oi->prior_version = ctx->obs->oi.version;
    snap_oi->copy_user_bits(ctx->obs->oi);

    _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);

    // --- stats for the new clone ---
    ctx->delta_stats.num_objects++;
    if (snap_oi->is_dirty()) {
      ctx->delta_stats.num_objects_dirty++;
      osd->logger->inc(l_osd_tier_dirty);
    }
    if (snap_oi->is_omap())
      ctx->delta_stats.num_objects_omap++;
    if (snap_oi->is_cache_pinned())
      ctx->delta_stats.num_objects_pinned++;
    if (snap_oi->has_manifest())
      ctx->delta_stats.num_objects_manifest++;
    ctx->delta_stats.num_object_clones++;
    ctx->new_snapset.clones.push_back(coid.snap);
    ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
    ctx->new_snapset.clone_snaps[coid.snap] = snaps;

    // clone_overlap should contain an entry for each clone
    // (an empty interval_set if there is no overlap)
    ctx->new_snapset.clone_overlap[coid.snap];
    if (ctx->obs->oi.size)
      ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);

    // log clone
    dout(10) << " cloning v " << ctx->obs->oi.version
	     << " to " << coid << " v " << ctx->at_version
	     << " snaps=" << snaps
	     << " snapset=" << ctx->new_snapset << dendl;
    ctx->log.push_back(pg_log_entry_t(
			 pg_log_entry_t::CLONE, coid, ctx->at_version,
			 ctx->obs->oi.version,
			 ctx->obs->oi.user_version,
			 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
    encode(snaps, ctx->log.back().snaps);

    ctx->at_version.version++;
  }

  // update most recent clone_overlap and usage stats
  if (ctx->new_snapset.clones.size() > 0) {
    // the clone_overlap is difference of range between head and clones.
    // we need to check whether the most recent clone exists, if it's
    // been evicted, it's not included in the stats, but the clone_overlap
    // is still exist in the snapset, so we should update the
    // clone_overlap to make it sense.
    hobject_t last_clone_oid = soid;
    last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
    interval_set<uint64_t> &newest_overlap =
      ctx->new_snapset.clone_overlap.rbegin()->second;
    ctx->modified_ranges.intersection_of(newest_overlap);
    if (is_present_clone(last_clone_oid)) {
      // modified_ranges is still in use by the clone
      ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
    }
    newest_overlap.subtract(ctx->modified_ranges);
  }

  if (snapc.seq > ctx->new_snapset.seq) {
    // update snapset with latest snap context
    ctx->new_snapset.seq = snapc.seq;
    if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
      // pre-octopus peers still expect the legacy per-head snaps vector
      ctx->new_snapset.snaps = snapc.snaps;
    } else {
      ctx->new_snapset.snaps.clear();
    }
  }
  dout(20) << "make_writeable " << soid
	   << " done, snapset=" << ctx->new_snapset << dendl;
}
// Account for a write of [offset, offset+length): extend `modified` with the
// written range, grow oi.size / num_bytes if the write extends the object,
// and bump write op counters.
//
// NOTE(review): the `write_full` branch headers and the `oi.size = new_size`
// assignment were lost in extraction and reconstructed here — verify.
void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
					       interval_set<uint64_t>& modified, uint64_t offset,
					       uint64_t length, bool write_full)
{
  interval_set<uint64_t> ch;
  if (write_full) {
    // a full write dirties the whole existing object
    if (oi.size)
      ch.insert(0, oi.size);
  } else if (length)
    ch.insert(offset, length);
  modified.union_of(ch);
  if (write_full ||
      (offset + length > oi.size && length)) {
    uint64_t new_size = offset + length;
    delta_stats.num_bytes -= oi.size;
    delta_stats.num_bytes += new_size;
    oi.size = new_size;
  }

  delta_stats.num_wr++;
  // round bytes up to whole KiB for the num_wr_kb counter
  delta_stats.num_wr_kb += shift_round_up(length, 10);
}
8689 void PrimaryLogPG::truncate_update_size_and_usage(
8690 object_stat_sum_t
& delta_stats
,
8692 uint64_t truncate_size
)
8694 if (oi
.size
!= truncate_size
) {
8695 delta_stats
.num_bytes
-= oi
.size
;
8696 delta_stats
.num_bytes
+= truncate_size
;
8697 oi
.size
= truncate_size
;
// Apply the watch disconnects gathered during op execution: for each
// (cookie, entity) pair, remove the watch from the obc's watcher map and tear
// it down (optionally notifying the client, per send_disconnect).
//
// NOTE(review): loop increment / closing-brace lines were lost in extraction
// and reconstructed — verify against project history.
void PrimaryLogPG::complete_disconnect_watches(
  ObjectContextRef obc,
  const list<watch_disconnect_t> &to_disconnect)
{
  for (list<watch_disconnect_t>::const_iterator i =
	 to_disconnect.begin();
       i != to_disconnect.end();
       ++i) {
    // watchers are keyed by (cookie, entity name)
    pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
    auto watchers_entry = obc->watchers.find(watcher);
    if (watchers_entry != obc->watchers.end()) {
      WatchRef watch = watchers_entry->second;
      dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
      obc->watchers.erase(watcher);
      watch->remove(i->send_disconnect);
    } else {
      // already gone (e.g. raced with a timeout) — log and move on
      dout(10) << "do_osd_op_effects disconnect failed to find watcher "
	       << watcher << dendl;
    }
  }
}
// Apply the side effects accumulated in the op context after the main
// mutation: watch disconnects, watch connects, notifies, and notify acks.
//
// NOTE(review): recovered from a line-mangled extraction; dropped lines
// (WatchRef/NotifyRef declarations, makeNotifyRef argument lines, insert()
// arguments, notif->init()) were reconstructed from the visible statements —
// verify against project history before merging.
void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
{
  entity_name_t entity = ctx->reqid.name;
  dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;

  // disconnects first
  complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);

  ceph_assert(conn);

  auto session = conn->get_priv();

  // --- watch connects: reuse an existing watch for this (cookie, entity) or
  // create a new one, then (re)connect it to this connection ---
  for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
       i != ctx->watch_connects.end();
       ++i) {
    pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
    dout(15) << "do_osd_op_effects applying watch connect on session "
	     << session.get() << " watcher " << watcher << dendl;
    WatchRef watch;
    if (ctx->obc->watchers.count(watcher)) {
      dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
	       << dendl;
      watch = ctx->obc->watchers[watcher];
    } else {
      dout(15) << "do_osd_op_effects new watcher " << watcher
	       << dendl;
      watch = Watch::makeWatchRef(
	this, osd, ctx->obc, i->first.timeout_seconds,
	i->first.cookie, entity, conn->get_peer_addr());
      ctx->obc->watchers.insert(
	make_pair(
	  watcher,
	  watch));
    }
    // i->second is the will_ping flag decoded from the op
    watch->connect(conn, i->second);
  }

  // --- notifies: fan a new notify out to every current watcher ---
  for (list<notify_info_t>::iterator p = ctx->notifies.begin();
       p != ctx->notifies.end();
       ++p) {
    dout(10) << "do_osd_op_effects, notify " << *p << dendl;
    ConnectionRef conn(ctx->op->get_req()->get_connection());
    NotifyRef notif(
      Notify::makeNotifyRef(
	conn,
	ctx->reqid.name.num(),
	p->bl,
	p->timeout,
	p->cookie,
	p->notify_id,
	ctx->obc->obs.oi.user_version,
	osd));
    for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
	   ctx->obc->watchers.begin();
	 i != ctx->obc->watchers.end();
	 ++i) {
      dout(10) << "starting notify on watch " << i->first << dendl;
      i->second->start_notify(notif);
    }
    notif->init();
  }

  // --- notify acks: ack on every watch owned by this entity (and matching
  // cookie, when one was supplied) ---
  for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
       p != ctx->notify_acks.end();
       ++p) {
    if (p->watch_cookie)
      dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
    else
      dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
    for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
	   ctx->obc->watchers.begin();
	 i != ctx->obc->watchers.end();
	 ++i) {
      if (i->first.second != entity) continue;
      if (p->watch_cookie &&
	  *(p->watch_cookie) != i->first.first) continue;
      dout(10) << "acking notify on watch " << i->first << dendl;
      i->second->notify_ack(p->notify_id, p->reply_bl);
    }
  }
}
8807 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
8810 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
8811 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
8812 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8813 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Build a deterministic temp object name for recovery of `target`:
// pgid + version + interval + snapid is unique (and short), so concurrent
// recoveries of different versions never collide.
//
// NOTE(review): the `version` parameter line and the ostringstream
// declaration were lost in extraction and reconstructed — verify.
hobject_t PrimaryLogPG::get_temp_recovery_object(
  const hobject_t& target,
  eversion_t version)
{
  ostringstream ss;
  ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
     << "_" << version
     << "_" << info.history.same_interval_since
     << "_" << target.snap;
  // pgid + version + interval + snapid is unique, and short
  hobject_t hoid = target.make_temp_hobject(ss.str());
  dout(20) << __func__ << " " << hoid << dendl;
  return hoid;
}
// Run the ops in `ctx` and turn the result into a transaction + log entries:
// validates the snap context, executes do_osd_ops, short-circuits pure reads
// and errors (recording error writes in the log for dup detection), enforces
// pool-full policy, clones the head if needed, and finalizes via finish_ctx.
//
// Returns 0 on success, negative errno otherwise (-EINPROGRESS for async).
//
// NOTE(review): recovered from a line-mangled extraction; dropped lines
// (`if (result < 0)` wrapper, return statements, the final else branch of the
// full-pool check, and the finish_ctx call header) were reconstructed —
// verify against project history before merging.
int PrimaryLogPG::prepare_transaction(OpContext *ctx)
{
  ceph_assert(!ctx->ops->empty());

  // valid snap context?
  if (!ctx->snapc.is_valid()) {
    dout(10) << " invalid snapc " << ctx->snapc << dendl;
    return -EINVAL;
  }

  // prepare the actual mutation
  int result = do_osd_ops(ctx, *ctx->ops);
  if (result < 0) {
    if (ctx->op->may_write() &&
	get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
      // need to save the error code in the pg log, to detect dup ops,
      // but do nothing else
      ctx->update_log_only = true;
    }
    return result;
  }

  // read-op?  write-op noop? done?
  if (ctx->op_t->empty() && !ctx->modify) {
    if (ctx->pending_async_reads.empty())
      unstable_stats.add(ctx->delta_stats);
    if (ctx->op->may_write() &&
	get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
      ctx->update_log_only = true;
    }
    return result;
  }

  // check for full: growth is only allowed for privileged or FULL_FORCE ops.
  if ((ctx->delta_stats.num_bytes > 0 ||
       ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
      pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
    auto m = ctx->op->get_req<MOSDOp>();
    if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
	m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
      dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
	       << dendl;
    } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
      // they tried, they failed.
      dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
      return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
    } else {
      // drop request
      dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
      return -EAGAIN;
    }
  }

  const hobject_t& soid = ctx->obs->oi.soid;
  // clone, if necessary
  if (soid.snap == CEPH_NOSNAP)
    make_writeable(ctx);

  finish_ctx(ctx,
	     ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
	     pg_log_entry_t::DELETE,
	     result);

  return result;
}
// Finalize an op context: manifest refcount bookkeeping for dirtied deduped
// chunks, user_version bump for user-visible modifies, writing the updated
// object_info/snapset attrs into the transaction, appending the pg log entry
// (with returnvec, clean_regions, clone snaps, extra reqids), and publishing
// the new object state to the cached obc/ssc.
//
// NOTE(review): recovered from a line-mangled extraction; dropped lines
// (the tracing-span declaration/guard, bufferlist bss, ctx->log.push_back
// header, mtime log-entry argument, switch/default braces) were reconstructed
// from the visible statements — verify against project history.
void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(20) << __func__ << " " << soid << " " << ctx
	   << " op " << pg_log_entry_t::get_op_name(log_op_type)
	   << dendl;
  utime_t now = ceph_clock_now();

  // NOTE(review): upstream guards this span creation on ctx->op — the guard
  // line was lost in extraction; confirm before merging.
  auto span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);

  // Drop the reference if deduped chunk is modified
  if (ctx->new_obs.oi.is_dirty() &&
      (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
      !ctx->cache_operation &&
      log_op_type != pg_log_entry_t::PROMOTE) {
    update_chunk_map_by_dirty(ctx);
    // If a clone is creating, ignore dropping the reference for manifest object
    if (!ctx->delta_stats.num_object_clones) {
      dec_refcount_by_dirty(ctx);
    }
  }

  // finish and log the op.
  if (ctx->user_modify) {
    // update the user_version for any modify ops, except for the watch op
    ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
    /* In order for new clients and old clients to interoperate properly
     * when exchanging versions, we need to lower bound the user_version
     * (which our new clients pay proper attention to)
     * by the at_version (which is all the old clients can ever see). */
    if (ctx->at_version.version > ctx->user_at_version)
      ctx->user_at_version = ctx->at_version.version;
    ctx->new_obs.oi.user_version = ctx->user_at_version;
  }
  ctx->bytes_written = ctx->op_t->get_bytes_written();

  if (ctx->new_obs.exists) {
    ctx->new_obs.oi.version = ctx->at_version;
    ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
    ctx->new_obs.oi.last_reqid = ctx->reqid;
    if (ctx->mtime != utime_t()) {
      ctx->new_obs.oi.mtime = ctx->mtime;
      dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
      // local_mtime tracks wall-clock on this OSD (used by cache agent aging)
      ctx->new_obs.oi.local_mtime = now;
    } else {
      dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
    }

    // persist object_info_t as OI_ATTR
    map <string, bufferlist, less<>> attrs;
    bufferlist bv(sizeof(ctx->new_obs.oi));
    encode(ctx->new_obs.oi, bv,
	   get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    attrs[OI_ATTR] = std::move(bv);

    // snapset lives only on the head object
    if (soid.snap == CEPH_NOSNAP) {
      dout(10) << " final snapset " << ctx->new_snapset
	       << " in " << soid << dendl;
      bufferlist bss;
      encode(ctx->new_snapset, bss);
      attrs[SS_ATTR] = std::move(bss);
    } else {
      dout(10) << " no snapset (this is a clone)" << dendl;
    }
    ctx->op_t->setattrs(soid, attrs);
  } else {
    // object deleted: reset the cached object_info to a blank one
    ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
  }

  // append to log
  ctx->log.push_back(
    pg_log_entry_t(log_op_type, soid, ctx->at_version,
		   ctx->obs->oi.version,
		   ctx->user_at_version, ctx->reqid,
		   ctx->mtime,
		   (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
  if (ctx->op && ctx->op->allows_returnvec()) {
    // also the per-op values
    ctx->log.back().set_op_returns(*ctx->ops);
    dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
	     << dendl;
  }

  // record which regions this write dirtied, for partial recovery
  ctx->log.back().clean_regions = ctx->clean_regions;
  dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;

  // For ops that mutate a clone, stash its snaps in the log entry so replicas
  // can reconstruct clone_snaps.
  if (soid.snap < CEPH_NOSNAP) {
    switch (log_op_type) {
    case pg_log_entry_t::MODIFY:
    case pg_log_entry_t::PROMOTE:
    case pg_log_entry_t::CLEAN:
      dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
	       << dendl;
      encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
      break;
    default:
      break;
    }
  }

  if (!ctx->extra_reqids.empty()) {
    dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
	     << ctx->extra_reqid_return_codes << dendl;
    ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
    ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
  }

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  if (soid.is_head() && !ctx->obc->obs.exists) {
    // head deleted: the snapset context no longer exists
    ctx->obc->ssc->exists = false;
    ctx->obc->ssc->snapset = SnapSet();
  } else {
    ctx->obc->ssc->exists = true;
    ctx->obc->ssc->snapset = ctx->new_snapset;
  }
}
// Fold per-op stat deltas into PG state, queue the delta for any backfill
// target that has not yet backfilled past `soid`, and forward the delta to
// the scrubber for objects it is currently handling.
//
// NOTE(review): the `pg_shard_t bt = *i;` line and any guard around the
// m_scrubber call were lost in extraction and reconstructed/left as visible —
// verify against project history (upstream guards on is_primary() && m_scrubber).
void PrimaryLogPG::apply_stats(
  const hobject_t &soid,
  const object_stat_sum_t &delta_stats) {

  recovery_state.apply_op_stats(soid, delta_stats);
  for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
       i != get_backfill_targets().end();
       ++i) {
    pg_shard_t bt = *i;
    const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
    // The delta applies on this target only once backfill ships the object;
    // queue it if soid is past the target's backfill horizon but within ours.
    if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
      pending_backfill_updates[soid].stats.add(delta_stats);
    }
  }

  m_scrubber->stats_of_handled_objects(delta_stats, soid);
}
// Finish a read-only op: collapse per-op rvals into a single result
// (honoring FAILOK), fill in reply versions (object user_version on success,
// last_update floor on ENOENT), log op stats, and send the reply.
//
// NOTE(review): recovered from a line-mangled extraction; dropped lines
// (`result = p->rval; break;`, the `if (result >= 0)` / `if (ctx->obs)`
// headers, and the trailing close_op_ctx) were reconstructed — verify.
void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
{
  auto m = ctx->op->get_req<MOSDOp>();
  ceph_assert(ctx->async_reads_complete());

  for (auto p = ctx->ops->begin();
       p != ctx->ops->end() && result >= 0; ++p) {
    if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
      result = p->rval;
      break;
    }
    ctx->bytes_read += p->outdata.length();
  }
  ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);

  // Hand the reply off; ctx no longer owns it.
  MOSDOpReply *reply = ctx->reply;
  ctx->reply = nullptr;

  if (result >= 0) {
    if (!ctx->ignore_log_op_stats) {
      log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);

      publish_stats_to_osd();
    }

    // on read, return the current object version
    if (ctx->obs) {
      reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
    } else {
      reply->set_reply_versions(eversion_t(), ctx->user_at_version);
    }
  } else if (result == -ENOENT) {
    // on ENOENT, set a floor for what the next user version will be.
    reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
  }

  reply->set_result(result);
  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
  osd->send_message_osd_client(reply, m->get_connection());
}
9082 // ========================================================================
// Completion for a copy-from chunk read issued to the Objecter: re-enters the
// PG under its lock and continues copy processing, unless the op was canceled
// or the PG went through a peering reset since the read was issued.
//
// NOTE(review): member declarations (pg, oid, tid) and the constructor's
// initializer tail were lost in extraction and reconstructed — verify.
struct C_Copyfrom : public Context {
  PrimaryLogPGRef pg;           // keeps the PG alive until completion
  hobject_t oid;                // destination object of the copy
  epoch_t last_peering_reset;   // staleness check against the PG
  ceph_tid_t tid;               // objecter tid, filled in after submit
  PrimaryLogPG::CopyOpRef cop;	// used for keeping the cop alive
  C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
	     const PrimaryLogPG::CopyOpRef& c)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), cop(c)
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    std::scoped_lock l{*pg};
    // Only proceed if no peering reset invalidated this copy attempt.
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->process_copy_chunk(oid, tid, r);
      cop.reset();
    }
  }
};
// Async-read completion used by do_copy_get on erasure-coded pools: once the
// data read finishes, trim the buffer to the requested length and encode the
// assembled object_copy_data_t into the op's outdata.
//
// NOTE(review): member declarations (osd_op, features, len) and the early
// rval/error handling lines were lost in extraction and reconstructed —
// verify against project history.
struct C_CopyFrom_AsyncReadCb : public Context {
  OSDOp *osd_op;                 // op whose outdata receives the reply
  object_copy_data_t reply_obj;  // reply being assembled across the read
  uint64_t features;             // client features for encoding
  size_t len;                    // bytes actually requested from the read
  C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
    osd_op(osd_op), features(features), len(0) {}
  void finish(int r) override {
    osd_op->rval = r;
    if (r < 0) {
      return;
    }

    ceph_assert(len > 0);
    ceph_assert(len <= reply_obj.data.length());
    // Trim the read buffer to exactly the requested length before encoding.
    bufferlist bl;
    bl.substr_of(reply_obj.data, 0, len);
    reply_obj.data.swap(bl);
    encode(reply_obj, osd_op->outdata, features);
  }
};
// Completion for a manifest (chunked) copy read: like C_Copyfrom but carries
// the chunk offset and resumes via process_copy_chunk_manifest.
//
// NOTE(review): member declarations (pg, oid, tid) and the constructor's
// initializer tail were lost in extraction and reconstructed — verify.
struct C_CopyChunk : public Context {
  PrimaryLogPGRef pg;           // keeps the PG alive until completion
  hobject_t oid;                // destination object of the copy
  epoch_t last_peering_reset;   // staleness check against the PG
  ceph_tid_t tid;               // objecter tid, filled in after submit
  PrimaryLogPG::CopyOpRef cop;	// used for keeping the cop alive
  uint64_t offset = 0;          // chunk offset this completion refers to
  C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
	     const PrimaryLogPG::CopyOpRef& c)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), cop(c)
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    std::scoped_lock l{*pg};
    // Only proceed if no peering reset invalidated this copy attempt.
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->process_copy_chunk_manifest(oid, tid, r, offset);
      cop.reset();
    }
  }
};
// Serve one COPY_GET step: starting from the client-supplied cursor, fill an
// object_copy_data_t with (in order) attrs, then up to `out_max` bytes of
// data, then omap header/keys, plus metadata (size, mtime, snaps, digests,
// truncate info); reqids are included only on the final step.  On EC pools
// the data read is issued asynchronously and -EINPROGRESS is returned.
//
// @return 0 on success, -EINPROGRESS for a pending async read, negative errno
//         on decode or read failure.
//
// NOTE(review): recovered from a line-mangled extraction; dropped lines
// (cursor/out_max decode, error returns, several if-headers, `cb->len`,
// omap-loop budget check, trailing result normalization) were reconstructed
// from the visible statements — verify against project history.
int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
			      OSDOp& osd_op, ObjectContextRef &obc)
{
  object_info_t& oi = obc->obs.oi;
  hobject_t& soid = oi.soid;
  int result = 0;
  object_copy_cursor_t cursor;
  uint64_t out_max;
  try {
    decode(cursor, bp);
    decode(out_max, bp);
  }
  catch (ceph::buffer::error& e) {
    result = -EINVAL;
    return result;
  }

  const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
  uint64_t features = op->get_features();

  bool async_read_started = false;
  object_copy_data_t _reply_obj;
  C_CopyFrom_AsyncReadCb *cb = nullptr;
  // EC pools cannot read synchronously; route data through the async cb.
  if (pool.info.is_erasure()) {
    cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
  }
  object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
  // size, mtime
  reply_obj.size = oi.size;
  reply_obj.mtime = oi.mtime;
  ceph_assert(obc->ssc);
  if (soid.snap < CEPH_NOSNAP) {
    // clones report their snap list
    auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
    ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
    reply_obj.snaps = p->second;
  } else {
    // heads report the snapset seq
    reply_obj.snap_seq = obc->ssc->snapset.seq;
  }
  if (oi.is_data_digest()) {
    reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
    reply_obj.data_digest = oi.data_digest;
  }
  if (oi.is_omap_digest()) {
    reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
    reply_obj.omap_digest = oi.omap_digest;
  }
  reply_obj.truncate_seq = oi.truncate_seq;
  reply_obj.truncate_size = oi.truncate_size;

  // attrs
  map<string,bufferlist,less<>>& out_attrs = reply_obj.attrs;
  if (!cursor.attr_complete) {
    result = getattrs_maybe_cache(
      ctx->obc,
      &out_attrs);
    if (result < 0) {
      if (cb)
	delete cb;
      return result;
    }
    cursor.attr_complete = true;
    dout(20) << " got attrs" << dendl;
  }

  int64_t left = out_max - osd_op.outdata.length();

  // data
  bufferlist& bl = reply_obj.data;
  if (left > 0 && !cursor.data_complete) {
    if (cursor.data_offset < oi.size) {
      uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
      if (cb) {
	async_read_started = true;
	ctx->pending_async_reads.push_back(
	  make_pair(
	    boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
	    make_pair(&bl, cb)));
	cb->len = max_read;

	ctx->op_finishers[ctx->current_osd_subop_num].reset(
	  new ReadFinisher(osd_op));
	result = -EINPROGRESS;

	dout(10) << __func__ << ": async_read noted for " << soid << dendl;
      } else {
	result = pgbackend->objects_read_sync(
	  oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
	if (result < 0)
	  return result;
      }
      left -= max_read;
      cursor.data_offset += max_read;
    }
    if (cursor.data_offset == oi.size) {
      cursor.data_complete = true;
      dout(20) << " got data" << dendl;
    }
    ceph_assert(cursor.data_offset <= oi.size);
  }

  // omap (only attempted once data is complete, see assert below)
  uint32_t omap_keys = 0;
  if (!pool.info.supports_omap() || !oi.is_omap()) {
    cursor.omap_complete = true;
  } else {
    if (left > 0 && !cursor.omap_complete) {
      ceph_assert(cursor.data_complete);
      if (cursor.omap_offset.empty()) {
	// first omap step also ships the omap header
	osd->store->omap_get_header(ch, ghobject_t(oi.soid),
				    &reply_obj.omap_header);
      }
      bufferlist omap_data;
      ObjectMap::ObjectMapIterator iter =
	osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
      ceph_assert(iter);
      iter->upper_bound(cursor.omap_offset);
      for (; iter->valid(); iter->next()) {
	++omap_keys;
	encode(iter->key(), omap_data);
	encode(iter->value(), omap_data);
	// budget: key + value plus 4-byte length prefixes each
	left -= iter->key().length() + 4 + iter->value().length() + 4;
	if (left <= 0)
	  break;
      }
      if (omap_keys) {
	encode(omap_keys, reply_obj.omap_data);
	reply_obj.omap_data.claim_append(omap_data);
      }
      if (iter->valid()) {
	// resume from this key on the next COPY_GET step
	cursor.omap_offset = iter->key();
      } else {
	cursor.omap_complete = true;
	dout(20) << " got omap" << dendl;
      }
    }
  }

  if (cursor.is_complete()) {
    // include reqids only in the final step. this is a bit fragile
    // but it works...
    recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
				       &reply_obj.reqids,
				       &reply_obj.reqid_return_codes);
    dout(20) << " got reqids" << dendl;
  }

  dout(20) << " cursor.is_complete=" << cursor.is_complete()
	   << " " << out_attrs.size() << " attrs"
	   << " " << bl.length() << " bytes"
	   << " " << reply_obj.omap_header.length() << " omap header bytes"
	   << " " << reply_obj.omap_data.length() << " omap data bytes in "
	   << omap_keys << " keys"
	   << " " << reply_obj.reqids.size() << " reqids"
	   << dendl;
  reply_obj.cursor = cursor;
  if (!async_read_started) {
    encode(reply_obj, osd_op.outdata, features);
  }
  if (cb && !async_read_started) {
    delete cb;
  }

  if (result > 0) {
    result = 0;
  }
  return result;
}
// Reply to a COPY_GET on a nonexistent object: send ENOENT but still include
// the object's recent reqids from the pg log so the copier can perform dup-op
// detection against the deleted object's history.
//
// NOTE(review): the `osd_op` parameter line was lost in extraction and was
// reconstructed from the body's use of `osd_op` — verify against history.
void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
					  OSDOp& osd_op)
{
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  uint64_t features = m->get_features();
  object_copy_data_t reply_obj;

  recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
				     &reply_obj.reqid_return_codes);
  dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
  encode(reply_obj, osd_op.outdata, features);
  osd_op.rval = -ENOENT;
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
  reply->set_result(-ENOENT);
  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
  osd->send_message_osd_client(reply, m->get_connection());
}
// Begin copying `src` into the object behind `obc`: cancel any in-flight copy
// to the same destination, register a new CopyOp, and kick off the first
// chunk — via the manifest-aware path for chunked objects, the plain path
// otherwise.
//
// NOTE(review): recovered from a line-mangled extraction; trailing dout
// arguments and any lines between CopyOp registration and the dispatch below
// were lost and reconstructed minimally — verify against project history.
void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
			      hobject_t src, object_locator_t oloc,
			      version_t version, unsigned flags,
			      bool mirror_snapset,
			      unsigned src_obj_fadvise_flags,
			      unsigned dest_obj_fadvise_flags)
{
  const hobject_t& dest = obc->obs.oi.soid;
  dout(10) << __func__ << " " << dest
	   << " from " << src << " " << oloc << " v" << version
	   << " flags " << flags
	   << (mirror_snapset ? " mirror_snapset" : "")
	   << dendl;

  // mirror_snapset is only meaningful when copying a whole head object
  ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);

  // cancel a previous in-progress copy?
  if (copy_ops.count(dest)) {
    // FIXME: if the src etc match, we could avoid restarting from the
    // beginning.
    CopyOpRef cop = copy_ops[dest];
    vector<ceph_tid_t> tids;
    cancel_copy(cop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }

  CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
			   mirror_snapset, src_obj_fadvise_flags,
			   dest_obj_fadvise_flags));
  copy_ops[dest] = cop;

  // Chunked manifests are copied chunk-by-chunk starting at the first chunk;
  // plain and redirect objects use the regular copy path.
  if (!obc->obs.oi.has_manifest()) {
    _copy_some(obc, cop);
  } else {
    if (obc->obs.oi.manifest.is_redirect()) {
      _copy_some(obc, cop);
    } else if (obc->obs.oi.manifest.is_chunked()) {
      auto p = obc->obs.oi.manifest.chunk_map.begin();
      _copy_some_manifest(obc, cop, p->first);
    } else {
      ceph_abort_msg("unrecognized manifest type");
    }
  }
}
// Issue the next COPY_GET chunk for a plain (non-chunked) copy: translate the
// COPY_FROM flags into objecter flags, optionally list snaps on the first
// step when mirroring a snapset, then submit a copy_get read that fills the
// CopyOp's cursor/results; completion re-enters via C_Copyfrom.
//
// NOTE(review): recovered from a line-mangled extraction; dropped lines
// (`unsigned flags = 0;`, ObjectOperation declarations, the CEPH_SNAPDIR
// argument line, `&cop->rval`, `fin->tid = tid;`, `gather.activate();`) were
// reconstructed from the visible statements — verify against project history.
void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
{
  dout(10) << __func__ << " " << *obc << " " << cop << dendl;

  // Map COPY_FROM op flags onto objecter read flags.
  unsigned flags = 0;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
    flags |= CEPH_OSD_FLAG_FLUSH;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
    flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
    flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
    flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
    flags |= CEPH_OSD_FLAG_RWORDERED;

  C_GatherBuilder gather(cct);

  // On the very first step of a snapset-mirroring copy, also fetch the
  // source's snapset via list_snaps.
  if (cop->cursor.is_initial() && cop->mirror_snapset) {
    ceph_assert(cop->src.snap == CEPH_NOSNAP);
    ObjectOperation op;
    op.list_snaps(&cop->results.snapset, NULL);
    ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
					 CEPH_SNAPDIR, NULL,
					 flags, gather.new_sub(), NULL);
    cop->objecter_tid2 = tid;
  }

  ObjectOperation op;
  if (cop->results.user_version) {
    // Pin subsequent chunks to the version we learned on the first chunk.
    op.assert_version(cop->results.user_version);
  } else {
    // we should learn the version after the first chunk, if we didn't know
    // it already!
    ceph_assert(cop->cursor.is_initial());
  }
  op.copy_get(&cop->cursor, get_copy_chunk_size(),
	      &cop->results.object_size, &cop->results.mtime,
	      &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
	      &cop->results.snaps, &cop->results.snap_seq,
	      &cop->results.flags,
	      &cop->results.source_data_digest,
	      &cop->results.source_omap_digest,
	      &cop->results.reqids,
	      &cop->results.reqid_return_codes,
	      &cop->results.truncate_seq,
	      &cop->results.truncate_size,
	      &cop->rval);
  op.set_last_op_flags(cop->src_obj_fadvise_flags);

  C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
				   get_last_peering_reset(), cop);
  gather.set_finisher(new C_OnFinisher(fin,
				       osd->get_objecter_finisher(get_pg_shard())));

  ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
				       cop->src.snap, NULL,
				       flags,
				       gather.new_sub(),
				       // discover the object version if we don't know it yet
				       cop->results.user_version ? NULL : &cop->results.user_version);
  fin->tid = tid;
  cop->objecter_tid = tid;
  gather.activate();
}
// _copy_some_manifest(): issue per-chunk reads for a chunked-manifest object
// starting at start_offset.  A sub CopyOp is created per manifest chunk and an
// objecter read is dispatched to each chunk's backing object; completions are
// handled in process_copy_chunk_manifest().
// NOTE(review): this extracted listing is garbled -- original lines (opening
// brace, `ObjectOperation op`, the `flags`/`num_chunks` declarations, several
// closing braces) are missing from this view; comments describe only what is
// visible.  Verify against the upstream file before editing.
9452 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc
, CopyOpRef cop
, uint64_t start_offset
)
9454 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
// Translate the copy-from request flags into the CEPH_OSD_FLAG_* bits used on
// the outgoing objecter ops.
9457 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9458 flags
|= CEPH_OSD_FLAG_FLUSH
;
9459 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9460 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9461 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9462 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9463 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9464 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9465 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9466 flags
|= CEPH_OSD_FLAG_RWORDERED
;
// First pass over the chunk_map: accumulate chunk lengths until the batch
// exceeds get_copy_chunk_size(), remembering the last offset included.
9469 uint64_t last_offset
= 0, chunks_size
= 0;
9470 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
9471 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
->chunk_map
.find(start_offset
);
9472 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9474 chunks_size
+= iter
->second
.length
;
9475 last_offset
= iter
->first
;
9476 if (get_copy_chunk_size() < chunks_size
) {
// Record the batch bounds on the parent CopyOp so completion handling can
// tell when all sub-chunk reads for this batch have returned.
9481 cop
->num_chunk
= num_chunks
;
9482 cop
->start_offset
= start_offset
;
9483 cop
->last_offset
= last_offset
;
9484 dout(20) << __func__
<< " oid " << obc
->obs
.oi
.soid
<< " num_chunks: " << num_chunks
9485 << " start_offset: " << start_offset
<< " chunks_size: " << chunks_size
9486 << " last_offset: " << last_offset
<< dendl
;
// Second pass: build one sub CopyOp + CEPH_OSD_OP_READ per chunk in the
// batch and dispatch an objecter read against the chunk's backing object.
9488 iter
= manifest
->chunk_map
.find(start_offset
);
9489 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9490 uint64_t obj_offset
= iter
->first
;
9491 uint64_t length
= manifest
->chunk_map
[iter
->first
].length
;
9492 hobject_t soid
= manifest
->chunk_map
[iter
->first
].oid
;
9493 object_locator_t
oloc(soid
);
9494 CopyCallback
* cb
= NULL
;
9495 CopyOpRef
sub_cop(std::make_shared
<CopyOp
>(cb
, ObjectContextRef(), cop
->src
, oloc
,
9496 cop
->results
.user_version
, cop
->flags
, cop
->mirror_snapset
,
9497 cop
->src_obj_fadvise_flags
, cop
->dest_obj_fadvise_flags
));
9498 sub_cop
->cursor
.data_offset
= obj_offset
;
9499 cop
->chunk_cops
[obj_offset
] = sub_cop
;
9501 int s
= sub_cop
->chunk_ops
.size();
9502 sub_cop
->chunk_ops
.resize(s
+1);
9503 sub_cop
->chunk_ops
[s
].op
.op
= CEPH_OSD_OP_READ
;
9504 sub_cop
->chunk_ops
[s
].op
.extent
.offset
= manifest
->chunk_map
[iter
->first
].offset
;
9505 sub_cop
->chunk_ops
[s
].op
.extent
.length
= length
;
9508 op
.dup(sub_cop
->chunk_ops
);
// Pin the source version if we already know it; otherwise it must be the
// initial chunk (we learn the version from the first reply).
9510 if (cop
->results
.user_version
) {
9511 op
.assert_version(cop
->results
.user_version
);
9513 // we should learn the version after the first chunk, if we didn't know
9515 ceph_assert(cop
->cursor
.is_initial());
9517 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
// Completion context: C_CopyChunk re-enters process_copy_chunk_manifest()
// with this chunk's offset.
9519 C_CopyChunk
*fin
= new C_CopyChunk(this, obc
->obs
.oi
.soid
,
9520 get_last_peering_reset(), cop
);
9521 fin
->offset
= obj_offset
;
9523 ceph_tid_t tid
= osd
->objecter
->read(
9525 sub_cop
->src
.snap
, NULL
,
9527 new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
9528 // discover the object version if we don't know it yet
9529 sub_cop
->results
.user_version
? NULL
: &sub_cop
->results
.user_version
);
9531 sub_cop
->objecter_tid
= tid
;
9533 dout(20) << __func__
<< " tgt_oid: " << soid
.oid
<< " tgt_offset: "
9534 << manifest
->chunk_map
[iter
->first
].offset
9535 << " length: " << length
<< " pool id: " << oloc
.pool
9536 << " tid: " << tid
<< dendl
;
// Stop once we have dispatched the last chunk of this batch.
9538 if (last_offset
<= iter
->first
) {
// process_copy_chunk(): objecter completion for one copy-get chunk of a
// (non-manifest) copy.  Looks up the in-flight CopyOp, sanitizes snaps,
// accumulates running data/omap digests, persists partial data to a temp
// object and re-issues _copy_some() until the cursor is complete, then
// verifies digests against the source, fills in the final transaction, and
// fires the copy callback.
// NOTE(review): extracted listing is garbled -- error/early-return branches,
// some braces and locals are missing from this view; comments cover only the
// visible statements.
9546 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9547 << " " << cpp_strerror(r
) << dendl
;
// Locate the in-flight copy for this object; it may have been cancelled.
9548 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9549 if (p
== copy_ops
.end()) {
9550 dout(10) << __func__
<< " no copy_op found" << dendl
;
9553 CopyOpRef cop
= p
->second
;
// Stale completion from a superseded objecter op: ignore it.
9554 if (tid
!= cop
->objecter_tid
) {
9555 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
9556 << " tid " << cop
->objecter_tid
<< dendl
;
9560 if (cop
->omap_data
.length() || cop
->omap_header
.length())
9561 cop
->results
.has_omap
= true;
// Reject omap payloads when the local pool cannot store omap.
9563 if (r
>= 0 && !pool
.info
.supports_omap() &&
9564 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
9567 cop
->objecter_tid
= 0;
9568 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9569 ObjectContextRef
& cobc
= cop
->obc
;
9574 ceph_assert(cop
->rval
>= 0);
// For a clone copy, drop any snaps that were deleted while the copy was in
// flight; if none remain the clone no longer exists.
9576 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
9577 // verify snap hasn't been deleted
9578 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
9579 while (p
!= cop
->results
.snaps
.end()) {
9580 // make best effort to sanitize snaps/clones.
9581 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
9582 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
9584 for (vector
<snapid_t
>::iterator q
= p
+ 1;
9585 q
!= cop
->results
.snaps
.end();
9588 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
9593 if (cop
->results
.snaps
.empty()) {
9594 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
9600 ceph_assert(cop
->rval
>= 0);
// Fold this chunk into the running data digest; the cursor tracks which
// sections (data / omap / attrs) are already complete.
9602 if (!cop
->temp_cursor
.data_complete
) {
9603 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
9605 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
9606 if (cop
->omap_header
.length()) {
9607 cop
->results
.omap_digest
=
9608 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
9610 if (cop
->omap_data
.length()) {
// Skip the leading 4-byte count when digesting the encoded omap keys.
9612 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
9613 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
// Copy received xattrs into results, restoring the leading "_" prefix used
// by the object store for user attrs.
9617 if (!cop
->temp_cursor
.attr_complete
) {
9618 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
9619 p
!= cop
->attrs
.end();
9621 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
// More chunks remain: stash what we have in a temp object and request the
// next chunk via _copy_some().
9626 if (!cop
->cursor
.is_complete()) {
9627 // write out what we have so far
9628 if (cop
->temp_cursor
.is_initial()) {
9629 ceph_assert(!cop
->results
.started_temp_obj
);
9630 cop
->results
.started_temp_obj
= true;
9631 cop
->results
.temp_oid
= generate_temp_object(oid
);
9632 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
9634 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9635 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9636 if (cop
->temp_cursor
.is_initial()) {
9637 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
9639 _write_copy_chunk(cop
, ctx
->op_t
.get());
9640 simple_opc_submit(std::move(ctx
));
9641 dout(10) << __func__
<< " fetching more" << dendl
;
9642 _copy_some(cobc
, cop
);
// Transfer complete: cross-check our computed digests against the digests
// reported by the source; mismatches are logged to the cluster log.
9647 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
9648 dout(20) << __func__
<< std::hex
9649 << " got digest: rx data 0x" << cop
->results
.data_digest
9650 << " omap 0x" << cop
->results
.omap_digest
9651 << ", source: data 0x" << cop
->results
.source_data_digest
9652 << " omap 0x" << cop
->results
.source_omap_digest
9654 << " flags " << cop
->results
.flags
9657 if (cop
->results
.is_data_digest() &&
9658 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
9659 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
9660 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
9662 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9663 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9664 << " data digest 0x" << cop
->results
.data_digest
9665 << " != source 0x" << cop
->results
.source_data_digest
9670 if (cop
->results
.is_omap_digest() &&
9671 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
9672 derr
<< __func__
<< std::hex
9673 << " omap digest 0x" << cop
->results
.omap_digest
9674 << " != source 0x" << cop
->results
.source_omap_digest
9675 << std::dec
<< dendl
;
9676 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9677 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9678 << " omap digest 0x" << cop
->results
.omap_digest
9679 << " != source 0x" << cop
->results
.source_omap_digest
// Debug hook: force a copyfrom failure for testing.
9684 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
9685 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
// Deferred finalization: the consumer invokes fill_in_final_tx to splice the
// copied data into its own transaction.  `cop` is captured by reference
// deliberately, to avoid a shared_ptr reference cycle (see comment below).
9690 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
9691 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
9692 ObjectState
& obs
= cop
->obc
->obs
;
9693 if (cop
->temp_cursor
.is_initial()) {
9694 dout(20) << "fill_in_final_tx: writing "
9695 << "directly to final object" << dendl
;
9696 // write directly to final object
9697 cop
->results
.temp_oid
= obs
.oi
.soid
;
9698 _write_copy_chunk(cop
, t
);
9700 // finish writing to temp object, then move into place
9701 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
9702 if (obs
.oi
.has_manifest() && obs
.oi
.manifest
.is_redirect() && obs
.exists
) {
9703 /* In redirect manifest case, the object exists in the upper tier.
9704 * So, to avoid a conflict when rename() is called, remove existing
9707 t
->remove(obs
.oi
.soid
);
9709 _write_copy_chunk(cop
, t
);
9710 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
9712 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
9715 dout(20) << __func__
<< " success; committing" << dendl
;
// Deliver the final result to the registered CopyCallback and retire the op.
9718 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9719 CopyCallbackResults
results(r
, &cop
->results
);
9720 cop
->cb
->complete(results
);
9722 copy_ops
.erase(cobc
->obs
.oi
.soid
);
// Error path: clean up any partially-written temp object.
9725 if (r
< 0 && cop
->results
.started_temp_obj
) {
9726 dout(10) << __func__
<< " deleting partial temp object "
9727 << cop
->results
.temp_oid
<< dendl
;
9728 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9729 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9730 ctx
->op_t
->remove(cop
->results
.temp_oid
);
9731 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
9732 simple_opc_submit(std::move(ctx
));
9735 // cancel and requeue proxy ops on this object
9737 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
// Wake any client ops that were blocked on this object context.
9740 kick_object_context_blocked(cobc
);
// process_copy_chunk_manifest(): objecter completion for one chunk read of a
// manifest (dedup/tiering) promote.  Decrements the outstanding-chunk count;
// when all chunk reads are in (and none failed), takes a write lock on the
// target obc, writes every fetched chunk into place, clears the chunks'
// FLAG_MISSING, submits the repop, and either kicks off the next batch via
// _copy_some_manifest() or completes the parent CopyOp's callback.
// NOTE(review): extracted listing is garbled -- several early-return/error
// branches and braces are missing from this view.
9745 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9746 << " " << cpp_strerror(r
) << dendl
;
// Find the parent CopyOp, then the per-chunk sub-op keyed by chunk offset.
9747 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9748 if (p
== copy_ops
.end()) {
9749 dout(10) << __func__
<< " no copy_op found" << dendl
;
9752 CopyOpRef obj_cop
= p
->second
;
9753 CopyOpRef chunk_cop
= obj_cop
->chunk_cops
[offset
];
// Stale completion from a superseded objecter op: ignore it.
9755 if (tid
!= chunk_cop
->objecter_tid
) {
9756 dout(10) << __func__
<< " tid " << tid
<< " != cop " << chunk_cop
9757 << " tid " << chunk_cop
->objecter_tid
<< dendl
;
9761 if (chunk_cop
->omap_data
.length() || chunk_cop
->omap_header
.length()) {
9765 chunk_cop
->objecter_tid
= 0;
9766 chunk_cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9767 ObjectContextRef
& cobc
= obj_cop
->obc
;
9768 OSDOp
&chunk_data
= chunk_cop
->chunk_ops
[0];
// Any chunk error (or an empty read) marks the whole parent op failed.
9771 obj_cop
->failed
= true;
9775 if (obj_cop
->failed
) {
9778 if (!chunk_data
.outdata
.length()) {
9780 obj_cop
->failed
= true;
9784 obj_cop
->num_chunk
--;
9786 /* check all of the copyop are completed */
9787 if (obj_cop
->num_chunk
) {
9788 dout(20) << __func__
<< " num_chunk: " << obj_cop
->num_chunk
<< dendl
;
// All chunks arrived: take the object write lock before mutating; if the
// lock is unavailable (e.g. recovery holds a read lock) fail the op.
9793 OpContextUPtr ctx
= simple_opc_create(obj_cop
->obc
);
9794 if (!ctx
->lock_manager
.take_write_lock(
9795 obj_cop
->obc
->obs
.oi
.soid
,
9797 // recovery op can take read lock.
9798 // so need to wait for recovery completion
9800 obj_cop
->failed
= true;
9801 close_op_ctx(ctx
.release());
9804 dout(20) << __func__
<< " took lock on obc, " << obj_cop
->obc
->rwstate
<< dendl
;
// Write every fetched chunk into the local object at its manifest offset,
// update stats/dirty regions, and clear the chunk's MISSING flag.
9806 PGTransaction
*t
= ctx
->op_t
.get();
9807 ObjectState
& obs
= ctx
->new_obs
;
9808 for (auto p
: obj_cop
->chunk_cops
) {
9809 OSDOp
&sub_chunk
= p
.second
->chunk_ops
[0];
9810 t
->write(cobc
->obs
.oi
.soid
,
9811 p
.second
->cursor
.data_offset
,
9812 sub_chunk
.outdata
.length(),
9814 p
.second
->dest_obj_fadvise_flags
);
9815 dout(20) << __func__
<< " offset: " << p
.second
->cursor
.data_offset
9816 << " length: " << sub_chunk
.outdata
.length() << dendl
;
9817 write_update_size_and_usage(ctx
->delta_stats
, obs
.oi
, ctx
->modified_ranges
,
9818 p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9819 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_MISSING
);
9820 ctx
->clean_regions
.mark_data_region_dirty(p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9821 sub_chunk
.outdata
.clear();
9823 obs
.oi
.clear_data_digest();
9824 ctx
->at_version
= get_next_version();
9825 finish_ctx(ctx
.get(), pg_log_entry_t::PROMOTE
);
9826 simple_opc_submit(std::move(ctx
));
9827 obj_cop
->chunk_cops
.clear();
// If chunks beyond last_offset remain, start the next batch from the first
// offset past last_offset; otherwise the promote is done.
9829 auto p
= cobc
->obs
.oi
.manifest
.chunk_map
.rbegin();
9830 /* check remaining work */
9831 if (p
!= cobc
->obs
.oi
.manifest
.chunk_map
.rend()) {
9832 if (obj_cop
->last_offset
< p
->first
) {
9833 for (auto &en
: cobc
->obs
.oi
.manifest
.chunk_map
) {
9834 if (obj_cop
->last_offset
< en
.first
) {
9835 _copy_some_manifest(cobc
, obj_cop
, en
.first
);
// Completion: hand the aggregate result to the parent op's callback and
// retire the copy op.
9844 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9845 CopyCallbackResults
results(r
, &obj_cop
->results
);
9846 obj_cop
->cb
->complete(results
);
9848 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9851 // cancel and requeue proxy ops on this object
9853 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9856 kick_object_context_blocked(cobc
);
// cancel_and_requeue_proxy_ops(): cancel every in-flight proxy read and proxy
// write targeting `oid`, batch-cancel their objecter tids with -ECANCELED,
// then requeue the client ops that were blocked behind the proxying.
// Iterators are advanced before erase-inducing cancel calls (post-increment
// inside the cancel_* call) so iteration stays valid.
9859 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid
) {
9860 vector
<ceph_tid_t
> tids
;
// Collect and cancel proxy reads on this object.
9861 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
9862 it
!= proxyread_ops
.end();) {
9863 if (it
->second
->soid
== oid
) {
9864 cancel_proxy_read((it
++)->second
, &tids
);
// Collect and cancel proxy writes on this object.
9869 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
9870 it
!= proxywrite_ops
.end();) {
9871 if (it
->second
->soid
== oid
) {
9872 cancel_proxy_write((it
++)->second
, &tids
);
// One objecter call cancels all gathered tids.
9877 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9878 kick_proxy_ops_blocked(oid
);
// _write_copy_chunk(): append the buffered copy-get payload held in `cop`
// (attrs, data, omap header, omap keys) to transaction `t` against the temp
// object, honoring pool write alignment, then advance temp_cursor to cursor.
// NOTE(review): extracted listing is garbled -- some statements (e.g. the
// actual t->write()/omap_setheader() calls, `to_trim`/`bl` declarations) are
// missing from this view.
9883 dout(20) << __func__
<< " " << cop
9884 << " " << cop
->attrs
.size() << " attrs"
9885 << " " << cop
->data
.length() << " bytes"
9886 << " " << cop
->omap_header
.length() << " omap header bytes"
9887 << " " << cop
->omap_data
.length() << " omap data bytes"
// First chunk (attrs not yet written): create the temp object.
9889 if (!cop
->temp_cursor
.attr_complete
) {
9890 t
->create(cop
->results
.temp_oid
);
9892 if (!cop
->temp_cursor
.data_complete
) {
// Invariant: buffered data must extend exactly from the last persisted
// offset to the current cursor position.
9893 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9894 cop
->cursor
.data_offset
);
9895 if (pool
.info
.required_alignment() &&
9896 !cop
->cursor
.data_complete
) {
9898 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9899 * to pick it up on the next pass.
9901 ceph_assert(cop
->temp_cursor
.data_offset
%
9902 pool
.info
.required_alignment() == 0);
9903 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
9905 cop
->data
.length() % pool
.info
.required_alignment();
9907 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
// Rewind the cursor so the trimmed tail is re-fetched next pass.
9909 cop
->cursor
.data_offset
-= to_trim
;
9910 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9911 cop
->cursor
.data_offset
);
9914 if (cop
->data
.length()) {
9916 cop
->results
.temp_oid
,
9917 cop
->temp_cursor
.data_offset
,
9920 cop
->dest_obj_fadvise_flags
);
// Omap is only persisted when the local pool supports it; otherwise we
// assert nothing was buffered.
9924 if (pool
.info
.supports_omap()) {
9925 if (!cop
->temp_cursor
.omap_complete
) {
9926 if (cop
->omap_header
.length()) {
9928 cop
->results
.temp_oid
,
9930 cop
->omap_header
.clear();
9932 if (cop
->omap_data
.length()) {
9933 map
<string
,bufferlist
> omap
;
9934 bufferlist::const_iterator p
= cop
->omap_data
.begin();
9936 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
9937 cop
->omap_data
.clear();
9941 ceph_assert(cop
->omap_header
.length() == 0);
9942 ceph_assert(cop
->omap_data
.length() == 0);
// Everything buffered so far is now in the transaction; record progress.
9944 cop
->temp_cursor
= cop
->cursor
;
// finish_copyfrom(): finalize a client COPY_FROM on the destination object.
// Splices the staged copy data into the op's transaction (via
// fill_in_final_tx), then rewrites the destination object_info_t from the
// copy results: digests, truncate bounds, mtime, whiteout/omap flags, size
// accounting, and per-op stats.
// NOTE(review): extracted listing is garbled -- some lines (e.g. the
// `!obs.exists` branch structure, else-branches) are missing from this view.
9949 OpContext
*ctx
= cb
->ctx
;
9950 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
9952 ObjectState
& obs
= ctx
->new_obs
;
// Destination already exists: remove it first so the copied content fully
// replaces it; otherwise count the newly created object.
9954 dout(20) << __func__
<< ": exists, removing" << dendl
;
9955 ctx
->op_t
->remove(obs
.oi
.soid
);
9957 ctx
->delta_stats
.num_objects
++;
9960 if (cb
->is_temp_obj_used()) {
9961 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
9963 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
9965 // CopyFromCallback fills this in for us
9966 obs
.oi
.user_version
= ctx
->user_at_version
;
// Carry over (or clear) whole-object digests from the copy results.
9968 if (cb
->results
->is_data_digest()) {
9969 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
9971 obs
.oi
.clear_data_digest();
9973 if (cb
->results
->is_omap_digest()) {
9974 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
9976 obs
.oi
.clear_omap_digest();
9979 obs
.oi
.truncate_seq
= cb
->truncate_seq
;
9980 obs
.oi
.truncate_size
= cb
->truncate_size
;
// Use the source object's mtime; zero ctx->mtime so the repop doesn't stamp
// its own time over it.
9982 obs
.oi
.mtime
= ceph::real_clock::to_timespec(cb
->results
->mtime
);
9983 ctx
->mtime
= utime_t();
9985 ctx
->extra_reqids
= cb
->results
->reqids
;
9986 ctx
->extra_reqid_return_codes
= cb
->results
->reqid_return_codes
;
9988 // cache: clear whiteout?
9989 if (obs
.oi
.is_whiteout()) {
9990 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
9991 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
9992 --ctx
->delta_stats
.num_whiteouts
;
// Track omap presence on the destination to match the copied content.
9995 if (cb
->results
->has_omap
) {
9996 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
9997 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
9998 ctx
->clean_regions
.mark_omap_dirty();
10000 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
10001 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
// The whole object changed: mark the full extent modified/dirty and adjust
// byte accounting to the new size.
10004 interval_set
<uint64_t> ch
;
10005 if (obs
.oi
.size
> 0)
10006 ch
.insert(0, obs
.oi
.size
);
10007 ctx
->modified_ranges
.union_of(ch
);
10008 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, cb
->get_data_size()));
10010 if (cb
->get_data_size() != obs
.oi
.size
) {
10011 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
10012 obs
.oi
.size
= cb
->get_data_size();
10013 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
10015 ctx
->delta_stats
.num_wr
++;
10016 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(obs
.oi
.size
, 10);
10018 osd
->logger
->inc(l_osd_copyfrom
);
// finish_promote(): completion of a cache-tier promote copy for `soid`.
// Handles clone-snap reconstruction and deleted-snap filtering, partial temp
// object cleanup on error, removal of trimmed clones from the snapset on
// ENOENT, whiteout creation when the source doesn't exist, and otherwise
// builds/submits a PROMOTE repop that installs the copied object state.
// NOTE(review): extracted listing is garbled -- early returns, else-branches
// and some locals are missing from this view; comments describe only what is
// visible.
10024 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10025 dout(10) << __func__
<< " " << soid
<< " r=" << r
10026 << " uv" << results
->user_version
<< dendl
;
10028 if (r
== -ECANCELED
) {
// For a clone promote whose content came from the base pool head object,
// reconstruct the clone's snaps from the snapset, then drop snaps that have
// since been removed; an empty result invalidates the clone.
10032 if (r
!= -ENOENT
&& soid
.is_snap()) {
10033 if (results
->snaps
.empty()) {
10034 // we must have read "snap" content from the head object in the
10035 // base pool. use snap_seq to construct what snaps should be
10036 // for this clone (what is was before we evicted the clean clone
10037 // from this pool, and what it will be when we flush and the
10038 // clone eventually happens in the base pool). we want to use
10039 // snaps in (results->snap_seq,soid.snap]
10040 SnapSet
& snapset
= obc
->ssc
->snapset
;
10041 for (auto p
= snapset
.clone_snaps
.rbegin();
10042 p
!= snapset
.clone_snaps
.rend();
10044 for (auto snap
: p
->second
) {
10045 if (snap
> soid
.snap
) {
10048 if (snap
<= results
->snap_seq
) {
10051 results
->snaps
.push_back(snap
);
10056 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
10057 filter_snapc(results
->snaps
);
10059 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
10060 if (results
->snaps
.empty()) {
10061 dout(20) << __func__
10062 << " snaps are empty, clone is invalid,"
10063 << " setting r to ENOENT" << dendl
;
// Error with a partially-written temp object: remove it before bailing.
10068 if (r
< 0 && results
->started_temp_obj
) {
10069 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
10070 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
10071 ceph_assert(tempobc
);
10072 OpContextUPtr ctx
= simple_opc_create(tempobc
);
10073 ctx
->op_t
->remove(results
->temp_oid
);
10074 simple_opc_submit(std::move(ctx
));
10075 results
->started_temp_obj
= false;
// Clone vanished in the base tier (trimmed): purge it from the head's
// snapset via a small repop and finish.
10078 if (r
== -ENOENT
&& soid
.is_snap()) {
10079 dout(10) << __func__
10080 << ": enoent while trying to promote clone, " << soid
10081 << " must have been trimmed, removing from snapset"
10083 hobject_t
head(soid
.get_head());
10084 ObjectContextRef obc
= get_object_context(head
, false);
10087 OpContextUPtr tctx
= simple_opc_create(obc
);
10088 tctx
->at_version
= get_next_version();
10089 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
10090 filter_snapc(tctx
->new_snapset
.snaps
);
10092 tctx
->new_snapset
.snaps
.clear();
10094 vector
<snapid_t
> new_clones
;
10095 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
10096 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
10097 i
!= tctx
->new_snapset
.clones
.end();
10099 if (*i
!= soid
.snap
) {
10100 new_clones
.push_back(*i
);
10101 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
10102 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
10103 new_clone_snaps
[*i
] = p
->second
;
10107 tctx
->new_snapset
.clones
.swap(new_clones
);
10108 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
10109 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
10110 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
10112 // take RWWRITE lock for duration of our local write. ignore starvation.
10113 if (!tctx
->lock_manager
.take_write_lock(
10116 ceph_abort_msg("problem!");
10118 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
10120 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
10122 simple_opc_submit(std::move(tctx
));
// Head object missing in the base tier => promote as a whiteout.
10126 bool whiteout
= false;
10127 if (r
== -ENOENT
) {
10128 ceph_assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
10129 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
// Unexpected error: propagate it to every client op blocked on this object.
10133 if (r
< 0 && !whiteout
) {
10134 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
10135 // pass error to everyone blocked on this object
10136 // FIXME: this is pretty sloppy, but at this point we got
10137 // something unexpected and don't have many other options.
10138 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
10139 waiting_for_blocked_object
.find(soid
);
10140 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
10141 while (!blocked_iter
->second
.empty()) {
10142 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
10143 blocked_iter
->second
.pop_front();
10145 waiting_for_blocked_object
.erase(blocked_iter
);
10150 osd
->promote_finish(results
->object_size
);
// Success path: build the PROMOTE repop that materializes the object.
10152 OpContextUPtr tctx
= simple_opc_create(obc
);
10153 tctx
->at_version
= get_next_version();
10155 if (!obc
->obs
.oi
.has_manifest()) {
10156 ++tctx
->delta_stats
.num_objects
;
10158 if (soid
.snap
< CEPH_NOSNAP
)
10159 ++tctx
->delta_stats
.num_object_clones
;
10160 tctx
->new_obs
.exists
= true;
10162 tctx
->extra_reqids
= results
->reqids
;
10163 tctx
->extra_reqid_return_codes
= results
->reqid_return_codes
;
// A redirect-manifest object being promoted stops being a manifest; drop
// the redirect target and any refcount it held.
10165 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_redirect()) {
10166 tctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
10167 tctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
10168 tctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
10169 tctx
->new_obs
.oi
.manifest
.redirect_target
= hobject_t();
10170 tctx
->delta_stats
.num_objects_manifest
--;
10171 if (obc
->obs
.oi
.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
10172 dec_all_refcount_manifest(obc
->obs
.oi
, tctx
.get());
10177 // create a whiteout
10178 tctx
->op_t
->create(soid
);
10179 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
10180 ++tctx
->delta_stats
.num_whiteouts
;
10181 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
10182 osd
->logger
->inc(l_osd_tier_whiteout
);
10184 if (results
->has_omap
) {
10185 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
10186 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
10187 ++tctx
->delta_stats
.num_objects_omap
;
// Install the copied data and object_info fields from the copy results.
10190 results
->fill_in_final_tx(tctx
->op_t
.get());
10191 if (results
->started_temp_obj
) {
10192 tctx
->discard_temp_oid
= results
->temp_oid
;
10194 tctx
->new_obs
.oi
.size
= results
->object_size
;
10195 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
10196 tctx
->new_obs
.oi
.mtime
= ceph::real_clock::to_timespec(results
->mtime
);
10197 tctx
->mtime
= utime_t();
10198 if (results
->is_data_digest()) {
10199 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
10201 tctx
->new_obs
.oi
.clear_data_digest();
10203 if (results
->object_size
)
10204 tctx
->clean_regions
.mark_data_region_dirty(0, results
->object_size
);
10205 if (results
->is_omap_digest()) {
10206 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
10208 tctx
->new_obs
.oi
.clear_omap_digest();
10210 if (results
->has_omap
)
10211 tctx
->clean_regions
.mark_omap_dirty();
10212 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
10213 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
// Clone promotes must agree with the snapset's recorded clone size/overlap;
// byte accounting differs for clones vs head.
10215 if (soid
.snap
!= CEPH_NOSNAP
) {
10216 ceph_assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
10217 ceph_assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
10218 ceph_assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
10219 results
->object_size
);
10220 ceph_assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
10222 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
10224 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
10228 if (results
->mirror_snapset
) {
10229 ceph_assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
10230 tctx
->new_snapset
.from_snap_set(
10232 get_osdmap()->require_osd_release
< ceph_release_t::luminous
);
10234 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
10236 // take RWWRITE lock for duration of our local write. ignore starvation.
10237 if (!tctx
->lock_manager
.take_write_lock(
10240 ceph_abort_msg("problem!");
10242 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
10244 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
10246 simple_opc_submit(std::move(tctx
));
10248 osd
->logger
->inc(l_osd_tier_promote
);
// Let the tiering agent reconsider its mode now that a promote finished.
10251 agent_state
->is_idle())
10252 agent_choose_mode();
// finish_promote_manifest(): completion of a manifest (dedup/chunked)
// promote.  Unlike finish_promote(), the chunk writes were already applied in
// process_copy_chunk_manifest(); here we only handle errors (propagating them
// to blocked client ops), bump promote accounting, and poke the tier agent.
// NOTE(review): extracted listing is garbled -- the early-return bodies and
// some braces are missing from this view.
10258 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10259 dout(10) << __func__
<< " " << soid
<< " r=" << r
10260 << " uv" << results
->user_version
<< dendl
;
// Cancel/retry results are not errors worth reporting.
10262 if (r
== -ECANCELED
|| r
== -EAGAIN
) {
// Unexpected failure: reply with the error to every op blocked on `soid`.
10267 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
10268 // pass error to everyone blocked on this object
10269 // FIXME: this is pretty sloppy, but at this point we got
10270 // something unexpected and don't have many other options.
10271 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
10272 waiting_for_blocked_object
.find(soid
);
10273 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
10274 while (!blocked_iter
->second
.empty()) {
10275 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
10276 blocked_iter
->second
.pop_front();
10278 waiting_for_blocked_object
.erase(blocked_iter
);
10283 osd
->promote_finish(results
->object_size
);
10284 osd
->logger
->inc(l_osd_tier_promote
);
10287 agent_state
->is_idle())
10288 agent_choose_mode();
// cancel_copy(): abort an in-flight CopyOp.  Collects its objecter tids into
// *tids for the caller to batch-cancel, removes it from copy_ops, unblocks
// the destination obc, and completes the registered callback with -ECANCELED
// (results.should_requeue tells the consumer whether to requeue the client
// op).
10291 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
,
10292 vector
<ceph_tid_t
> *tids
)
10294 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
10295 << " from " << cop
->src
<< " " << cop
->oloc
10296 << " v" << cop
->results
.user_version
<< dendl
;
10298 // cancel objecter op, if we can
10299 if (cop
->objecter_tid
) {
10300 tids
->push_back(cop
->objecter_tid
);
10301 cop
->objecter_tid
= 0;
10302 if (cop
->objecter_tid2
) {
10303 tids
->push_back(cop
->objecter_tid2
);
10304 cop
->objecter_tid2
= 0;
// Retire the op and wake anything blocked behind the destination object.
10308 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
10309 cop
->obc
->stop_block();
10311 kick_object_context_blocked(cop
->obc
);
10312 cop
->results
.should_requeue
= requeue
;
10313 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
10314 cop
->cb
->complete(result
);
10316 // There may still be an objecter callback referencing this copy op.
10317 // That callback will not need the obc since it's been canceled, and
10318 // we need the obc reference to go away prior to flush.
10319 cop
->obc
= ObjectContextRef();
// cancel_copy_ops(): cancel every in-flight copy on this PG (e.g. on
// interval change), accumulating objecter tids into *tids for batch
// cancellation by the caller.  The iterator is advanced before cancel_copy()
// erases the map entry, keeping iteration valid.
10322 void PrimaryLogPG::cancel_copy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
10324 dout(10) << __func__
<< dendl
;
10325 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
10326 while (p
!= copy_ops
.end()) {
10327 // requeue this op? can I queue up all of them?
10328 cancel_copy((p
++)->second
, requeue
, tids
);
// C_gather: finisher Context for start_cls_gather().  When all per-source
// objecter reads complete, re-locks the PG, verifies the gather op is still
// registered and the interval hasn't changed (last_peering_reset), then
// re-executes the stashed OpContext to resume the client op.
// NOTE(review): extracted listing is garbled -- the `oid`/`osd_op` member
// declarations and some early-return braces are missing from this view.
10332 struct C_gather
: public Context
{
10333 PrimaryLogPGRef pg
;
10335 epoch_t last_peering_reset
;
10337 C_gather(PrimaryLogPG
*pg_
, hobject_t oid_
, epoch_t lpr_
, OSDOp
*osd_op_
) :
10338 pg(pg_
), oid(oid_
), last_peering_reset(lpr_
), osd_op(osd_op_
) {}
10339 void finish(int r
) override
{
// -ECANCELED means the gather was torn down; nothing to do.
10340 if (r
== -ECANCELED
)
10342 std::scoped_lock locker
{*pg
};
10343 auto p
= pg
->cls_gather_ops
.find(oid
);
10344 if (p
== pg
->cls_gather_ops
.end()) {
10345 // op was cancelled
// Stale interval: a peering reset invalidated this op.
10348 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
10352 PrimaryLogPG::OpContext
*ctx
= p
->second
.ctx
;
10353 pg
->cls_gather_ops
.erase(p
);
10354 pg
->execute_ctx(ctx
);
// start_cls_gather(): for a cls-op that needs data from several source
// objects in another pool, dispatch one objecter read (invoking cls
// `cls`::`method` with `inbl`) per entry of *src_obj_buffs, gathering the
// replies into the map's bufferlists.  Registers a CLSGatherOp keyed by the
// target soid; when the C_GatherBuilder's subs all finish, C_gather resumes
// the client op.  Returns -EINPROGRESS to park the op meanwhile.
10361 OpRequestRef op
= ctx
->op
;
10362 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
// Resolve the source pool by name via the osdmap.
10364 auto pool_id
= osd
->objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
), pool
);
10365 object_locator_t
oloc(pool_id
);
10367 ObjectState
& obs
= ctx
->new_obs
;
10368 object_info_t
& oi
= obs
.oi
;
10369 const hobject_t
& soid
= oi
.soid
;
10371 ObjectContextRef obc
= get_object_context(soid
, false);
10372 C_GatherBuilder
gather(cct
);
// Register the gather op; a duplicate registration for the same soid is a
// logic error.
10374 auto [iter
, inserted
] = cls_gather_ops
.emplace(soid
, CLSGatherOp(ctx
, obc
, op
));
10375 ceph_assert(inserted
);
10376 auto &cgop
= iter
->second
;
// One cls-call read per source object; replies land in it->second.
10377 for (std::map
<std::string
, bufferlist
>::iterator it
= src_obj_buffs
->begin(); it
!= src_obj_buffs
->end(); it
++) {
10378 std::string oid
= it
->first
;
10379 ObjectOperation obj_op
;
10380 obj_op
.call(cls
, method
, inbl
);
10381 uint32_t flags
= 0;
10382 ceph_tid_t tid
= osd
->objecter
->read(
10383 object_t(oid
), oloc
, obj_op
,
10384 m
->get_snapid(), &it
->second
,
10385 flags
, gather
.new_sub());
10386 cgop
.objecter_tids
.push_back(tid
);
10387 dout(10) << __func__
<< " src=" << oid
<< ", tgt=" << soid
<< dendl
;
// Completion context resumes the current sub-op of the client request.
10390 C_gather
*fin
= new C_gather(this, soid
, get_last_peering_reset(), &(*ctx
->ops
)[ctx
->current_osd_subop_num
]);
10391 gather
.set_finisher(new C_OnFinisher(fin
,
10392 osd
->get_objecter_finisher(get_pg_shard())));
10395 return -EINPROGRESS
;
10398 // ========================================================================
10401 // Flush a dirty object in the cache tier by writing it back to the
10402 // base tier. The sequence looks like:
10404 // * send a copy-from operation to the base tier to copy the current
10405 // version of the object
10406 // * base tier will pull the object via (perhaps multiple) copy-get(s)
10407 // * on completion, we check if the object has been modified. if so,
10408 // just reply with -EAGAIN.
10409 // * try to take a write lock so we can clear the dirty flag. if this
10410 // fails, wait and retry
10411 // * start a repop that clears the bit.
10413 // If we have to wait, we will retry by coming back through the
10414 // start_flush method. We check if a flush is already in progress
10415 // and, if so, try to finish it by rechecking the version and trying
10416 // to clear the dirty bit.
10418 // In order for the cache-flush (a write op) to not block the copy-get
10419 // from reading the object, the client *must* set the SKIPRWLOCKS
10422 // NOTE: normally writes are strictly ordered for the client, but
10423 // flushes are special in that they can be reordered with respect to
10424 // other writes. In particular, we can't have a flush request block
10425 // an update to the cache pool object!
10427 struct C_Flush
: public Context
{
10428 PrimaryLogPGRef pg
;
10430 epoch_t last_peering_reset
;
10433 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
10434 : pg(p
), oid(o
), last_peering_reset(lpr
),
10435 tid(0), start(ceph_clock_now())
10437 void finish(int r
) override
{
10438 if (r
== -ECANCELED
)
10440 std::scoped_lock locker
{*pg
};
10441 if (last_peering_reset
== pg
->get_last_peering_reset()) {
10442 pg
->finish_flush(oid
, tid
, r
);
10443 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
10448 int PrimaryLogPG::start_dedup(OpRequestRef op
, ObjectContextRef obc
)
10450 const object_info_t
& oi
= obc
->obs
.oi
;
10451 const hobject_t
& soid
= oi
.soid
;
10453 ceph_assert(obc
->is_blocked());
10454 if (oi
.size
== 0) {
10458 if (pool
.info
.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE
) {
10459 dout(0) << " fingerprint algorithm is not set " << dendl
;
10464 * The operations to make dedup chunks are tracked by a ManifestOp.
10465 * This op will be finished if all the operations are completed.
10467 ManifestOpRef
mop(std::make_shared
<ManifestOp
>());
10470 std::map
<uint64_t, bufferlist
> chunks
;
10471 int r
= do_cdc(oi
, mop
->new_manifest
.chunk_map
, chunks
);
10475 if (!chunks
.size()) {
10479 // chunks issued here are different with chunk_map newly generated
10480 // because the same chunks in previous snap will not be issued
10481 // So, we need two data structures; the first is the issued chunk list to track
10482 // issued operations, and the second is the new chunk_map to update chunk_map after
10483 // all operations are finished
10484 object_ref_delta_t refs
;
10485 ObjectContextRef obc_l
, obc_g
;
10486 get_adjacent_clones(obc
, obc_l
, obc_g
);
10487 // skip if the same content exits in prev snap at same offset
10488 mop
->new_manifest
.calc_refs_to_inc_on_set(
10489 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
10490 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
10493 for (auto p
: chunks
) {
10494 hobject_t target
= mop
->new_manifest
.chunk_map
[p
.first
].oid
;
10495 if (refs
.find(target
) == refs
.end()) {
10498 C_SetDedupChunks
*fin
= new C_SetDedupChunks(this, soid
, get_last_peering_reset(), p
.first
);
10499 ceph_tid_t tid
= refcount_manifest(soid
, target
, refcount_t::CREATE_OR_GET_REF
,
10500 fin
, std::move(chunks
[p
.first
]));
10501 mop
->chunks
[target
] = make_pair(p
.first
, p
.second
.length());
10503 mop
->tids
[p
.first
] = tid
;
10505 dout(10) << __func__
<< " oid: " << soid
<< " tid: " << tid
10506 << " target: " << target
<< " offset: " << p
.first
10507 << " length: " << p
.second
.length() << dendl
;
10510 if (mop
->tids
.size()) {
10511 manifest_ops
[soid
] = mop
;
10512 manifest_ops
[soid
]->op
= op
;
10518 return -EINPROGRESS
;
10521 int PrimaryLogPG::do_cdc(const object_info_t
& oi
,
10522 std::map
<uint64_t, chunk_info_t
>& chunk_map
,
10523 std::map
<uint64_t, bufferlist
>& chunks
)
10525 string chunk_algo
= pool
.info
.get_dedup_chunk_algorithm_name();
10526 int64_t chunk_size
= pool
.info
.get_dedup_cdc_chunk_size();
10527 uint64_t total_length
= 0;
10529 std::unique_ptr
<CDC
> cdc
= CDC::create(chunk_algo
, cbits(chunk_size
)-1);
10531 dout(0) << __func__
<< " unrecognized chunk-algorithm " << dendl
;
10537 * We disable EC pool as a base tier of distributed dedup.
10538 * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
10539 * Therefore, we should change the current implementation totally to make EC pool compatible.
10540 * As s result, we leave this as a future work.
10542 int r
= pgbackend
->objects_read_sync(
10543 oi
.soid
, 0, oi
.size
, 0, &bl
);
10545 dout(0) << __func__
<< " read fail " << oi
.soid
10546 << " len: " << oi
.size
<< " r: " << r
<< dendl
;
10549 if (bl
.length() != oi
.size
) {
10550 dout(0) << __func__
<< " bl.length: " << bl
.length() << " != oi.size: "
10551 << oi
.size
<< " during chunking " << dendl
;
10555 dout(10) << __func__
<< " oid: " << oi
.soid
<< " len: " << bl
.length()
10556 << " oi.size: " << oi
.size
10557 << " chunk_size: " << chunk_size
<< dendl
;
10559 vector
<pair
<uint64_t, uint64_t>> cdc_chunks
;
10560 cdc
->calc_chunks(bl
, &cdc_chunks
);
10563 for (auto p
: cdc_chunks
) {
10565 chunk
.substr_of(bl
, p
.first
, p
.second
);
10566 hobject_t target
= get_fpoid_from_chunk(oi
.soid
, chunk
);
10567 chunks
[p
.first
] = std::move(chunk
);
10568 chunk_map
[p
.first
] = chunk_info_t(0, p
.second
, target
);
10569 total_length
+= p
.second
;
10571 return total_length
;
10574 hobject_t
PrimaryLogPG::get_fpoid_from_chunk(const hobject_t soid
, bufferlist
& chunk
)
10576 pg_pool_t::fingerprint_t fp_algo
= pool
.info
.get_fingerprint_type();
10577 if (fp_algo
== pg_pool_t::TYPE_FINGERPRINT_NONE
) {
10578 return hobject_t();
10580 object_t fp_oid
= [&fp_algo
, &chunk
]() -> string
{
10582 case pg_pool_t::TYPE_FINGERPRINT_SHA1
:
10583 return ceph::crypto::digest
<ceph::crypto::SHA1
>(chunk
).to_str();
10584 case pg_pool_t::TYPE_FINGERPRINT_SHA256
:
10585 return ceph::crypto::digest
<ceph::crypto::SHA256
>(chunk
).to_str();
10586 case pg_pool_t::TYPE_FINGERPRINT_SHA512
:
10587 return ceph::crypto::digest
<ceph::crypto::SHA512
>(chunk
).to_str();
10589 assert(0 == "unrecognized fingerprint type");
10595 object_locator_t
oloc(soid
);
10596 oloc
.pool
= pool
.info
.get_dedup_tier();
10597 get_osdmap()->object_locator_to_pg(fp_oid
, oloc
, raw_pg
);
10598 hobject_t
target(fp_oid
, oloc
.key
, snapid_t(),
10599 raw_pg
.ps(), raw_pg
.pool(),
10604 int PrimaryLogPG::finish_set_dedup(hobject_t oid
, int r
, ceph_tid_t tid
, uint64_t offset
)
10606 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10607 << " " << cpp_strerror(r
) << dendl
;
10608 map
<hobject_t
,ManifestOpRef
>::iterator p
= manifest_ops
.find(oid
);
10609 if (p
== manifest_ops
.end()) {
10610 dout(10) << __func__
<< " no manifest_op found" << dendl
;
10613 ManifestOpRef mop
= p
->second
;
10614 mop
->results
[offset
] = r
;
10616 // if any failure occurs, put a mark on the results to recognize the failure
10617 mop
->results
[0] = r
;
10619 if (mop
->num_chunks
!= mop
->results
.size()) {
10620 // there are on-going works
10621 return -EINPROGRESS
;
10623 ObjectContextRef obc
= get_object_context(oid
, false);
10626 osd
->reply_op_error(mop
->op
, -EINVAL
);
10629 ceph_assert(obc
->is_blocked());
10631 kick_object_context_blocked(obc
);
10632 if (mop
->results
[0] < 0) {
10633 // check if the previous op returns fail
10634 ceph_assert(mop
->num_chunks
== mop
->results
.size());
10635 manifest_ops
.erase(oid
);
10636 osd
->reply_op_error(mop
->op
, mop
->results
[0]);
10640 if (mop
->chunks
.size()) {
10641 OpContextUPtr ctx
= simple_opc_create(obc
);
10643 if (ctx
->lock_manager
.get_lock_type(
10648 dout(20) << __func__
<< " took write lock" << dendl
;
10649 } else if (mop
->op
) {
10650 dout(10) << __func__
<< " waiting on write lock " << mop
->op
<< dendl
;
10651 close_op_ctx(ctx
.release());
10655 ctx
->at_version
= get_next_version();
10656 ctx
->new_obs
= obc
->obs
;
10657 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
10658 --ctx
->delta_stats
.num_objects_dirty
;
10661 * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10662 * head: [0, 2) aaa <-- tier_flush()
10663 * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10665 * In this case, if the new chunk_map is as follows,
10666 * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10667 * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10668 * So, the precedure is
10669 * 1. calc_refs_to_drop_on_removal()
10670 * 2. register old references to drop after tier_flush() is committed
10671 * 3. update new chunk_map
10674 ObjectCleanRegions c_regions
= ctx
->clean_regions
;
10675 ObjectContextRef cobc
= get_prev_clone_obc(obc
);
10676 c_regions
.mark_fully_dirty();
10677 // CDC was done on entire range of manifest object,
10678 // so the first thing we should do here is to drop the reference to old chunks
10679 ObjectContextRef obc_l
, obc_g
;
10680 get_adjacent_clones(obc
, obc_l
, obc_g
);
10681 // clear all old references
10682 object_ref_delta_t refs
;
10683 ctx
->obs
->oi
.manifest
.calc_refs_to_drop_on_removal(
10684 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
10685 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
10687 if (!refs
.is_empty()) {
10688 ctx
->register_on_commit(
10689 [oid
, this, refs
](){
10690 dec_refcount(oid
, refs
);
10694 // set new references
10695 ctx
->new_obs
.oi
.manifest
.chunk_map
= mop
->new_manifest
.chunk_map
;
10697 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
10698 simple_opc_submit(std::move(ctx
));
10701 osd
->reply_op_error(mop
->op
, r
);
10703 manifest_ops
.erase(oid
);
10707 int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid
, int r
, ceph_tid_t tid
, uint64_t offset
)
10709 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10710 << " " << cpp_strerror(r
) << dendl
;
10711 map
<hobject_t
,ManifestOpRef
>::iterator p
= manifest_ops
.find(oid
);
10712 if (p
== manifest_ops
.end()) {
10713 dout(10) << __func__
<< " no manifest_op found" << dendl
;
10716 ManifestOpRef mop
= p
->second
;
10717 mop
->results
[offset
] = r
;
10719 // if any failure occurs, put a mark on the results to recognize the failure
10720 mop
->results
[0] = r
;
10722 if (mop
->num_chunks
!= mop
->results
.size()) {
10723 // there are on-going works
10724 return -EINPROGRESS
;
10728 mop
->cb
->complete(r
);
10731 manifest_ops
.erase(p
);
10737 int PrimaryLogPG::start_flush(
10738 OpRequestRef op
, ObjectContextRef obc
,
10739 bool blocking
, hobject_t
*pmissing
,
10740 std::optional
<std::function
<void()>> &&on_flush
)
10742 const object_info_t
& oi
= obc
->obs
.oi
;
10743 const hobject_t
& soid
= oi
.soid
;
10744 dout(10) << __func__
<< " " << soid
10745 << " v" << oi
.version
10746 << " uv" << oi
.user_version
10747 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
10750 bool preoctopus_compat
=
10751 get_osdmap()->require_osd_release
< ceph_release_t::octopus
;
10753 if (preoctopus_compat
) {
10754 // for pre-octopus compatibility, filter SnapSet::snaps. not
10755 // certain we need this, but let's be conservative.
10756 snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
10758 // NOTE: change this to a const ref when we remove this compat code
10759 snapset
= obc
->ssc
->snapset
;
10762 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
10763 // current dedup tier only supports blocking operation
10765 return -EOPNOTSUPP
;
10769 // verify there are no (older) check for dirty clones
10771 dout(20) << " snapset " << snapset
<< dendl
;
10772 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
10773 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
10775 if (p
!= snapset
.clones
.rend()) {
10776 hobject_t next
= soid
;
10778 ceph_assert(next
.snap
< soid
.snap
);
10779 if (recovery_state
.get_pg_log().get_missing().is_missing(next
)) {
10780 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
10785 ObjectContextRef older_obc
= get_object_context(next
, false);
10787 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
10789 if (older_obc
->obs
.oi
.is_dirty()) {
10790 dout(10) << __func__
<< " next oldest clone is dirty: "
10791 << older_obc
->obs
.oi
<< dendl
;
10795 dout(20) << __func__
<< " next oldest clone " << next
10796 << " is not present; implicitly clean" << dendl
;
10799 dout(20) << __func__
<< " no older clones" << dendl
;
10804 obc
->start_block();
10806 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
10807 if (p
!= flush_ops
.end()) {
10808 FlushOpRef fop
= p
->second
;
10809 if (fop
->op
== op
) {
10810 // we couldn't take the write lock on a cache-try-flush before;
10811 // now we are trying again for the lock.
10812 return try_flush_mark_clean(fop
);
10814 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
10815 (fop
->blocking
|| !blocking
)) {
10816 // nonblocking can join anything
10817 // blocking can only join a blocking flush
10818 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
10820 fop
->dup_ops
.push_back(op
);
10821 return -EAGAIN
; // clean up this ctx; op will retry later
10824 // cancel current flush since it will fail anyway, or because we
10825 // are blocking and the existing flush is nonblocking.
10826 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
10828 osd
->reply_op_error(fop
->op
, -EBUSY
);
10829 while (!fop
->dup_ops
.empty()) {
10830 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
10831 fop
->dup_ops
.pop_front();
10833 vector
<ceph_tid_t
> tids
;
10834 cancel_flush(fop
, false, &tids
);
10835 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10838 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
10839 int r
= start_dedup(op
, obc
);
10840 if (r
!= -EINPROGRESS
) {
10848 * In general, we need to send a delete and a copyfrom.
10849 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10850 * where 4 is marked as clean. To flush 10, we have to:
10851 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10852 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10854 * There is a complicating case. Supposed there had been a clone 7
10855 * for snaps [7, 6] which has been trimmed since they no longer exist.
10856 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10857 * the delete, the snap will be promoted to 5, and the head will become
10858 * a whiteout. When the copy-from goes through, we'll end up with
10859 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10861 * Another complication is the case where there is an interval change
10862 * after doing the delete and the flush but before marking the object
10863 * clean. We'll happily delete head and then recreate it at the same
10864 * sequence number, which works out ok.
10867 SnapContext snapc
, dsnapc
;
10868 if (snapset
.seq
!= 0) {
10869 if (soid
.snap
== CEPH_NOSNAP
) {
10870 snapc
= snapset
.get_ssc_as_of(snapset
.seq
);
10872 snapid_t min_included_snap
;
10873 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
10874 ceph_assert(p
!= snapset
.clone_snaps
.end());
10875 min_included_snap
= p
->second
.back();
10876 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
10879 snapid_t prev_snapc
= 0;
10880 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
10881 citer
!= snapset
.clones
.rend();
10883 if (*citer
< soid
.snap
) {
10884 prev_snapc
= *citer
;
10889 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
10892 object_locator_t
base_oloc(soid
);
10893 base_oloc
.pool
= pool
.info
.tier_of
;
10895 if (dsnapc
.seq
< snapc
.seq
) {
10898 osd
->objecter
->mutate(
10903 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10904 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
10905 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
10906 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
10909 FlushOpRef
fop(std::make_shared
<FlushOp
>());
10911 fop
->flushed_version
= oi
.user_version
;
10912 fop
->blocking
= blocking
;
10913 fop
->on_flush
= std::move(on_flush
);
10917 if (oi
.is_whiteout()) {
10918 fop
->removal
= true;
10921 object_locator_t
oloc(soid
);
10922 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
10923 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
10924 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
10925 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
10926 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
10927 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
10929 //mean the base tier don't cache data after this
10930 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
10931 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
10933 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
10935 ceph_tid_t tid
= osd
->objecter
->mutate(
10936 soid
.oid
, base_oloc
, o
, snapc
,
10937 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10938 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
10939 new C_OnFinisher(fin
,
10940 osd
->get_objecter_finisher(get_pg_shard())));
10941 /* we're under the pg lock and fin->finish() is grabbing that */
10943 fop
->objecter_tid
= tid
;
10945 flush_ops
[soid
] = fop
;
10947 recovery_state
.update_stats(
10948 [&oi
](auto &history
, auto &stats
) {
10949 stats
.stats
.sum
.num_flush
++;
10950 stats
.stats
.sum
.num_flush_kb
+= shift_round_up(oi
.size
, 10);
10953 return -EINPROGRESS
;
10956 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
10958 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10959 << " " << cpp_strerror(r
) << dendl
;
10960 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
10961 if (p
== flush_ops
.end()) {
10962 dout(10) << __func__
<< " no flush_op found" << dendl
;
10965 FlushOpRef fop
= p
->second
;
10966 if (tid
!= fop
->objecter_tid
&& !fop
->obc
->obs
.oi
.has_manifest()) {
10967 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
10968 << " tid " << fop
->objecter_tid
<< dendl
;
10971 ObjectContextRef obc
= fop
->obc
;
10972 fop
->objecter_tid
= 0;
10974 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
10976 osd
->reply_op_error(fop
->op
, -EBUSY
);
10977 if (fop
->blocking
) {
10979 kick_object_context_blocked(obc
);
10982 if (!fop
->dup_ops
.empty()) {
10983 dout(20) << __func__
<< " requeueing dups" << dendl
;
10984 requeue_ops(fop
->dup_ops
);
10986 if (fop
->on_flush
) {
10987 (*(fop
->on_flush
))();
10988 fop
->on_flush
= std::nullopt
;
10990 flush_ops
.erase(oid
);
10994 r
= try_flush_mark_clean(fop
);
10995 if (r
== -EBUSY
&& fop
->op
) {
10996 osd
->reply_op_error(fop
->op
, r
);
11000 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
11002 ObjectContextRef obc
= fop
->obc
;
11003 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
11005 if (fop
->blocking
) {
11007 kick_object_context_blocked(obc
);
11010 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
11011 !obc
->obs
.exists
) {
11012 if (obc
->obs
.exists
)
11013 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
11014 << " != current " << obc
->obs
.oi
.user_version
11017 dout(10) << __func__
<< " object no longer exists" << dendl
;
11019 if (!fop
->dup_ops
.empty()) {
11020 dout(20) << __func__
<< " requeueing dups" << dendl
;
11021 requeue_ops(fop
->dup_ops
);
11023 if (fop
->on_flush
) {
11024 (*(fop
->on_flush
))();
11025 fop
->on_flush
= std::nullopt
;
11027 flush_ops
.erase(oid
);
11029 osd
->logger
->inc(l_osd_tier_flush_fail
);
11031 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11035 if (!fop
->blocking
&&
11036 m_scrubber
->write_blocked_by_scrub(oid
)) {
11038 dout(10) << __func__
<< " blocked by scrub" << dendl
;
11039 requeue_op(fop
->op
);
11040 requeue_ops(fop
->dup_ops
);
11041 return -EAGAIN
; // will retry
11043 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11044 vector
<ceph_tid_t
> tids
;
11045 cancel_flush(fop
, false, &tids
);
11046 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11051 // successfully flushed, can we evict this object?
11052 if (!obc
->obs
.oi
.has_manifest() && !fop
->op
&&
11053 agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
11054 agent_maybe_evict(obc
, true)) {
11055 osd
->logger
->inc(l_osd_tier_clean
);
11056 if (fop
->on_flush
) {
11057 (*(fop
->on_flush
))();
11058 fop
->on_flush
= std::nullopt
;
11060 flush_ops
.erase(oid
);
11064 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
11065 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
11067 // successfully flushed; can we clear the dirty bit?
11068 // try to take the lock manually, since we don't
11070 if (ctx
->lock_manager
.get_lock_type(
11075 dout(20) << __func__
<< " took write lock" << dendl
;
11076 } else if (fop
->op
) {
11077 dout(10) << __func__
<< " waiting on write lock " << fop
->op
<< " "
11078 << fop
->dup_ops
<< dendl
;
11079 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
11080 for (auto op
: fop
->dup_ops
) {
11081 bool locked
= ctx
->lock_manager
.get_lock_type(
11086 ceph_assert(!locked
);
11088 close_op_ctx(ctx
.release());
11089 return -EAGAIN
; // will retry
11091 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
11092 close_op_ctx(ctx
.release());
11093 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11094 vector
<ceph_tid_t
> tids
;
11095 cancel_flush(fop
, false, &tids
);
11096 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11100 if (fop
->on_flush
) {
11101 ctx
->register_on_finish(*(fop
->on_flush
));
11102 fop
->on_flush
= std::nullopt
;
11105 ctx
->at_version
= get_next_version();
11107 ctx
->new_obs
= obc
->obs
;
11108 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
11109 --ctx
->delta_stats
.num_objects_dirty
;
11110 if (fop
->obc
->obs
.oi
.has_manifest()) {
11111 ceph_assert(obc
->obs
.oi
.manifest
.is_chunked());
11112 PGTransaction
* t
= ctx
->op_t
.get();
11113 uint64_t chunks_size
= 0;
11114 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11115 chunks_size
+= p
.second
.length
;
11117 if (ctx
->new_obs
.oi
.is_omap() && pool
.info
.supports_omap()) {
11118 t
->omap_clear(oid
);
11119 ctx
->new_obs
.oi
.clear_omap_digest();
11120 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
11121 ctx
->clean_regions
.mark_omap_dirty();
11123 if (obc
->obs
.oi
.size
== chunks_size
) {
11124 t
->truncate(oid
, 0);
11125 interval_set
<uint64_t> trim
;
11126 trim
.insert(0, ctx
->new_obs
.oi
.size
);
11127 ctx
->modified_ranges
.union_of(trim
);
11128 truncate_update_size_and_usage(ctx
->delta_stats
,
11131 ctx
->clean_regions
.mark_data_region_dirty(0, ctx
->new_obs
.oi
.size
);
11132 ctx
->new_obs
.oi
.new_object();
11133 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11134 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
11137 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11138 dout(20) << __func__
<< " offset: " << p
.second
.offset
11139 << " length: " << p
.second
.length
<< dendl
;
11140 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
); // CLEAN
11145 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
11147 osd
->logger
->inc(l_osd_tier_clean
);
11149 if (!fop
->dup_ops
.empty() || fop
->op
) {
11150 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
11151 list
<OpRequestRef
> ls
;
11153 ls
.push_back(fop
->op
);
11154 ls
.splice(ls
.end(), fop
->dup_ops
);
11158 simple_opc_submit(std::move(ctx
));
11160 flush_ops
.erase(oid
);
11163 osd
->logger
->inc(l_osd_tier_flush
);
11165 osd
->logger
->inc(l_osd_tier_try_flush
);
11167 return -EINPROGRESS
;
11170 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
,
11171 vector
<ceph_tid_t
> *tids
)
11173 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
11174 << fop
->objecter_tid
<< dendl
;
11175 if (fop
->objecter_tid
) {
11176 tids
->push_back(fop
->objecter_tid
);
11177 fop
->objecter_tid
= 0;
11179 if (fop
->io_tids
.size()) {
11180 for (auto &p
: fop
->io_tids
) {
11181 tids
->push_back(p
.second
);
11185 if (fop
->blocking
&& fop
->obc
->is_blocked()) {
11186 fop
->obc
->stop_block();
11187 kick_object_context_blocked(fop
->obc
);
11191 requeue_op(fop
->op
);
11192 requeue_ops(fop
->dup_ops
);
11194 if (fop
->on_flush
) {
11195 (*(fop
->on_flush
))();
11196 fop
->on_flush
= std::nullopt
;
11198 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
11201 void PrimaryLogPG::cancel_flush_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
11203 dout(10) << __func__
<< dendl
;
11204 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
11205 while (p
!= flush_ops
.end()) {
11206 cancel_flush((p
++)->second
, requeue
, tids
);
11210 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
11212 if (!pool
.info
.allow_incomplete_clones())
11214 if (is_missing_object(coid
))
11216 ObjectContextRef obc
= get_object_context(coid
, false);
11217 return obc
&& obc
->obs
.exists
;
11220 // ========================================================================
11224 void PrimaryLogPG::cancel_cls_gather(map
<hobject_t
,CLSGatherOp
>::iterator iter
, bool requeue
,
11225 vector
<ceph_tid_t
> *tids
)
11227 auto &cgop
= iter
->second
;
11228 for (std::vector
<ceph_tid_t
>::iterator p
= cgop
.objecter_tids
.begin(); p
!= cgop
.objecter_tids
.end(); p
++) {
11229 tids
->push_back(*p
);
11230 dout(10) << __func__
<< " " << cgop
.obc
->obs
.oi
.soid
<< " tid " << *p
<< dendl
;
11232 cgop
.objecter_tids
.clear();
11233 close_op_ctx(cgop
.ctx
);
11237 requeue_op(cgop
.op
);
11239 cls_gather_ops
.erase(iter
);
11242 void PrimaryLogPG::cancel_cls_gather_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
11244 dout(10) << __func__
<< dendl
;
11245 map
<hobject_t
,CLSGatherOp
>::iterator p
= cls_gather_ops
.begin();
11246 while (p
!= cls_gather_ops
.end()) {
11247 cancel_cls_gather(p
++, requeue
, tids
);
11251 // ========================================================================
11254 class C_OSD_RepopCommit
: public Context
{
11255 PrimaryLogPGRef pg
;
11256 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
11258 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
11259 : pg(pg
), repop(repop
) {}
11260 void finish(int) override
{
11261 pg
->repop_all_committed(repop
.get());
11265 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
11267 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
11269 repop
->all_committed
= true;
11270 if (!repop
->rep_aborted
) {
11271 if (repop
->v
!= eversion_t()) {
11272 recovery_state
.complete_write(repop
->v
, repop
->pg_local_last_complete
);
11278 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
11280 dout(10) << "op_applied version " << applied_version
<< dendl
;
11281 ceph_assert(applied_version
!= eversion_t());
11282 ceph_assert(applied_version
<= info
.last_update
);
11283 recovery_state
.local_write_applied(applied_version
);
11285 if (is_primary() && m_scrubber
) {
11286 // if there's a scrub operation waiting for the selected chunk to be fully updated -
11287 // allow it to continue
11288 m_scrubber
->on_applied_when_primary(recovery_state
.get_last_update_applied());
11292 void PrimaryLogPG::eval_repop(RepGather
*repop
)
11296 span
= tracing::osd::tracer
.add_span(__func__
, repop
->op
->osd_parent_span
);
11298 dout(10) << "eval_repop " << *repop
11299 << (repop
->op
&& repop
->op
->get_req
<MOSDOp
>() ? "" : " (no op)") << dendl
;
11302 if (repop
->all_committed
) {
11303 dout(10) << " commit: " << *repop
<< dendl
;
11304 for (auto p
= repop
->on_committed
.begin();
11305 p
!= repop
->on_committed
.end();
11306 repop
->on_committed
.erase(p
++)) {
11309 // send dup commits, in order
11310 auto it
= waiting_for_ondisk
.find(repop
->v
);
11311 if (it
!= waiting_for_ondisk
.end()) {
11312 ceph_assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
11313 for (auto& i
: it
->second
) {
11314 int return_code
= repop
->r
;
11315 if (return_code
>= 0) {
11316 return_code
= std::get
<2>(i
);
11318 osd
->reply_op_error(std::get
<0>(i
), return_code
, repop
->v
,
11319 std::get
<1>(i
), std::get
<3>(i
));
11321 waiting_for_ondisk
.erase(it
);
11324 publish_stats_to_osd();
11326 dout(10) << " removing " << *repop
<< dendl
;
11327 ceph_assert(!repop_queue
.empty());
11328 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
11329 if (repop_queue
.front() == repop
) {
11330 RepGather
*to_remove
= nullptr;
11331 while (!repop_queue
.empty() &&
11332 (to_remove
= repop_queue
.front())->all_committed
) {
11333 repop_queue
.pop_front();
11334 for (auto p
= to_remove
->on_success
.begin();
11335 p
!= to_remove
->on_success
.end();
11336 to_remove
->on_success
.erase(p
++)) {
11339 remove_repop(to_remove
);
11345 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
11348 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
11349 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
11355 span
= tracing::osd::tracer
.add_span(__func__
, ctx
->op
->osd_parent_span
);
11358 repop
->v
= ctx
->at_version
;
11360 ctx
->op_t
->add_obc(ctx
->obc
);
11361 if (ctx
->clone_obc
) {
11362 ctx
->op_t
->add_obc(ctx
->clone_obc
);
11364 if (ctx
->head_obc
) {
11365 ctx
->op_t
->add_obc(ctx
->head_obc
);
11368 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
11369 if (!(ctx
->log
.empty())) {
11370 ceph_assert(ctx
->at_version
>= projected_last_update
);
11371 projected_last_update
= ctx
->at_version
;
11373 for (auto &&entry
: ctx
->log
) {
11374 projected_log
.add(entry
);
11377 recovery_state
.pre_submit_op(
11381 pgbackend
->submit_transaction(
11385 std::move(ctx
->op_t
),
11386 recovery_state
.get_pg_trim_to(),
11387 recovery_state
.get_min_last_complete_ondisk(),
11388 std::move(ctx
->log
),
11389 ctx
->updated_hset_history
,
11396 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
11398 ceph_tid_t rep_tid
)
11401 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
11403 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
11405 RepGather
*repop
= new RepGather(
11406 ctx
, rep_tid
, info
.last_complete
);
11408 repop
->start
= ceph_clock_now();
11410 repop_queue
.push_back(&repop
->queue_item
);
11413 osd
->logger
->inc(l_osd_op_wip
);
11415 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11419 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
11420 eversion_t version
,
11422 ObcLockManager
&&manager
,
11424 std::optional
<std::function
<void(void)> > &&on_complete
)
11426 RepGather
*repop
= new RepGather(
11427 std::move(manager
),
11429 std::move(on_complete
),
11431 info
.last_complete
,
11433 repop
->v
= version
;
11435 repop
->start
= ceph_clock_now();
11437 repop_queue
.push_back(&repop
->queue_item
);
11439 osd
->logger
->inc(l_osd_op_wip
);
11441 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11442 return boost::intrusive_ptr
<RepGather
>(repop
);
11445 void PrimaryLogPG::remove_repop(RepGather
*repop
)
11447 dout(20) << __func__
<< " " << *repop
<< dendl
;
11449 for (auto p
= repop
->on_finish
.begin();
11450 p
!= repop
->on_finish
.end();
11451 repop
->on_finish
.erase(p
++)) {
11455 release_object_locks(
11456 repop
->lock_manager
);
11459 osd
->logger
->dec(l_osd_op_wip
);
11462 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
11464 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
11465 ceph_tid_t rep_tid
= osd
->get_tid();
11466 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
11467 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
11468 ctx
->op_t
.reset(new PGTransaction());
11469 ctx
->mtime
= ceph_clock_now();
11473 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
11475 RepGather
*repop
= new_repop(ctx
.get(), ctx
->reqid
.tid
);
11476 dout(20) << __func__
<< " " << repop
<< dendl
;
11477 issue_repop(repop
, ctx
.get());
11479 recovery_state
.update_trim_to();
11484 void PrimaryLogPG::submit_log_entries(
11485 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
11486 ObcLockManager
&&manager
,
11487 std::optional
<std::function
<void(void)> > &&_on_complete
,
11491 dout(10) << __func__
<< " " << entries
<< dendl
;
11492 ceph_assert(is_primary());
11494 eversion_t version
;
11495 if (!entries
.empty()) {
11496 ceph_assert(entries
.rbegin()->version
>= projected_last_update
);
11497 version
= projected_last_update
= entries
.rbegin()->version
;
11500 boost::intrusive_ptr
<RepGather
> repop
;
11501 std::optional
<std::function
<void(void)> > on_complete
;
11502 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11506 std::move(manager
),
11508 std::move(_on_complete
));
11510 on_complete
= std::move(_on_complete
);
11513 pgbackend
->call_write_ordered(
11514 [this, entries
, repop
, on_complete
]() {
11515 ObjectStore::Transaction t
;
11516 eversion_t old_last_update
= info
.last_update
;
11517 recovery_state
.merge_new_log_entries(
11518 entries
, t
, recovery_state
.get_pg_trim_to(),
11519 recovery_state
.get_min_last_complete_ondisk());
11521 set
<pg_shard_t
> waiting_on
;
11522 for (set
<pg_shard_t
>::const_iterator i
= get_acting_recovery_backfill().begin();
11523 i
!= get_acting_recovery_backfill().end();
11525 pg_shard_t
peer(*i
);
11526 if (peer
== pg_whoami
) continue;
11527 ceph_assert(recovery_state
.get_peer_missing().count(peer
));
11528 ceph_assert(recovery_state
.has_peer_info(peer
));
11529 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11530 ceph_assert(repop
);
11531 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
11533 spg_t(info
.pgid
.pgid
, i
->shard
),
11535 get_osdmap_epoch(),
11536 get_last_peering_reset(),
11538 recovery_state
.get_pg_trim_to(),
11539 recovery_state
.get_min_last_complete_ondisk());
11540 osd
->send_message_osd_cluster(
11541 peer
.osd
, m
, get_osdmap_epoch());
11542 waiting_on
.insert(peer
);
11544 MOSDPGLog
*m
= new MOSDPGLog(
11545 peer
.shard
, pg_whoami
.shard
,
11546 info
.last_update
.epoch
,
11547 info
, get_last_peering_reset());
11548 m
->log
.log
= entries
;
11549 m
->log
.tail
= old_last_update
;
11550 m
->log
.head
= info
.last_update
;
11551 osd
->send_message_osd_cluster(
11552 peer
.osd
, m
, get_osdmap_epoch());
11555 ceph_tid_t rep_tid
= repop
->rep_tid
;
11556 waiting_on
.insert(pg_whoami
);
11557 log_entry_update_waiting_on
.insert(
11560 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
11562 struct OnComplete
: public Context
{
11563 PrimaryLogPGRef pg
;
11564 ceph_tid_t rep_tid
;
11567 PrimaryLogPGRef pg
,
11568 ceph_tid_t rep_tid
,
11570 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
11571 void finish(int) override
{
11572 std::scoped_lock l
{*pg
};
11573 if (!pg
->pg_has_reset_since(epoch
)) {
11574 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
11575 ceph_assert(it
!= pg
->log_entry_update_waiting_on
.end());
11576 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
11577 ceph_assert(it2
!= it
->second
.waiting_on
.end());
11578 it
->second
.waiting_on
.erase(it2
);
11579 if (it
->second
.waiting_on
.empty()) {
11580 pg
->repop_all_committed(it
->second
.repop
.get());
11581 pg
->log_entry_update_waiting_on
.erase(it
);
11586 t
.register_on_commit(
11587 new OnComplete
{this, rep_tid
, get_osdmap_epoch()});
11588 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
11589 ceph_assert(r
== 0);
11590 op_applied(info
.last_update
);
11593 recovery_state
.update_trim_to();
11596 void PrimaryLogPG::cancel_log_updates()
11598 // get rid of all the LogUpdateCtx so their references to repops are
11600 log_entry_update_waiting_on
.clear();
11603 // -------------------------------------------------------
11605 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> *ls
)
11607 std::scoped_lock l
{*this};
11608 pair
<hobject_t
, ObjectContextRef
> i
;
11609 while (object_contexts
.get_next(i
.first
, &i
)) {
11610 ObjectContextRef
obc(i
.second
);
11611 get_obc_watchers(obc
, *ls
);
11615 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
11617 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11618 obc
->watchers
.begin();
11619 j
!= obc
->watchers
.end();
11621 obj_watch_item_t owi
;
11623 owi
.obj
= obc
->obs
.oi
.soid
;
11624 owi
.wi
.addr
= j
->second
->get_peer_addr();
11625 owi
.wi
.name
= j
->second
->get_entity();
11626 owi
.wi
.cookie
= j
->second
->get_cookie();
11627 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
11629 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
11630 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
11632 pg_watchers
.push_back(owi
);
11636 void PrimaryLogPG::check_blocklisted_watchers()
11638 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl
;
11639 pair
<hobject_t
, ObjectContextRef
> i
;
11640 while (object_contexts
.get_next(i
.first
, &i
))
11641 check_blocklisted_obc_watchers(i
.second
);
11644 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc
)
11646 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
11647 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
11648 obc
->watchers
.begin();
11649 k
!= obc
->watchers
.end();
11651 //Advance iterator now so handle_watch_timeout() can erase element
11652 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
11653 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
11654 entity_addr_t ea
= j
->second
->get_peer_addr();
11655 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
11656 if (get_osdmap()->is_blocklisted(ea
)) {
11657 dout(10) << "watch: Found blocklisted watcher for " << ea
<< dendl
;
11658 ceph_assert(j
->second
->get_pg() == this);
11659 j
->second
->unregister_cb();
11660 handle_watch_timeout(j
->second
);
11665 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
11667 ceph_assert(is_primary() && is_active());
11668 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(obc
->obs
.oi
.soid
);
11669 ceph_assert((recovering
.count(obc
->obs
.oi
.soid
) ||
11670 !is_missing_object(obc
->obs
.oi
.soid
)) ||
11671 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() && // or this is a revert... see recover_primary()
11672 it_objects
->second
->op
==
11673 pg_log_entry_t::LOST_REVERT
&&
11674 it_objects
->second
->reverting_to
==
11675 obc
->obs
.oi
.version
));
11677 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
11678 ceph_assert(obc
->watchers
.empty());
11679 // populate unconnected_watchers
11680 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
11681 obc
->obs
.oi
.watchers
.begin();
11682 p
!= obc
->obs
.oi
.watchers
.end();
11684 utime_t expire
= info
.stats
.last_became_active
;
11685 expire
+= p
->second
.timeout_seconds
;
11686 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
11688 Watch::makeWatchRef(
11689 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
11690 p
->first
.second
, p
->second
.addr
));
11691 watch
->disconnect();
11692 obc
->watchers
.insert(
11694 make_pair(p
->first
.first
, p
->first
.second
),
11697 // Look for watchers from blocklisted clients and drop
11698 check_blocklisted_obc_watchers(obc
);
11701 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
11703 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
11704 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
11706 if (!is_active()) {
11707 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
11710 if (!obc
->obs
.exists
) {
11711 dout(10) << __func__
<< " object " << obc
->obs
.oi
.soid
<< " dne" << dendl
;
11714 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
11715 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
11716 watch
->get_delayed_cb()
11718 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11719 << obc
->obs
.oi
.soid
11724 if (m_scrubber
->write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
11725 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11726 << obc
->obs
.oi
.soid
11728 m_scrubber
->add_callback(
11729 watch
->get_delayed_cb() // This callback!
11734 OpContextUPtr ctx
= simple_opc_create(obc
);
11735 ctx
->at_version
= get_next_version();
11737 object_info_t
& oi
= ctx
->new_obs
.oi
;
11738 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
11739 watch
->get_entity()));
11741 list
<watch_disconnect_t
> watch_disconnects
= {
11742 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
11744 ctx
->register_on_success(
11745 [this, obc
, watch_disconnects
]() {
11746 complete_disconnect_watches(obc
, watch_disconnects
);
11750 PGTransaction
*t
= ctx
->op_t
.get();
11751 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
11755 osd_reqid_t(), ctx
->mtime
, 0));
11757 oi
.prior_version
= obc
->obs
.oi
.version
;
11758 oi
.version
= ctx
->at_version
;
11760 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11761 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
11763 // apply new object state.
11764 ctx
->obc
->obs
= ctx
->new_obs
;
11766 // no ctx->delta_stats
11767 simple_opc_submit(std::move(ctx
));
11770 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
11771 SnapSetContext
*ssc
)
11773 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
11774 ceph_assert(obc
->destructor_callback
== NULL
);
11775 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11777 obc
->obs
.exists
= false;
11780 register_snapset_context(ssc
);
11781 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
11783 populate_obc_watchers(obc
);
11787 ObjectContextRef
PrimaryLogPG::get_object_context(
11788 const hobject_t
& soid
,
11790 const map
<string
, bufferlist
, less
<>> *attrs
)
11792 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(soid
);
11794 attrs
|| !recovery_state
.get_pg_log().get_missing().is_missing(soid
) ||
11795 // or this is a revert... see recover_primary()
11796 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() &&
11797 it_objects
->second
->op
==
11798 pg_log_entry_t::LOST_REVERT
));
11799 ObjectContextRef obc
= object_contexts
.lookup(soid
);
11800 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
11802 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
11803 dout(10) << __func__
<< ": found obc in cache: " << obc
11806 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
11810 auto it_oi
= attrs
->find(OI_ATTR
);
11811 ceph_assert(it_oi
!= attrs
->end());
11812 bv
= it_oi
->second
;
11814 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
11817 dout(10) << __func__
<< ": no obc for soid "
11818 << soid
<< " and !can_create"
11820 return ObjectContextRef(); // -ENOENT!
11823 dout(10) << __func__
<< ": no obc for soid "
11824 << soid
<< " but can_create"
11827 object_info_t
oi(soid
);
11828 SnapSetContext
*ssc
= get_snapset_context(
11829 soid
, true, 0, false);
11831 obc
= create_object_context(oi
, ssc
);
11832 dout(10) << __func__
<< ": " << obc
<< " " << soid
11833 << " " << obc
->rwstate
11834 << " oi: " << obc
->obs
.oi
11835 << " ssc: " << obc
->ssc
11836 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11843 bufferlist::const_iterator bliter
= bv
.begin();
11844 decode(oi
, bliter
);
11846 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
11847 return ObjectContextRef(); // -ENOENT!
11850 ceph_assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
11852 obc
= object_contexts
.lookup_or_create(oi
.soid
);
11853 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11855 obc
->obs
.exists
= true;
11857 obc
->ssc
= get_snapset_context(
11859 soid
.has_snapset() ? attrs
: 0);
11861 if (is_primary() && is_active())
11862 populate_obc_watchers(obc
);
11864 if (pool
.info
.is_erasure()) {
11866 obc
->attr_cache
= *attrs
;
11868 int r
= pgbackend
->objects_get_attrs(
11871 ceph_assert(r
== 0);
11875 dout(10) << __func__
<< ": creating obc from disk: " << obc
11879 // XXX: Caller doesn't expect this
11880 if (obc
->ssc
== NULL
) {
11881 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
11882 return ObjectContextRef(); // -ENOENT!
11885 dout(10) << __func__
<< ": " << obc
<< " " << soid
11886 << " " << obc
->rwstate
11887 << " oi: " << obc
->obs
.oi
11888 << " exists: " << (int)obc
->obs
.exists
11889 << " ssc: " << obc
->ssc
11890 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11894 void PrimaryLogPG::context_registry_on_change()
11896 pair
<hobject_t
, ObjectContextRef
> i
;
11897 while (object_contexts
.get_next(i
.first
, &i
)) {
11898 ObjectContextRef
obc(i
.second
);
11900 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11901 obc
->watchers
.begin();
11902 j
!= obc
->watchers
.end();
11903 obc
->watchers
.erase(j
++)) {
11904 j
->second
->discard();
11912 * If we return an error, and set *pmissing, then promoting that
11915 * If we return -EAGAIN, we will always set *pmissing to the missing
11916 * object to wait for.
11918 * If we return an error but do not set *pmissing, then we know the
11919 * object does not exist.
11921 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
11922 ObjectContextRef
*pobc
,
11924 bool map_snapid_to_clone
,
11925 hobject_t
*pmissing
)
11928 ceph_assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
11930 if (oid
.snap
== CEPH_NOSNAP
) {
11931 ObjectContextRef obc
= get_object_context(oid
, can_create
);
11937 dout(10) << __func__
<< " " << oid
11938 << " @" << oid
.snap
11939 << " oi=" << obc
->obs
.oi
11948 hobject_t head
= oid
.get_head();
11949 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
11950 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
11951 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
11953 *pmissing
= head
; // start by getting the head
11955 put_snapset_context(ssc
);
11959 if (map_snapid_to_clone
) {
11960 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11961 << " snapset " << ssc
->snapset
11962 << " map_snapid_to_clone=true" << dendl
;
11963 if (oid
.snap
> ssc
->snapset
.seq
) {
11964 // already must be readable
11965 ObjectContextRef obc
= get_object_context(head
, false);
11966 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11967 << " snapset " << ssc
->snapset
11968 << " maps to head" << dendl
;
11970 put_snapset_context(ssc
);
11971 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
11973 vector
<snapid_t
>::const_iterator citer
= std::find(
11974 ssc
->snapset
.clones
.begin(),
11975 ssc
->snapset
.clones
.end(),
11977 if (citer
== ssc
->snapset
.clones
.end()) {
11978 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11979 << " snapset " << ssc
->snapset
11980 << " maps to nothing" << dendl
;
11981 put_snapset_context(ssc
);
11985 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11986 << " snapset " << ssc
->snapset
11987 << " maps to " << oid
<< dendl
;
11989 if (recovery_state
.get_pg_log().get_missing().is_missing(oid
)) {
11990 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11991 << " snapset " << ssc
->snapset
11992 << " " << oid
<< " is missing" << dendl
;
11995 put_snapset_context(ssc
);
11999 ObjectContextRef obc
= get_object_context(oid
, false);
12000 if (!obc
|| !obc
->obs
.exists
) {
12001 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12002 << " snapset " << ssc
->snapset
12003 << " " << oid
<< " is not present" << dendl
;
12006 put_snapset_context(ssc
);
12009 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12010 << " snapset " << ssc
->snapset
12011 << " " << oid
<< " HIT" << dendl
;
12013 put_snapset_context(ssc
);
12016 ceph_abort(); //unreachable
12019 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12020 << " snapset " << ssc
->snapset
<< dendl
;
12023 if (oid
.snap
> ssc
->snapset
.seq
) {
12024 ObjectContextRef obc
= get_object_context(head
, false);
12025 dout(10) << __func__
<< " " << head
12026 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
12027 << " -- HIT " << obc
->obs
12032 ceph_assert(ssc
== obc
->ssc
);
12033 put_snapset_context(ssc
);
12039 // which clone would it be?
12041 while (k
< ssc
->snapset
.clones
.size() &&
12042 ssc
->snapset
.clones
[k
] < oid
.snap
)
12044 if (k
== ssc
->snapset
.clones
.size()) {
12045 dout(10) << __func__
<< " no clones with last >= oid.snap "
12046 << oid
.snap
<< " -- DNE" << dendl
;
12047 put_snapset_context(ssc
);
12050 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
12051 info
.pgid
.pool(), oid
.get_namespace());
12053 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
12054 dout(20) << __func__
<< " " << soid
<< " missing, try again later"
12058 put_snapset_context(ssc
);
12062 ObjectContextRef obc
= get_object_context(soid
, false);
12063 if (!obc
|| !obc
->obs
.exists
) {
12066 put_snapset_context(ssc
);
12067 if (is_primary()) {
12068 if (is_degraded_or_backfilling_object(soid
)) {
12069 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
12071 } else if (is_degraded_on_async_recovery_target(soid
)) {
12072 dout(20) << __func__
<< " clone is recovering " << soid
<< dendl
;
12075 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
12079 dout(20) << __func__
<< " replica missing clone" << soid
<< dendl
;
12087 ceph_assert(obc
->ssc
== ssc
);
12088 put_snapset_context(ssc
);
12093 dout(20) << __func__
<< " " << soid
12094 << " snapset " << obc
->ssc
->snapset
12096 snapid_t first
, last
;
12097 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
12098 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
12099 if (p
->second
.empty()) {
12100 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
12101 ceph_assert(!cct
->_conf
->osd_debug_verify_snaps
);
12104 if (std::find(p
->second
.begin(), p
->second
.end(), oid
.snap
) ==
12106 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
12107 << " does not contain " << oid
.snap
<< " -- DNE" << dendl
;
12110 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), oid
.snap
)) {
12111 dout(20) << __func__
<< " " << soid
<< " snap " << oid
.snap
12112 << " in removed_snaps_queue" << " -- DNE" << dendl
;
12115 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
12116 << " contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
12121 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
12124 put_snapset_context(obc
->ssc
);
12127 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
12129 object_info_t
& oi
= obc
->obs
.oi
;
12131 dout(10) << __func__
<< " " << oi
.soid
<< dendl
;
12132 ceph_assert(!oi
.soid
.is_snapdir());
12134 object_stat_sum_t stat
;
12135 stat
.num_objects
++;
12137 stat
.num_objects_dirty
++;
12138 if (oi
.is_whiteout())
12139 stat
.num_whiteouts
++;
12141 stat
.num_objects_omap
++;
12142 if (oi
.is_cache_pinned())
12143 stat
.num_objects_pinned
++;
12144 if (oi
.has_manifest())
12145 stat
.num_objects_manifest
++;
12147 if (oi
.soid
.is_snap()) {
12148 stat
.num_object_clones
++;
12151 obc
->ssc
= get_snapset_context(oi
.soid
, false);
12152 ceph_assert(obc
->ssc
);
12153 stat
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(oi
.soid
.snap
);
12155 stat
.num_bytes
+= oi
.size
;
12159 pgstat
->stats
.sum
.add(stat
);
12162 void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t
&soid
) {
12163 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
12164 if (p
!= waiting_for_blocked_object
.end()) {
12165 list
<OpRequestRef
>& ls
= p
->second
;
12166 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
12168 waiting_for_blocked_object
.erase(p
);
12172 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
12174 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
12175 if (obc
->is_blocked()) {
12176 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
12180 requeue_op_blocked_by_object(soid
);
12182 map
<hobject_t
, ObjectContextRef
>::iterator i
=
12183 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
12184 if (i
!= objects_blocked_on_snap_promotion
.end()) {
12185 ceph_assert(i
->second
== obc
);
12186 ObjectContextRef head_obc
= get_object_context(i
->first
, false);
12187 head_obc
->stop_block();
12188 // kick blocked ops (head)
12189 requeue_op_blocked_by_object(i
->first
);
12190 objects_blocked_on_snap_promotion
.erase(i
);
12193 if (obc
->requeue_scrub_on_unblock
) {
12195 obc
->requeue_scrub_on_unblock
= false;
12197 dout(20) << __func__
<< " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl
;
12199 // only requeue if we are still active: we may be unblocking
12200 // because we are resetting for a new peering interval
12202 osd
->queue_scrub_unblocking(this, is_scrub_blocking_ops());
12207 SnapSetContext
*PrimaryLogPG::get_snapset_context(
12208 const hobject_t
& oid
,
12210 const map
<string
, bufferlist
, less
<>> *attrs
,
12213 std::lock_guard
l(snapset_contexts_lock
);
12214 SnapSetContext
*ssc
;
12215 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
12216 oid
.get_snapdir());
12217 if (p
!= snapset_contexts
.end()) {
12218 if (can_create
|| p
->second
->exists
) {
12227 if (!(oid
.is_head() && !oid_existed
)) {
12228 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
12230 if (r
< 0 && !can_create
)
12233 auto it_ss
= attrs
->find(SS_ATTR
);
12234 ceph_assert(it_ss
!= attrs
->end());
12235 bv
= it_ss
->second
;
12237 ssc
= new SnapSetContext(oid
.get_snapdir());
12238 _register_snapset_context(ssc
);
12240 bufferlist::const_iterator bvp
= bv
.begin();
12242 ssc
->snapset
.decode(bvp
);
12243 } catch (const ceph::buffer::error
& e
) {
12244 dout(0) << __func__
<< " Can't decode snapset: " << e
.what() << dendl
;
12247 ssc
->exists
= true;
12249 ssc
->exists
= false;
12257 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
12259 std::lock_guard
l(snapset_contexts_lock
);
12261 if (ssc
->ref
== 0) {
12262 if (ssc
->registered
)
12263 snapset_contexts
.erase(ssc
->oid
);
12270 * NONE - didn't pull anything
12271 * YES - pulled what the caller wanted
12272 * HEAD - needed to pull head first
12274 enum { PULL_NONE
, PULL_HEAD
, PULL_YES
};
12276 int PrimaryLogPG::recover_missing(
12277 const hobject_t
&soid
, eversion_t v
,
12279 PGBackend::RecoveryHandle
*h
)
12281 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
12282 dout(7) << __func__
<< " " << soid
12284 << " but it is unfound" << dendl
;
12288 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
12289 start_recovery_op(soid
);
12290 ceph_assert(!recovering
.count(soid
));
12291 recovering
.insert(make_pair(soid
, ObjectContextRef()));
12292 epoch_t cur_epoch
= get_osdmap_epoch();
12293 remove_missing_object(soid
, v
, new LambdaContext(
12295 std::scoped_lock locker
{*this};
12296 if (!pg_has_reset_since(cur_epoch
)) {
12297 bool object_missing
= false;
12298 for (const auto& shard
: get_acting_recovery_backfill()) {
12299 if (shard
== pg_whoami
)
12301 if (recovery_state
.get_peer_missing(shard
).is_missing(soid
)) {
12302 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
12303 object_missing
= true;
12307 if (!object_missing
) {
12308 object_stat_sum_t stat_diff
;
12309 stat_diff
.num_objects_recovered
= 1;
12310 if (scrub_after_recovery
)
12311 stat_diff
.num_objects_repaired
= 1;
12312 on_global_recover(soid
, stat_diff
, true);
12314 auto recovery_handle
= pgbackend
->open_recovery_op();
12315 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
12316 pgbackend
->run_recovery_op(recovery_handle
, priority
);
12323 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
12324 ObjectContextRef obc
;
12325 ObjectContextRef head_obc
;
12326 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
12327 // do we have the head?
12328 hobject_t head
= soid
.get_head();
12329 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
12330 if (recovering
.count(head
)) {
12331 dout(10) << " missing but already recovering head " << head
<< dendl
;
12334 int r
= recover_missing(
12335 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
, priority
,
12337 if (r
!= PULL_NONE
)
12342 head_obc
= get_object_context(
12346 ceph_assert(head_obc
);
12348 start_recovery_op(soid
);
12349 ceph_assert(!recovering
.count(soid
));
12350 recovering
.insert(make_pair(soid
, obc
));
12351 int r
= pgbackend
->recover_object(
12357 // This is only a pull which shouldn't return an error
12358 ceph_assert(r
>= 0);
12362 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
12363 eversion_t v
, Context
*on_complete
)
12365 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
12366 ceph_assert(on_complete
!= nullptr);
12368 ObjectStore::Transaction t
;
12369 remove_snap_mapped_object(t
, soid
);
12371 ObjectRecoveryInfo recovery_info
;
12372 recovery_info
.soid
= soid
;
12373 recovery_info
.version
= v
;
12375 epoch_t cur_epoch
= get_osdmap_epoch();
12376 t
.register_on_complete(new LambdaContext(
12378 std::unique_lock locker
{*this};
12379 if (!pg_has_reset_since(cur_epoch
)) {
12380 ObjectStore::Transaction t2
;
12381 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
12382 t2
.register_on_complete(on_complete
);
12383 int r
= osd
->store
->queue_transaction(ch
, std::move(t2
), nullptr);
12384 ceph_assert(r
== 0);
12388 on_complete
->complete(-EAGAIN
);
12391 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
12392 ceph_assert(r
== 0);
12395 void PrimaryLogPG::finish_degraded_object(const hobject_t oid
)
12397 dout(10) << __func__
<< " " << oid
<< dendl
;
12398 if (callbacks_for_degraded_object
.count(oid
)) {
12399 list
<Context
*> contexts
;
12400 contexts
.swap(callbacks_for_degraded_object
[oid
]);
12401 callbacks_for_degraded_object
.erase(oid
);
12402 for (list
<Context
*>::iterator i
= contexts
.begin();
12403 i
!= contexts
.end();
12408 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
12410 if (i
!= objects_blocked_on_degraded_snap
.end() &&
12411 i
->second
== oid
.snap
)
12412 objects_blocked_on_degraded_snap
.erase(i
);
12415 void PrimaryLogPG::_committed_pushed_object(
12416 epoch_t epoch
, eversion_t last_complete
)
12418 std::scoped_lock locker
{*this};
12419 if (!pg_has_reset_since(epoch
)) {
12420 recovery_state
.recovery_committed_to(last_complete
);
12422 dout(10) << __func__
12423 << " pg has changed, not touching last_complete_ondisk" << dendl
;
12427 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
12429 dout(20) << __func__
<< dendl
;
12431 dout(20) << "obc = " << *obc
<< dendl
;
12433 ceph_assert(active_pushes
>= 1);
12436 // requeue an active chunky scrub waiting on recovery ops
12437 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12438 is_scrub_active()) {
12440 osd
->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12444 void PrimaryLogPG::_applied_recovered_object_replica()
12446 dout(20) << __func__
<< dendl
;
12447 ceph_assert(active_pushes
>= 1);
12450 // requeue an active scrub waiting on recovery ops
12451 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12452 is_scrub_active()) {
12454 osd
->queue_scrub_replica_pushes(this, m_scrubber
->replica_op_priority());
12458 void PrimaryLogPG::on_failed_pull(
12459 const set
<pg_shard_t
> &from
,
12460 const hobject_t
&soid
,
12461 const eversion_t
&v
)
12463 dout(20) << __func__
<< ": " << soid
<< dendl
;
12464 ceph_assert(recovering
.count(soid
));
12465 auto obc
= recovering
[soid
];
12467 list
<OpRequestRef
> blocked_ops
;
12468 obc
->drop_recovery_read(&blocked_ops
);
12469 requeue_ops(blocked_ops
);
12471 recovering
.erase(soid
);
12472 for (auto&& i
: from
) {
12473 if (i
!= pg_whoami
) { // we'll get it below in primary_error
12474 recovery_state
.force_object_missing(i
, soid
, v
);
12478 dout(0) << __func__
<< " " << soid
<< " from shard " << from
12479 << ", reps on " << recovery_state
.get_missing_loc().get_locations(soid
)
12480 << " unfound? " << recovery_state
.get_missing_loc().is_unfound(soid
)
12482 finish_recovery_op(soid
); // close out this attempt,
12483 finish_degraded_object(soid
);
12485 if (from
.count(pg_whoami
)) {
12486 dout(0) << " primary missing oid " << soid
<< " version " << v
<< dendl
;
12487 primary_error(soid
, v
);
12488 backfills_in_flight
.erase(soid
);
12492 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
12495 pg_missing_item pmi
;
12496 bool is_missing
= recovery_state
.get_pg_log().get_missing().is_missing(oid
, &pmi
);
12497 ceph_assert(is_missing
);
12499 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
12501 ceph_assert(!get_acting_recovery_backfill().empty());
12502 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
12503 i
!= get_acting_recovery_backfill().end();
12505 if (*i
== get_primary()) continue;
12506 pg_shard_t peer
= *i
;
12507 if (!recovery_state
.get_peer_missing(peer
).is_missing(oid
)) {
12510 eversion_t h
= recovery_state
.get_peer_missing(peer
).get_items().at(oid
).have
;
12511 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
12516 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
12520 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
12522 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
12524 ceph_assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
12525 ObjectStore::Transaction t
;
12526 std::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
12527 if (m
->pg_trim_to
!= eversion_t())
12528 op_trim_to
= m
->pg_trim_to
;
12529 if (m
->pg_roll_forward_to
!= eversion_t())
12530 op_roll_forward_to
= m
->pg_roll_forward_to
;
12532 dout(20) << __func__
12533 << " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
12535 recovery_state
.append_log_entries_update_missing(
12536 m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
12537 eversion_t new_lcod
= info
.last_complete
;
12539 Context
*complete
= new LambdaContext(
12541 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
12543 std::scoped_lock locker
{*this};
12544 if (!pg_has_reset_since(msg
->get_epoch())) {
12545 update_last_complete_ondisk(new_lcod
);
12546 MOSDPGUpdateLogMissingReply
*reply
=
12547 new MOSDPGUpdateLogMissingReply(
12548 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
12554 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
12555 msg
->get_connection()->send_message(reply
);
12559 if (get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
12560 t
.register_on_commit(complete
);
12562 /* Hack to work around the fact that ReplicatedBackend sends
12563 * ack+commit if commit happens first
12565 * This behavior is no longer necessary, but we preserve it so old
12566 * primaries can keep their repops in order */
12567 if (pool
.info
.is_erasure()) {
12568 t
.register_on_complete(complete
);
12570 t
.register_on_commit(complete
);
12573 int tr
= osd
->store
->queue_transaction(
12577 ceph_assert(tr
== 0);
12578 op_applied(info
.last_update
);
// Handle a replica's ack for a log-update-missing message: drop the replying
// shard from the waiting set for this tid, record its last_complete_ondisk,
// and complete the repop once every shard has replied.
// NOTE(review): this extract is fragmented — interior lines of the original
// function are elided; code left byte-identical, comments only.
12581 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
12583 const MOSDPGUpdateLogMissingReply
*m
=
12584 static_cast<const MOSDPGUpdateLogMissingReply
*>(
12586 dout(20) << __func__
<< " got reply from "
12587 << m
->get_from() << dendl
;
// Look up the in-flight update by the message's transaction id.
12589 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
12590 if (it
!= log_entry_update_waiting_on
.end()) {
12591 if (it
->second
.waiting_on
.count(m
->get_from())) {
12592 it
->second
.waiting_on
.erase(m
->get_from());
// A non-null last_complete_ondisk from the replica updates our peer tracking.
12593 if (m
->last_complete_ondisk
!= eversion_t()) {
12594 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
12598 << info
.pgid
<< " got reply "
12599 << *m
<< " from shard we are not waiting for "
// All shards have replied: commit the repop and forget the tid.
12603 if (it
->second
.waiting_on
.empty()) {
12604 repop_all_committed(it
->second
.repop
.get());
12605 log_entry_update_waiting_on
.erase(it
);
12609 << info
.pgid
<< " got reply "
12610 << *m
<< " on unknown tid " << m
->get_tid();
12614 /* Mark all unfound objects as lost.
// Mark every unfound object in this PG as lost, per the requested policy
// (LOST_REVERT / LOST_DELETE / LOST_MARK), appending the corresponding log
// entries and submitting them; on_finish(0, msg, bl) reports the result.
// NOTE(review): this extract is fragmented — interior lines (braces, some
// statements) are elided; code left byte-identical, comments only.
12616 void PrimaryLogPG::mark_all_unfound_lost(
12618 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
12620 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
12621 list
<hobject_t
> oids
;
12623 dout(30) << __func__
<< ": log before:\n";
12624 recovery_state
.get_pg_log().get_log().print(*_dout
);
12627 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
12629 utime_t mtime
= ceph_clock_now();
// Iterate over everything that still needs recovery; only unfound
// objects are acted on below.
12630 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
12631 recovery_state
.get_missing_loc().get_needs_recovery().begin();
12632 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
12633 recovery_state
.get_missing_loc().get_needs_recovery().end();
12635 ObcLockManager manager
;
// New version for the lost-marking entries; epoch pinned to current map.
12636 eversion_t v
= get_next_version();
12637 v
.epoch
= get_osdmap_epoch();
12638 uint64_t num_unfound
= recovery_state
.get_missing_loc().num_unfound();
12639 while (m
!= mend
) {
12640 const hobject_t
&oid(m
->first
);
12641 if (!recovery_state
.get_missing_loc().is_unfound(oid
)) {
12642 // We only care about unfound objects
12647 ObjectContextRef obc
;
12651 case pg_log_entry_t::LOST_MARK
:
12652 ceph_abort_msg("actually, not implemented yet!");
12655 case pg_log_entry_t::LOST_REVERT
:
// Revert to the newest version still available somewhere, if any.
12656 prev
= pick_newest_available(oid
);
12657 if (prev
> eversion_t()) {
12660 pg_log_entry_t::LOST_REVERT
, oid
, v
,
12661 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
12662 e
.reverting_to
= prev
;
12663 e
.mark_unrollbackable();
12664 log_entries
.push_back(e
);
12665 dout(10) << e
<< dendl
;
12667 // we are now missing the new version; recovery code will sort it out.
12673 case pg_log_entry_t::LOST_DELETE
:
12675 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
12676 0, osd_reqid_t(), mtime
, 0);
12677 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
12678 if (pool
.info
.require_rollback()) {
12679 e
.mod_desc
.try_rmobject(v
.version
);
12681 e
.mark_unrollbackable();
12683 } // otherwise, just do what we used to do
12684 dout(10) << e
<< dendl
;
12685 log_entries
.push_back(e
);
12686 oids
.push_back(oid
);
12688 // If context found mark object as deleted in case
12689 // of racing with new creation. This can happen if
12690 // object lost and EIO at primary.
12691 obc
= object_contexts
.lookup(oid
);
12693 obc
->obs
.exists
= false;
// Stats can no longer be trusted after bulk lost-marking.
12705 recovery_state
.update_stats(
12706 [](auto &history
, auto &stats
) {
12707 stats
.stats_invalid
= true;
// Submit the entries; the completion lambda re-kicks recovery/backfill
// if unfound objects were the only thing blocking progress.
12711 submit_log_entries(
12713 std::move(manager
),
12714 std::optional
<std::function
<void(void)> >(
12715 [this, oids
, num_unfound
, on_finish
]() {
12716 if (recovery_state
.perform_deletes_during_peering()) {
12717 for (auto oid
: oids
) {
12718 // clear old locations - merge_new_log_entries will have
12719 // handled rebuilding missing_loc for each of these
12720 // objects if we have the RECOVERY_DELETES flag
12721 recovery_state
.object_recovered(oid
, object_stat_sum_t());
12725 if (is_recovery_unfound()) {
12726 queue_peering_event(
12728 std::make_shared
<PGPeeringEvent
>(
12729 get_osdmap_epoch(),
12730 get_osdmap_epoch(),
12731 PeeringState::DoRecovery())));
12732 } else if (is_backfill_unfound()) {
12733 queue_peering_event(
12735 std::make_shared
<PGPeeringEvent
>(
12736 get_osdmap_epoch(),
12737 get_osdmap_epoch(),
12738 PeeringState::RequestBackfill())));
12744 ss
<< "pg has " << num_unfound
12745 << " objects unfound and apparently lost marking";
12746 string rs
= ss
.str();
12747 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
12748 osd
->clog
->info() << rs
;
12750 on_finish(0, rs
, empty
);
// PG-split hook; the visible precondition is that no repops may be in
// flight when a PG is split. NOTE(review): body largely elided in this
// fragmented extract; code left byte-identical.
12755 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
12757 ceph_assert(repop_queue
.empty());
12761 * pg status change notification
// Abort every queued repop, optionally requeuing the originating client ops
// (and any duplicate waiters keyed by version) for replay; asserts the
// queues end up empty. NOTE(review): fragmented extract — some lines elided;
// code left byte-identical, comments only.
12764 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
12766 list
<OpRequestRef
> rq
;
12768 // apply all repops
12769 while (!repop_queue
.empty()) {
12770 RepGather
*repop
= repop_queue
.front();
12771 repop_queue
.pop_front();
12772 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
// Mark aborted and drop completion callbacks so they never fire.
12773 repop
->rep_aborted
= true;
12774 repop
->on_committed
.clear();
12775 repop
->on_success
.clear();
12779 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
12780 rq
.push_back(repop
->op
);
12781 repop
->op
= OpRequestRef();
12784 // also requeue any dups, interleaved into position
12785 auto p
= waiting_for_ondisk
.find(repop
->v
);
12786 if (p
!= waiting_for_ondisk
.end()) {
12787 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
12788 for (auto& i
: p
->second
) {
12789 rq
.push_back(std::get
<0>(i
));
12791 waiting_for_ondisk
.erase(p
);
12795 remove_repop(repop
);
12798 ceph_assert(repop_queue
.empty());
// Anything still waiting for ondisk at this point is a bug: log it loudly
// before asserting.
12802 if (!waiting_for_ondisk
.empty()) {
12803 for (auto& i
: waiting_for_ondisk
) {
12804 for (auto& j
: i
.second
) {
12805 derr
<< __func__
<< ": op " << *(std::get
<0>(j
)->get_req())
12806 << " waiting on " << i
.first
<< dendl
;
12809 ceph_assert(waiting_for_ondisk
.empty());
12813 waiting_for_ondisk
.clear();
// Called when an interval-change flush completes: requeue ops that were
// parked waiting for the flush, and — when not peered primary — verify no
// object contexts survived the interval change.
// NOTE(review): fragmented extract; code left byte-identical.
12816 void PrimaryLogPG::on_flushed()
12818 requeue_ops(waiting_for_flush
);
12819 if (!is_peered() || !is_primary()) {
12820 pair
<hobject_t
, ObjectContextRef
> i
;
// Any surviving obc here indicates a leaked reference across the interval.
12821 while (object_contexts
.get_next(i
.first
, &i
)) {
12822 derr
<< __func__
<< ": object " << i
.first
<< " obc still alive" << dendl
;
12824 ceph_assert(object_contexts
.empty());
// PG removal hook: schedules the next deletion step via C_DeleteMore once
// the removal transaction commits. NOTE(review): fragmented extract —
// interior lines elided; code left byte-identical.
12828 void PrimaryLogPG::on_removal(ObjectStore::Transaction
&t
)
12830 dout(10) << __func__
<< dendl
;
12834 t
.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
// Tear down all in-progress async reads, closing each pending op context.
// NOTE(review): fragmented extract; code left byte-identical.
12837 void PrimaryLogPG::clear_async_reads()
12839 dout(10) << __func__
<< dendl
;
12840 for(auto& i
: in_progress_async_reads
) {
12841 dout(10) << "clear ctx: "
12842 << "OpRequestRef " << i
.first
12843 << " OpContext " << i
.second
12845 close_op_ctx(i
.second
);
// Drop all cached object contexts for this PG.
12849 void PrimaryLogPG::clear_cache()
12851 object_contexts
.clear();
// Shut the PG down: cancel queued recovery and scrubbing, abort all
// client-visible in-flight operations (copy/flush/proxy/manifest/cls-gather
// and objecter tids), flush repops and log updates, reset snap-trim state,
// release recovery reservations, and clear primary-only state.
// NOTE(review): fragmented extract — some lines elided; code byte-identical.
12854 void PrimaryLogPG::on_shutdown()
12856 dout(10) << __func__
<< dendl
;
12858 if (recovery_queued
) {
12859 recovery_queued
= false;
12860 osd
->clear_queued_recovery(this);
12863 m_scrubber
->scrub_clear_state();
12864 m_scrubber
->rm_from_osd_scrubbing();
// Cancel every category of in-flight op; tids collected here are then
// cancelled at the objecter in one call.
12866 vector
<ceph_tid_t
> tids
;
12867 cancel_copy_ops(false, &tids
);
12868 cancel_flush_ops(false, &tids
);
12869 cancel_proxy_ops(false, &tids
);
12870 cancel_manifest_ops(false, &tids
);
12871 cancel_cls_gather_ops(false, &tids
);
12872 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12874 apply_and_flush_repops(false);
12875 cancel_log_updates();
12876 // we must remove PGRefs, so do this this prior to release_backoffs() callers
12878 // clean up snap trim references
12879 snap_trimmer_machine
.process_event(Reset());
12881 pgbackend
->on_change();
12883 context_registry_on_change();
12884 object_contexts
.clear();
12886 clear_async_reads();
// Give back both local and remote recovery reservations for this PG.
12888 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
12889 osd
->local_reserver
.cancel_reservation(info
.pgid
);
12891 clear_primary_state();
12894 if (is_primary()) {
12895 osd
->clear_ready_to_merge(this);
// Activation completed: requeue (or re-park behind the flush) waiting ops,
// queue the appropriate peering event (DoRecovery / RequestBackfill /
// AllReplicasRecovered), publish stats, and initialize backfill cursors
// when there are backfill targets. NOTE(review): fragmented extract —
// some lines elided; code byte-identical.
12899 void PrimaryLogPG::on_activate_complete()
12903 if (!recovery_state
.needs_flush()) {
12904 requeue_ops(waiting_for_peered
);
12905 } else if (!waiting_for_peered
.empty()) {
// A flush is still pending: move the peered-waiters behind it instead.
12906 dout(10) << __func__
<< " flushes in progress, moving "
12907 << waiting_for_peered
.size()
12908 << " items to waiting_for_flush"
12910 ceph_assert(waiting_for_flush
.empty());
12911 waiting_for_flush
.swap(waiting_for_peered
);
12916 if (needs_recovery()) {
12917 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
12918 queue_peering_event(
12920 std::make_shared
<PGPeeringEvent
>(
12921 get_osdmap_epoch(),
12922 get_osdmap_epoch(),
12923 PeeringState::DoRecovery())));
12924 } else if (needs_backfill()) {
12925 dout(10) << "activate queueing backfill" << dendl
;
12926 queue_peering_event(
12928 std::make_shared
<PGPeeringEvent
>(
12929 get_osdmap_epoch(),
12930 get_osdmap_epoch(),
12931 PeeringState::RequestBackfill())));
12933 dout(10) << "activate all replicas clean, no recovery" << dendl
;
12934 queue_peering_event(
12936 std::make_shared
<PGPeeringEvent
>(
12937 get_osdmap_epoch(),
12938 get_osdmap_epoch(),
12939 PeeringState::AllReplicasRecovered())));
12942 publish_stats_to_osd();
// Seed the backfill scan position from the earliest peer last_backfill.
12944 if (get_backfill_targets().size()) {
12945 last_backfill_started
= recovery_state
.earliest_backfill();
12946 new_backfill
= true;
12947 ceph_assert(!last_backfill_started
.is_max());
12948 dout(5) << __func__
<< ": bft=" << get_backfill_targets()
12949 << " from " << last_backfill_started
<< dendl
;
12950 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12951 i
!= get_backfill_targets().end();
12953 dout(5) << "target shard " << *i
12954 << " from " << recovery_state
.get_peer_info(*i
).last_backfill
// Interval-change hook: discard transient state (empty hit_set, queued
// recovery), requeue every class of waiting op in replay order, cancel
// in-flight tiering/proxy ops, drain degraded/blocked waiter maps, reset
// the scrubber, backend, and snap-trimmer, and drop cached object contexts.
// NOTE(review): fragmented extract — some lines elided; code byte-identical.
12963 void PrimaryLogPG::on_change(ObjectStore::Transaction
&t
)
12965 dout(10) << __func__
<< dendl
;
12967 if (hit_set
&& hit_set
->insert_count() == 0) {
12968 dout(20) << " discarding empty hit_set" << dendl
;
12972 if (recovery_queued
) {
12973 recovery_queued
= false;
12974 osd
->clear_queued_recovery(this);
12977 // requeue everything in the reverse order they should be
12979 requeue_ops(waiting_for_peered
);
12980 requeue_ops(waiting_for_flush
);
12981 requeue_ops(waiting_for_active
);
12982 requeue_ops(waiting_for_readable
);
// Cancel tiering/proxy/cls ops; requeue client ops only if still primary.
12984 vector
<ceph_tid_t
> tids
;
12985 cancel_copy_ops(is_primary(), &tids
);
12986 cancel_flush_ops(is_primary(), &tids
);
12987 cancel_proxy_ops(is_primary(), &tids
);
12988 cancel_manifest_ops(is_primary(), &tids
);
12989 cancel_cls_gather_ops(is_primary(), &tids
);
12990 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12992 // requeue object waiters
12993 for (auto& p
: waiting_for_unreadable_object
) {
12994 release_backoffs(p
.first
);
12996 if (is_primary()) {
12997 requeue_object_waiters(waiting_for_unreadable_object
);
12999 waiting_for_unreadable_object
.clear();
13001 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
13002 p
!= waiting_for_degraded_object
.end();
13003 waiting_for_degraded_object
.erase(p
++)) {
13004 release_backoffs(p
->first
);
13006 requeue_ops(p
->second
);
13009 finish_degraded_object(p
->first
);
13012 // requeues waiting_for_scrub
13013 m_scrubber
->scrub_clear_state();
13015 for (auto p
= waiting_for_blocked_object
.begin();
13016 p
!= waiting_for_blocked_object
.end();
13017 waiting_for_blocked_object
.erase(p
++)) {
13019 requeue_ops(p
->second
);
13023 for (auto i
= callbacks_for_degraded_object
.begin();
13024 i
!= callbacks_for_degraded_object
.end();
13026 finish_degraded_object((i
++)->first
);
13028 ceph_assert(callbacks_for_degraded_object
.empty());
13030 if (is_primary()) {
13031 requeue_ops(waiting_for_cache_not_full
);
13033 waiting_for_cache_not_full
.clear();
13035 objects_blocked_on_cache_full
.clear();
13037 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
13038 in_progress_async_reads
.begin();
13039 i
!= in_progress_async_reads
.end();
13040 in_progress_async_reads
.erase(i
++)) {
13041 close_op_ctx(i
->second
);
13043 requeue_op(i
->first
);
13046 // this will requeue ops we were working on but didn't finish, and
13048 apply_and_flush_repops(is_primary());
13049 cancel_log_updates();
13051 // do this *after* apply_and_flush_repops so that we catch any newly
13052 // registered watches.
13053 context_registry_on_change();
13055 pgbackend
->on_change_cleanup(&t
);
13056 m_scrubber
->cleanup_store(&t
);
13057 pgbackend
->on_change();
13059 // clear snap_trimmer state
13060 snap_trimmer_machine
.process_event(Reset());
13062 debug_op_order
.clear();
13063 unstable_stats
.clear();
13065 // we don't want to cache object_contexts through the interval change
13066 // NOTE: we actually assert that all currently live references are dead
13067 // by the time the flush for the next interval completes.
13068 object_contexts
.clear();
13070 // should have been cleared above by finishing all of the degraded objects
13071 ceph_assert(objects_blocked_on_degraded_snap
.empty());
// Role-change hook: a PG that is no longer role 0 (primary) drops its
// in-memory hit_set. NOTE(review): fragmented extract — trailing lines
// elided; code byte-identical.
13074 void PrimaryLogPG::plpg_on_role_change()
13076 dout(10) << __func__
<< dendl
;
13077 if (get_role() != 0 && hit_set
) {
13078 dout(10) << " clearing hit set" << dendl
;
// Pool-change hook: if the pool's cache_mode is no longer WRITEBACK,
// requeue ops that were parked waiting for the cache to drain and clear
// the cache-full block set. NOTE(review): fragmented extract — some lines
// elided; code byte-identical.
13083 void PrimaryLogPG::plpg_on_pool_change()
13085 dout(10) << __func__
<< dendl
;
13086 // requeue cache full waiters just in case the cache_mode is
13087 // changing away from writeback mode. note that if we are not
13088 // active the normal requeuing machinery is sufficient (and properly
13091 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13092 !waiting_for_cache_not_full
.empty()) {
13093 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
13095 requeue_ops(waiting_for_cache_not_full
);
13096 objects_blocked_on_cache_full
.clear();
13102 // clear state. called on recovery completion AND cancellation.
// Reset all recovery bookkeeping (on completion AND on cancellation):
// backfill cursor, in-flight backfills, the recovering map (releasing each
// object's recovery read lock and requeuing blocked ops), and the backend's
// recovery state. NOTE(review): fragmented extract; code byte-identical.
13103 void PrimaryLogPG::_clear_recovery_state()
13105 #ifdef DEBUG_RECOVERY_OIDS
13106 recovering_oids
.clear();
13108 dout(15) << __func__
<< " flags: " << m_planned_scrub
<< dendl
;
13110 last_backfill_started
= hobject_t();
13111 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
13112 while (i
!= backfills_in_flight
.end()) {
13113 backfills_in_flight
.erase(i
++);
13116 list
<OpRequestRef
> blocked_ops
;
// Drain the recovering map, dropping each obc's recovery read lock and
// requeuing whatever ops it had blocked.
13117 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
13118 i
!= recovering
.end();
13119 recovering
.erase(i
++)) {
13121 i
->second
->drop_recovery_read(&blocked_ops
);
13122 requeue_ops(blocked_ops
);
13125 ceph_assert(backfills_in_flight
.empty());
13126 pending_backfill_updates
.clear();
13127 ceph_assert(recovering
.empty());
13128 pgbackend
->clear_recovery_state();
// Cancel an in-flight pull for soid: release its recovery read lock,
// requeue any ops it had blocked, drop it from the recovering map, finish
// the recovery op, release backoffs, and kick degraded/unreadable waiters.
// NOTE(review): fragmented extract; code byte-identical, comments only.
13131 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
13133 dout(20) << __func__
<< ": " << soid
<< dendl
;
13134 ceph_assert(recovering
.count(soid
));
13135 ObjectContextRef obc
= recovering
[soid
];
13137 list
<OpRequestRef
> blocked_ops
;
13138 obc
->drop_recovery_read(&blocked_ops
);
13139 requeue_ops(blocked_ops
);
13141 recovering
.erase(soid
);
13142 finish_recovery_op(soid
);
13143 release_backoffs(soid
);
13144 if (waiting_for_degraded_object
.count(soid
)) {
13145 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
13146 requeue_ops(waiting_for_degraded_object
[soid
]);
13147 waiting_for_degraded_object
.erase(soid
);
13149 if (waiting_for_unreadable_object
.count(soid
)) {
13150 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
13151 requeue_ops(waiting_for_unreadable_object
[soid
]);
13152 waiting_for_unreadable_object
.erase(soid
);
// A still-missing object means recovery must rescan from the start.
13154 if (is_missing_object(soid
))
13155 recovery_state
.set_last_requested(0);
13156 finish_degraded_object(soid
);
// Delegate to the backend to drop recovery sources that left the new map.
13159 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
13161 pgbackend
->check_recovery_sources(osdmap
);
// Main recovery driver, run from the OSD's do_recovery(): starts up to
// `max` recovery ops (primary first, then replicas, then backfill),
// returns via *ops_started how many were launched, and — once nothing is
// in flight — transitions the PG out of RECOVERING/BACKFILLING by queuing
// the matching peering event. Return value is have_unfound() when no work
// is in progress. NOTE(review): fragmented extract — some lines elided;
// code byte-identical, comments only.
13164 bool PrimaryLogPG::start_recovery_ops(
13166 ThreadPool::TPHandle
&handle
,
13167 uint64_t *ops_started
)
13169 uint64_t& started
= *ops_started
;
13171 bool work_in_progress
= false;
13172 bool recovery_started
= false;
13173 ceph_assert(is_primary());
13174 ceph_assert(is_peered());
13175 ceph_assert(!recovery_state
.is_deleting());
13177 ceph_assert(recovery_queued
);
13178 recovery_queued
= false;
13180 if (!state_test(PG_STATE_RECOVERING
) &&
13181 !state_test(PG_STATE_BACKFILLING
)) {
13182 /* TODO: I think this case is broken and will make do_recovery()
13183 * unhappy since we're returning false */
13184 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
13185 return have_unfound();
13188 const auto &missing
= recovery_state
.get_pg_log().get_missing();
13190 uint64_t num_unfound
= get_num_unfound();
13192 if (!recovery_state
.have_missing()) {
13193 recovery_state
.local_recovery_complete();
13196 if (!missing
.have_missing() || // Primary does not have missing
13197 // or all of the missing objects are unfound.
13198 recovery_state
.all_missing_unfound()) {
13199 // Recover the replicas.
13200 started
= recover_replicas(max
, handle
, &recovery_started
);
13203 // We still have missing objects that we should grab from replicas.
13204 started
+= recover_primary(max
, handle
);
13206 if (!started
&& num_unfound
!= get_num_unfound()) {
13207 // second chance to recovery replicas
13208 started
= recover_replicas(max
, handle
, &recovery_started
);
13211 if (started
|| recovery_started
)
13212 work_in_progress
= true;
// Backfill only runs once object recovery is idle and quota remains.
13214 bool deferred_backfill
= false;
13215 if (recovering
.empty() &&
13216 state_test(PG_STATE_BACKFILLING
) &&
13217 !get_backfill_targets().empty() && started
< max
&&
13218 missing
.num_missing() == 0 &&
13219 waiting_on_backfill
.empty()) {
13220 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
13221 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
13222 deferred_backfill
= true;
13223 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
13225 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
13226 deferred_backfill
= true;
13227 } else if (!recovery_state
.is_backfill_reserved()) {
13228 /* DNMNOTE I think this branch is dead */
13229 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
13230 if (!backfill_reserving
) {
13231 dout(10) << "queueing RequestBackfill" << dendl
;
13232 backfill_reserving
= true;
13233 queue_peering_event(
13235 std::make_shared
<PGPeeringEvent
>(
13236 get_osdmap_epoch(),
13237 get_osdmap_epoch(),
13238 PeeringState::RequestBackfill())));
13240 deferred_backfill
= true;
13242 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
13246 dout(10) << " started " << started
<< dendl
;
13247 osd
->logger
->inc(l_osd_rop
, started
);
13249 if (!recovering
.empty() ||
13250 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
13251 return !work_in_progress
&& have_unfound();
13253 ceph_assert(recovering
.empty());
13254 ceph_assert(recovery_ops_active
== 0);
13256 dout(10) << __func__
<< " needs_recovery: "
13257 << recovery_state
.get_missing_loc().get_needs_recovery()
13259 dout(10) << __func__
<< " missing_loc: "
13260 << recovery_state
.get_missing_loc().get_missing_locs()
13262 int unfound
= get_num_unfound();
13264 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
// Sanity checks: recovery should never end while objects are missing.
13268 if (missing
.num_missing() > 0) {
13269 // this shouldn't happen!
13270 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
13271 << missing
.num_missing() << ": " << missing
.get_items();
13275 if (needs_recovery()) {
13276 // this shouldn't happen!
13277 // We already checked num_missing() so we must have missing replicas
13278 osd
->clog
->error() << info
.pgid
13279 << " Unexpected Error: recovery ending with missing replicas";
// Recovery/backfill fully done: clear state flags and queue the matching
// peering event to move the PG forward.
13283 if (state_test(PG_STATE_RECOVERING
)) {
13284 state_clear(PG_STATE_RECOVERING
);
13285 state_clear(PG_STATE_FORCED_RECOVERY
);
13286 if (needs_backfill()) {
13287 dout(10) << "recovery done, queuing backfill" << dendl
;
13288 queue_peering_event(
13290 std::make_shared
<PGPeeringEvent
>(
13291 get_osdmap_epoch(),
13292 get_osdmap_epoch(),
13293 PeeringState::RequestBackfill())));
13295 dout(10) << "recovery done, no backfill" << dendl
;
13296 state_clear(PG_STATE_FORCED_BACKFILL
);
13297 queue_peering_event(
13299 std::make_shared
<PGPeeringEvent
>(
13300 get_osdmap_epoch(),
13301 get_osdmap_epoch(),
13302 PeeringState::AllReplicasRecovered())));
13304 } else { // backfilling
13305 state_clear(PG_STATE_BACKFILLING
);
13306 state_clear(PG_STATE_FORCED_BACKFILL
);
13307 state_clear(PG_STATE_FORCED_RECOVERY
);
13308 dout(10) << "recovery done, backfill done" << dendl
;
13309 queue_peering_event(
13311 std::make_shared
<PGPeeringEvent
>(
13312 get_osdmap_epoch(),
13313 get_osdmap_epoch(),
13314 PeeringState::Backfilled())));
13321 * do one recovery op.
13322 * return true if done, false if nothing left to do.
// Recover objects missing on the primary itself: walk the missing set from
// last_requested, handling LOST_REVERT specially (local revert via a
// transaction when we already hold the reverting_to version, otherwise pull
// the old version from peers that have it), and otherwise issuing
// recover_missing pulls, up to `max` ops. Returns the number started.
// NOTE(review): fragmented extract — many interior lines elided; code
// byte-identical, comments only.
13324 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
13326 ceph_assert(is_primary());
13328 const auto &missing
= recovery_state
.get_pg_log().get_missing();
13330 dout(10) << __func__
<< " recovering " << recovering
.size()
13332 << " missing " << missing
<< dendl
;
13334 dout(25) << __func__
<< " " << missing
.get_items() << dendl
;
13337 pg_log_entry_t
*latest
= 0;
13338 unsigned started
= 0;
13341 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
// Resume the scan of rmissing from where the previous pass stopped.
13342 map
<version_t
, hobject_t
>::const_iterator p
=
13343 missing
.get_rmissing().lower_bound(recovery_state
.get_pg_log().get_log().last_requested
);
13344 while (p
!= missing
.get_rmissing().end()) {
13345 handle
.reset_tp_timeout();
13347 version_t v
= p
->first
;
13349 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(p
->second
);
13350 if (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end()) {
13351 latest
= it_objects
->second
;
13352 ceph_assert(latest
->is_update() || latest
->is_delete());
13353 soid
= latest
->soid
;
13358 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
13361 hobject_t head
= soid
.get_head();
13363 eversion_t need
= item
.need
;
13365 dout(10) << __func__
<< " "
13366 << soid
<< " " << item
.need
13367 << (missing
.is_missing(soid
) ? " (missing)":"")
13368 << (missing
.is_missing(head
) ? " (missing head)":"")
13369 << (recovering
.count(soid
) ? " (recovering)":"")
13370 << (recovering
.count(head
) ? " (recovering head)":"")
13374 switch (latest
->op
) {
13375 case pg_log_entry_t::CLONE
:
13377 * Handling for this special case removed for now, until we
13378 * can correctly construct an accurate SnapSet from the old
13383 case pg_log_entry_t::LOST_REVERT
:
// Fast path: we already have exactly the version being reverted to, so
// rewrite the object-info version locally instead of pulling.
13385 if (item
.have
== latest
->reverting_to
) {
13386 ObjectContextRef obc
= get_object_context(soid
, true);
13388 if (obc
->obs
.oi
.version
== latest
->version
) {
13389 // I'm already reverting
13390 dout(10) << " already reverting " << soid
<< dendl
;
13392 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
13393 obc
->obs
.oi
.version
= latest
->version
;
13395 ObjectStore::Transaction t
;
13397 obc
->obs
.oi
.encode(
13399 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
13400 ceph_assert(!pool
.info
.require_rollback());
13401 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
13403 recovery_state
.recover_got(
13411 t
.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
13412 t
.register_on_commit(new C_OSD_CommittedPushedObject(
13414 get_osdmap_epoch(),
13415 info
.last_complete
));
13416 osd
->store
->queue_transaction(ch
, std::move(t
));
13421 * Pull the old version of the object. Update missing_loc here to have the location
13422 * of the version we want.
13424 * This doesn't use the usual missing_loc paths, but that's okay:
13425 * - if we have it locally, we hit the case above, and go from there.
13426 * - if we don't, we always pass through this case during recovery and set up the location
13428 * - this way we don't need to mangle the missing code to be general about needing an old
13431 eversion_t alternate_need
= latest
->reverting_to
;
13432 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
// Collect peers that hold exactly the reverting_to version.
13434 set
<pg_shard_t
> good_peers
;
13435 for (auto p
= recovery_state
.get_peer_missing().begin();
13436 p
!= recovery_state
.get_peer_missing().end();
13438 if (p
->second
.is_missing(soid
, need
) &&
13439 p
->second
.get_items().at(soid
).have
== alternate_need
) {
13440 good_peers
.insert(p
->first
);
13443 recovery_state
.set_revert_with_targets(
13446 dout(10) << " will pull " << alternate_need
<< " or " << need
13448 << recovery_state
.get_missing_loc().get_locations(soid
)
13456 if (!recovering
.count(soid
)) {
13457 if (recovering
.count(head
)) {
13460 int r
= recover_missing(
13461 soid
, need
, get_recovery_op_priority(), h
);
13474 if (started
>= max
)
13479 // only advance last_requested if we haven't skipped anything
13481 recovery_state
.set_last_requested(v
);
13484 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
// Record that the primary's copy of soid@v is bad: force it into the
// missing set and report (via the cluster log) whether it is now unfound
// or recoverable from other copies. NOTE(review): fragmented extract —
// some lines elided; code byte-identical.
13488 bool PrimaryLogPG::primary_error(
13489 const hobject_t
& soid
, eversion_t v
)
13491 recovery_state
.force_object_missing(pg_whoami
, soid
, v
);
13492 bool uhoh
= recovery_state
.get_missing_loc().is_unfound(soid
);
13494 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13495 << soid
<< ", unfound";
13497 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13499 << ", will try copies on "
13500 << recovery_state
.get_missing_loc().get_locations(soid
);
// Queue a replica-side delete for soid@v on the given recovery handle.
// Takes the object's recovery read lock first (setting *work_started and
// bailing if unavailable), registers the object as recovering, and hands
// the delete to the backend. NOTE(review): fragmented extract — some lines
// elided; code byte-identical.
13504 int PrimaryLogPG::prep_object_replica_deletes(
13505 const hobject_t
& soid
, eversion_t v
,
13506 PGBackend::RecoveryHandle
*h
,
13507 bool *work_started
)
13509 ceph_assert(is_primary());
13510 dout(10) << __func__
<< ": on " << soid
<< dendl
;
13512 ObjectContextRef obc
= get_object_context(soid
, false);
13514 if (!obc
->get_recovery_read()) {
13515 dout(20) << "replica delete delayed on " << soid
13516 << "; could not get rw_manager lock" << dendl
;
13517 *work_started
= true;
13520 dout(20) << "replica delete got recovery read lock on " << soid
13525 start_recovery_op(soid
);
13526 ceph_assert(!recovering
.count(soid
));
13528 recovering
.insert(make_pair(soid
, ObjectContextRef()));
13530 recovering
.insert(make_pair(soid
, obc
));
13532 pgbackend
->recover_delete_object(soid
, v
, h
);
// Queue a push of soid@v to replicas. For clones, first ensures the head is
// recovered (or already recovering). Takes the recovery read lock, marks
// the object recovering, and asks the backend to recover it; a backend
// error is surfaced through on_failed_pull(). NOTE(review): fragmented
// extract — some lines elided; code byte-identical.
13536 int PrimaryLogPG::prep_object_replica_pushes(
13537 const hobject_t
& soid
, eversion_t v
,
13538 PGBackend::RecoveryHandle
*h
,
13539 bool *work_started
)
13541 ceph_assert(is_primary());
13542 dout(10) << __func__
<< ": on " << soid
<< dendl
;
// Clone objects need their head object present before they can be pushed.
13544 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
13545 // do we have the head and/or snapdir?
13546 hobject_t head
= soid
.get_head();
13547 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
13548 if (recovering
.count(head
)) {
13549 dout(10) << " missing but already recovering head " << head
<< dendl
;
13552 int r
= recover_missing(
13553 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
,
13554 get_recovery_op_priority(), h
);
13555 if (r
!= PULL_NONE
)
13562 // NOTE: we know we will get a valid oloc off of disk here.
13563 ObjectContextRef obc
= get_object_context(soid
, false);
13565 primary_error(soid
, v
);
13569 if (!obc
->get_recovery_read()) {
13570 dout(20) << "recovery delayed on " << soid
13571 << "; could not get rw_manager lock" << dendl
;
13572 *work_started
= true;
13575 dout(20) << "recovery got recovery read lock on " << soid
13579 start_recovery_op(soid
);
13580 ceph_assert(!recovering
.count(soid
));
13581 recovering
.insert(make_pair(soid
, obc
));
13583 int r
= pgbackend
->recover_object(
13586 ObjectContextRef(),
13587 obc
, // has snapset context
13590 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
13591 on_failed_pull({ pg_whoami
}, soid
, v
);
// Recover objects missing on replicas, up to `max` ops. Peers are ordered
// by ascending missing count — acting-set replicas first, then async
// recovery targets — and each peer's missing list is scanned, skipping
// unfound/already-recovering/still-missing-on-primary objects, issuing
// deletes or pushes as appropriate. Returns the number of ops started.
// NOTE(review): fragmented extract — some lines elided; code byte-identical.
13597 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
,
13598 bool *work_started
)
13600 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
13601 uint64_t started
= 0;
13603 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13605 // this is FAR from an optimal recovery order. pretty lame, really.
13606 ceph_assert(!get_acting_recovery_backfill().empty());
13607 // choose replicas to recover, replica has the shortest missing list first
13608 // so we can bring it back to normal ASAP
13609 std::vector
<std::pair
<unsigned int, pg_shard_t
>> replicas_by_num_missing
,
13610 async_by_num_missing
;
13611 replicas_by_num_missing
.reserve(get_acting_recovery_backfill().size() - 1);
13612 for (auto &p
: get_acting_recovery_backfill()) {
13613 if (p
== get_primary()) {
13616 auto pm
= recovery_state
.get_peer_missing().find(p
);
13617 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13618 auto nm
= pm
->second
.num_missing();
13620 if (is_async_recovery_target(p
)) {
13621 async_by_num_missing
.push_back(make_pair(nm
, p
));
13623 replicas_by_num_missing
.push_back(make_pair(nm
, p
));
13627 // sort by number of missing objects, in ascending order.
13628 auto func
= [](const std::pair
<unsigned int, pg_shard_t
> &lhs
,
13629 const std::pair
<unsigned int, pg_shard_t
> &rhs
) {
13630 return lhs
.first
< rhs
.first
;
13632 // acting goes first
13633 std::sort(replicas_by_num_missing
.begin(), replicas_by_num_missing
.end(), func
);
13634 // then async_recovery_targets
13635 std::sort(async_by_num_missing
.begin(), async_by_num_missing
.end(), func
);
13636 replicas_by_num_missing
.insert(replicas_by_num_missing
.end(),
13637 async_by_num_missing
.begin(), async_by_num_missing
.end());
13638 for (auto &replica
: replicas_by_num_missing
) {
13639 pg_shard_t
&peer
= replica
.second
;
13640 ceph_assert(peer
!= get_primary());
13641 auto pm
= recovery_state
.get_peer_missing().find(peer
);
13642 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13643 size_t m_sz
= pm
->second
.num_missing();
13645 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
13646 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
13649 const pg_missing_t
&m(pm
->second
);
13650 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
13651 p
!= m
.get_rmissing().end() && started
< max
;
13653 handle
.reset_tp_timeout();
13654 const hobject_t
soid(p
->second
);
// Skip objects nobody can supply — they stay unfound.
13656 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
13657 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
// Objects past the peer's last_backfill belong to backfill, not recovery;
// seeing one outside the recovering map is an error condition.
13661 const pg_info_t
&pi
= recovery_state
.get_peer_info(peer
);
13662 if (soid
> pi
.last_backfill
) {
13663 if (!recovering
.count(soid
)) {
13664 derr
<< __func__
<< ": object " << soid
<< " last_backfill "
13665 << pi
.last_backfill
<< dendl
;
13666 derr
<< __func__
<< ": object added to missing set for backfill, but "
13667 << "is not in recovering, error!" << dendl
;
13673 if (recovering
.count(soid
)) {
13674 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
13678 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
13679 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
13680 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13681 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
, work_started
);
// A clone can't be pushed while its head is still missing on the primary.
13685 if (soid
.is_snap() &&
13686 recovery_state
.get_pg_log().get_missing().is_missing(
13687 soid
.get_head())) {
13688 dout(10) << __func__
<< ": " << soid
.get_head()
13689 << " still missing on primary" << dendl
;
13693 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
13694 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
13698 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
13699 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13700 started
+= prep_object_replica_pushes(soid
, r
->second
.need
, h
, work_started
);
13704 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
// Return the minimum backfill-interval begin position across all backfill
// targets (i.e. the earliest object any peer still needs scanned).
// NOTE(review): fragmented extract — return line elided; code byte-identical.
13708 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
13710 hobject_t e
= hobject_t::get_max();
13711 for (const pg_shard_t
& peer
: get_backfill_targets()) {
13712 const auto iter
= peer_backfill_info
.find(peer
);
13713 ceph_assert(iter
!= peer_backfill_info
.end());
13714 e
= std::min(e
, iter
->second
.begin
);
13719 bool PrimaryLogPG::all_peer_done() const
13721 // Primary hasn't got any more objects
13722 ceph_assert(backfill_info
.empty());
13724 for (const pg_shard_t
& bt
: get_backfill_targets()) {
13725 const auto piter
= peer_backfill_info
.find(bt
);
13726 ceph_assert(piter
!= peer_backfill_info
.end());
13727 const BackfillInterval
& pbi
= piter
->second
;
13728 // See if peer has more to process
13729 if (!pbi
.extends_to_end() || !pbi
.empty())
13740 * backfilled: fully pushed to replica or present in replica's missing set (both
13741 * our copy and theirs).
13743 * All objects on a backfill_target in
13744 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13745 * objects have been actually deleted and all logically-valid objects are replicated.
13746 * There may be PG objects in this interval yet to be backfilled.
13748 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13749 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13751 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13752 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13753 * interval remain on the backfill target.
13755 * For a backfill target, all objects <= peer_info[target].last_backfill
13756 * have been backfilled to target
13758 * There *MAY* be missing/outdated objects between last_backfill_started and
13759 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13760 * io created objects since the last scan. For this reason, we call
13761 * update_range() again before continuing backfill.
13763 uint64_t PrimaryLogPG::recover_backfill(
13765 ThreadPool::TPHandle
&handle
, bool *work_started
)
13767 dout(10) << __func__
<< " (" << max
<< ")"
13768 << " bft=" << get_backfill_targets()
13769 << " last_backfill_started " << last_backfill_started
13770 << (new_backfill
? " new_backfill":"")
13772 ceph_assert(!get_backfill_targets().empty());
13774 // Initialize from prior backfill state
13775 if (new_backfill
) {
13776 // on_activate() was called prior to getting here
13777 ceph_assert(last_backfill_started
== recovery_state
.earliest_backfill());
13778 new_backfill
= false;
13780 // initialize BackfillIntervals
13781 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13782 i
!= get_backfill_targets().end();
13784 peer_backfill_info
[*i
].reset(
13785 recovery_state
.get_peer_info(*i
).last_backfill
);
13787 backfill_info
.reset(last_backfill_started
);
13789 backfills_in_flight
.clear();
13790 pending_backfill_updates
.clear();
13793 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13794 i
!= get_backfill_targets().end();
13796 dout(10) << "peer osd." << *i
13797 << " info " << recovery_state
.get_peer_info(*i
)
13798 << " interval " << peer_backfill_info
[*i
].begin
13799 << "-" << peer_backfill_info
[*i
].end
13800 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
13804 // update our local interval to cope with recent changes
13805 backfill_info
.begin
= last_backfill_started
;
13806 update_range(&backfill_info
, handle
);
13809 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
13810 set
<hobject_t
> add_to_stat
;
13812 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13813 i
!= get_backfill_targets().end();
13815 peer_backfill_info
[*i
].trim_to(
13817 recovery_state
.get_peer_info(*i
).last_backfill
,
13818 last_backfill_started
));
13820 backfill_info
.trim_to(last_backfill_started
);
13822 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13823 while (ops
< max
) {
13824 if (backfill_info
.begin
<= earliest_peer_backfill() &&
13825 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
13826 hobject_t next
= backfill_info
.end
;
13827 backfill_info
.reset(next
);
13828 backfill_info
.end
= hobject_t::get_max();
13829 update_range(&backfill_info
, handle
);
13830 backfill_info
.trim();
13833 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
13835 bool sent_scan
= false;
13836 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13837 i
!= get_backfill_targets().end();
13839 pg_shard_t bt
= *i
;
13840 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13842 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
13843 if (pbi
.begin
<= backfill_info
.begin
&&
13844 !pbi
.extends_to_end() && pbi
.empty()) {
13845 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
13846 epoch_t e
= get_osdmap_epoch();
13847 MOSDPGScan
*m
= new MOSDPGScan(
13848 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, get_last_peering_reset(),
13849 spg_t(info
.pgid
.pgid
, bt
.shard
),
13850 pbi
.end
, hobject_t());
13851 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13852 ceph_assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
13853 waiting_on_backfill
.insert(bt
);
13858 // Count simultaneous scans as a single op and let those complete
13861 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13865 if (backfill_info
.empty() && all_peer_done()) {
13866 dout(10) << " reached end for both local and all peers" << dendl
;
13870 // Get object within set of peers to operate on and
13871 // the set of targets for which that object applies.
13872 hobject_t check
= earliest_peer_backfill();
13874 if (check
< backfill_info
.begin
) {
13876 set
<pg_shard_t
> check_targets
;
13877 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13878 i
!= get_backfill_targets().end();
13880 pg_shard_t bt
= *i
;
13881 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13882 if (pbi
.begin
== check
)
13883 check_targets
.insert(bt
);
13885 ceph_assert(!check_targets
.empty());
13887 dout(20) << " BACKFILL removing " << check
13888 << " from peers " << check_targets
<< dendl
;
13889 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
13890 i
!= check_targets
.end();
13892 pg_shard_t bt
= *i
;
13893 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13894 ceph_assert(pbi
.begin
== check
);
13896 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
13900 last_backfill_started
= check
;
13902 // Don't increment ops here because deletions
13903 // are cheap and not replied to unlike real recovery_ops,
13904 // and we can't increment ops without requeueing ourself
13907 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
13909 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
13910 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13911 i
!= get_backfill_targets().end();
13913 pg_shard_t bt
= *i
;
13914 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13915 // Find all check peers that have the wrong version
13916 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
13917 if (pbi
.objects
.begin()->second
!= obj_v
) {
13918 need_ver_targs
.push_back(bt
);
13920 keep_ver_targs
.push_back(bt
);
13923 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13925 // Only include peers that we've caught up to their backfill line
13926 // otherwise, they only appear to be missing this object
13927 // because their pbi.begin > backfill_info.begin.
13928 if (backfill_info
.begin
> pinfo
.last_backfill
)
13929 missing_targs
.push_back(bt
);
13931 skip_targs
.push_back(bt
);
13935 if (!keep_ver_targs
.empty()) {
13936 // These peers have version obj_v
13937 dout(20) << " BACKFILL keeping " << check
13938 << " with ver " << obj_v
13939 << " on peers " << keep_ver_targs
<< dendl
;
13940 //assert(!waiting_for_degraded_object.count(check));
13942 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
13943 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
13945 if (obc
->get_recovery_read()) {
13946 if (!need_ver_targs
.empty()) {
13947 dout(20) << " BACKFILL replacing " << check
13948 << " with ver " << obj_v
13949 << " to peers " << need_ver_targs
<< dendl
;
13951 if (!missing_targs
.empty()) {
13952 dout(20) << " BACKFILL pushing " << backfill_info
.begin
13953 << " with ver " << obj_v
13954 << " to peers " << missing_targs
<< dendl
;
13956 vector
<pg_shard_t
> all_push
= need_ver_targs
;
13957 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
13959 handle
.reset_tp_timeout();
13960 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
13962 *work_started
= true;
13963 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
13968 *work_started
= true;
13969 dout(20) << "backfill blocking on " << backfill_info
.begin
13970 << "; could not get rw_manager lock" << dendl
;
13974 dout(20) << "need_ver_targs=" << need_ver_targs
13975 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
13976 dout(20) << "backfill_targets=" << get_backfill_targets()
13977 << " missing_targs=" << missing_targs
13978 << " skip_targs=" << skip_targs
<< dendl
;
13980 last_backfill_started
= backfill_info
.begin
;
13981 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
13982 backfill_info
.pop_front();
13983 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
13984 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
13985 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
13986 i
!= check_targets
.end();
13988 pg_shard_t bt
= *i
;
13989 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13995 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
13996 i
!= add_to_stat
.end();
13998 ObjectContextRef obc
= get_object_context(*i
, false);
14001 add_object_context_to_pg_stat(obc
, &stat
);
14002 pending_backfill_updates
[*i
] = stat
;
14004 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
14005 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
14006 handle
.reset_tp_timeout();
14007 const hobject_t
& oid
= to_remove
[i
].get
<0>();
14008 eversion_t v
= to_remove
[i
].get
<1>();
14009 pg_shard_t peer
= to_remove
[i
].get
<2>();
14010 MOSDPGBackfillRemove
*m
;
14011 auto it
= reqs
.find(peer
);
14012 if (it
!= reqs
.end()) {
14015 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
14016 spg_t(info
.pgid
.pgid
, peer
.shard
),
14017 get_osdmap_epoch());
14019 m
->ls
.push_back(make_pair(oid
, v
));
14021 if (oid
<= last_backfill_started
)
14022 pending_backfill_updates
[oid
]; // add empty stat!
14024 for (auto p
: reqs
) {
14025 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
14026 get_osdmap_epoch());
14029 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
14031 hobject_t backfill_pos
=
14032 std::min(backfill_info
.begin
, earliest_peer_backfill());
14033 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
14034 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
14035 i
!= backfills_in_flight
.end();
14037 dout(20) << *i
<< " is still in flight" << dendl
;
14040 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
14041 backfill_pos
: *(backfills_in_flight
.begin());
14042 hobject_t new_last_backfill
= recovery_state
.earliest_backfill();
14043 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
14044 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
14045 pending_backfill_updates
.begin();
14046 i
!= pending_backfill_updates
.end() &&
14047 i
->first
< next_backfill_to_complete
;
14048 pending_backfill_updates
.erase(i
++)) {
14049 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
14050 ceph_assert(i
->first
> new_last_backfill
);
14051 // carried from a previous round – if we are here, then we had to
14052 // be requeued (by e.g. on_global_recover()) and those operations
14054 recovery_state
.update_complete_backfill_object_stats(
14057 new_last_backfill
= i
->first
;
14059 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
14061 ceph_assert(!pending_backfill_updates
.empty() ||
14062 new_last_backfill
== last_backfill_started
);
14063 if (pending_backfill_updates
.empty() &&
14064 backfill_pos
.is_max()) {
14065 ceph_assert(backfills_in_flight
.empty());
14066 new_last_backfill
= backfill_pos
;
14067 last_backfill_started
= backfill_pos
;
14069 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
14071 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
14072 // all the backfill targets. Otherwise, we will move last_backfill up on
14073 // those targets need it and send OP_BACKFILL_PROGRESS to them.
14074 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
14075 i
!= get_backfill_targets().end();
14077 pg_shard_t bt
= *i
;
14078 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
14080 if (new_last_backfill
> pinfo
.last_backfill
) {
14081 recovery_state
.update_peer_last_backfill(bt
, new_last_backfill
);
14082 epoch_t e
= get_osdmap_epoch();
14083 MOSDPGBackfill
*m
= NULL
;
14084 if (pinfo
.last_backfill
.is_max()) {
14085 m
= new MOSDPGBackfill(
14086 MOSDPGBackfill::OP_BACKFILL_FINISH
,
14088 get_last_peering_reset(),
14089 spg_t(info
.pgid
.pgid
, bt
.shard
));
14090 // Use default priority here, must match sub_op priority
14091 start_recovery_op(hobject_t::get_max());
14093 m
= new MOSDPGBackfill(
14094 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
14096 get_last_peering_reset(),
14097 spg_t(info
.pgid
.pgid
, bt
.shard
));
14098 // Use default priority here, must match sub_op priority
14100 m
->last_backfill
= pinfo
.last_backfill
;
14101 m
->stats
= pinfo
.stats
;
14102 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
14103 dout(10) << " peer " << bt
14104 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
14105 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
14110 *work_started
= true;
14114 int PrimaryLogPG::prep_backfill_object_push(
14115 hobject_t oid
, eversion_t v
,
14116 ObjectContextRef obc
,
14117 vector
<pg_shard_t
> peers
,
14118 PGBackend::RecoveryHandle
*h
)
14120 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
14121 ceph_assert(!peers
.empty());
14123 backfills_in_flight
.insert(oid
);
14124 recovery_state
.prepare_backfill_for_missing(oid
, v
, peers
);
14126 ceph_assert(!recovering
.count(oid
));
14128 start_recovery_op(oid
);
14129 recovering
.insert(make_pair(oid
, obc
));
14131 int r
= pgbackend
->recover_object(
14134 ObjectContextRef(),
14138 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
14139 on_failed_pull({ pg_whoami
}, oid
, v
);
14144 void PrimaryLogPG::update_range(
14145 BackfillInterval
*bi
,
14146 ThreadPool::TPHandle
&handle
)
14148 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
14149 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
14151 if (bi
->version
< info
.log_tail
) {
14152 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
14154 bi
->version
= info
.last_update
;
14155 scan_range(local_min
, local_max
, bi
, handle
);
14158 if (bi
->version
>= projected_last_update
) {
14159 dout(10) << __func__
<< ": bi is current " << dendl
;
14160 ceph_assert(bi
->version
== projected_last_update
);
14161 } else if (bi
->version
>= info
.log_tail
) {
14162 if (recovery_state
.get_pg_log().get_log().empty() && projected_log
.empty()) {
14163 /* Because we don't move log_tail on split, the log might be
14164 * empty even if log_tail != last_update. However, the only
14165 * way to get here with an empty log is if log_tail is actually
14166 * eversion_t(), because otherwise the entry which changed
14167 * last_update since the last scan would have to be present.
14169 ceph_assert(bi
->version
== eversion_t());
14173 dout(10) << __func__
<< ": bi is old, (" << bi
->version
14174 << ") can be updated with log to projected_last_update "
14175 << projected_last_update
<< dendl
;
14177 auto func
= [&](const pg_log_entry_t
&e
) {
14178 dout(10) << __func__
<< ": updating from version " << e
.version
14180 const hobject_t
&soid
= e
.soid
;
14181 if (soid
>= bi
->begin
&&
14183 if (e
.is_update()) {
14184 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
14185 << e
.version
<< dendl
;
14186 bi
->objects
.erase(e
.soid
);
14187 bi
->objects
.insert(
14191 } else if (e
.is_delete()) {
14192 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
14193 bi
->objects
.erase(e
.soid
);
14197 dout(10) << "scanning pg log first" << dendl
;
14198 recovery_state
.get_pg_log().get_log().scan_log_after(bi
->version
, func
);
14199 dout(10) << "scanning projected log" << dendl
;
14200 projected_log
.scan_log_after(bi
->version
, func
);
14201 bi
->version
= projected_last_update
;
14203 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
14207 void PrimaryLogPG::scan_range(
14208 int min
, int max
, BackfillInterval
*bi
,
14209 ThreadPool::TPHandle
&handle
)
14211 ceph_assert(is_locked());
14212 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
14213 bi
->clear_objects();
14215 vector
<hobject_t
> ls
;
14217 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
14218 ceph_assert(r
>= 0);
14219 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
14220 dout(20) << ls
<< dendl
;
14222 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
14223 handle
.reset_tp_timeout();
14224 ObjectContextRef obc
;
14226 obc
= object_contexts
.lookup(*p
);
14228 if (!obc
->obs
.exists
) {
14229 /* If the object does not exist here, it must have been removed
14230 * between the collection_list_partial and here. This can happen
14231 * for the first item in the range, which is usually last_backfill.
14235 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
14236 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
14239 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
14240 /* If the object does not exist here, it must have been removed
14241 * between the collection_list_partial and here. This can happen
14242 * for the first item in the range, which is usually last_backfill.
14247 ceph_assert(r
>= 0);
14248 object_info_t
oi(bl
);
14249 bi
->objects
[*p
] = oi
.version
;
14250 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
14258 * verifies that stray objects have been deleted
14260 void PrimaryLogPG::check_local()
14262 dout(10) << __func__
<< dendl
;
14265 info
.last_update
>=
14266 recovery_state
.get_pg_log().get_tail()); // otherwise we need some help!
14268 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
14271 // just scan the log.
14272 set
<hobject_t
> did
;
14273 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= recovery_state
.get_pg_log().get_log().log
.rbegin();
14274 p
!= recovery_state
.get_pg_log().get_log().log
.rend();
14276 if (did
.count(p
->soid
))
14278 did
.insert(p
->soid
);
14280 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
14281 dout(10) << " checking " << p
->soid
14282 << " at " << p
->version
<< dendl
;
14284 int r
= osd
->store
->stat(
14286 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
14288 if (r
!= -ENOENT
) {
14289 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
14290 << "deleted" << dendl
;
14291 ceph_abort_msg("erroneously present object");
14294 // ignore old(+missing) objects
14301 // ===========================
14304 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
14307 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
14308 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
14309 info
.pgid
.ps(), info
.pgid
.pool(),
14310 cct
->_conf
->osd_hit_set_namespace
);
14311 dout(20) << __func__
<< " " << hoid
<< dendl
;
14315 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
14320 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
14322 start
.gmtime(ss
, true /* legacy pre-octopus form */) << "_";
14323 end
.gmtime(ss
, true /* legacy pre-octopus form */);
14325 start
.localtime(ss
, true /* legacy pre-octopus form */) << "_";
14326 end
.localtime(ss
, true /* legacy pre-octopus form */);
14328 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
14329 info
.pgid
.ps(), info
.pgid
.pool(),
14330 cct
->_conf
->osd_hit_set_namespace
);
14331 dout(20) << __func__
<< " " << hoid
<< dendl
;
14335 void PrimaryLogPG::hit_set_clear()
14337 dout(20) << __func__
<< dendl
;
14339 hit_set_start_stamp
= utime_t();
14342 void PrimaryLogPG::hit_set_setup()
14344 if (!is_active() ||
14350 if (is_active() && is_primary() &&
14351 (!pool
.info
.hit_set_count
||
14352 !pool
.info
.hit_set_period
||
14353 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
14356 // only primary is allowed to remove all the hit set objects
14357 hit_set_remove_all();
14361 // FIXME: discard any previous data for now
14364 // include any writes we know about from the pg log. this doesn't
14365 // capture reads, but it is better than nothing!
14366 hit_set_apply_log();
14369 void PrimaryLogPG::hit_set_remove_all()
14371 // If any archives are degraded we skip this
14372 for (auto p
= info
.hit_set
.history
.begin();
14373 p
!= info
.hit_set
.history
.end();
14375 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14377 // Once we hit a degraded object just skip
14378 if (is_degraded_or_backfilling_object(aoid
))
14380 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14384 if (!info
.hit_set
.history
.empty()) {
14385 auto p
= info
.hit_set
.history
.rbegin();
14386 ceph_assert(p
!= info
.hit_set
.history
.rend());
14387 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14388 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14389 ObjectContextRef obc
= get_object_context(oid
, false);
14392 OpContextUPtr ctx
= simple_opc_create(obc
);
14393 ctx
->at_version
= get_next_version();
14394 ctx
->updated_hset_history
= info
.hit_set
;
14395 utime_t now
= ceph_clock_now();
14397 hit_set_trim(ctx
, 0);
14398 simple_opc_submit(std::move(ctx
));
14401 recovery_state
.update_hset(pg_hit_set_history_t());
14403 agent_state
->discard_hit_sets();
14407 void PrimaryLogPG::hit_set_create()
14409 utime_t now
= ceph_clock_now();
14410 // make a copy of the params to modify
14411 HitSet::Params
params(pool
.info
.hit_set_params
);
14413 dout(20) << __func__
<< " " << params
<< dendl
;
14414 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
14415 BloomHitSet::Params
*p
=
14416 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
14418 // convert false positive rate so it holds up across the full period
14419 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
14420 if (p
->get_fpp() <= 0.0)
14421 p
->set_fpp(.01); // fpp cannot be zero!
14423 // if we don't have specified size, estimate target size based on the
14425 if (p
->target_size
== 0 && hit_set
) {
14426 utime_t dur
= now
- hit_set_start_stamp
;
14427 unsigned unique
= hit_set
->approx_unique_insert_count();
14428 dout(20) << __func__
<< " previous set had approx " << unique
14429 << " unique items over " << dur
<< " seconds" << dendl
;
14430 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
14433 if (p
->target_size
<
14434 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
14435 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
14438 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
14439 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
14441 p
->seed
= now
.sec();
14443 dout(10) << __func__
<< " target_size " << p
->target_size
14444 << " fpp " << p
->get_fpp() << dendl
;
14446 hit_set
.reset(new HitSet(params
));
14447 hit_set_start_stamp
= now
;
14451 * apply log entries to set
14453 * this would only happen after peering, to at least capture writes
14454 * during an interval that was potentially lost.
14456 bool PrimaryLogPG::hit_set_apply_log()
14461 eversion_t to
= info
.last_update
;
14462 eversion_t from
= info
.hit_set
.current_last_update
;
14464 dout(20) << __func__
<< " no update" << dendl
;
14468 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
14469 list
<pg_log_entry_t
>::const_reverse_iterator p
=
14470 recovery_state
.get_pg_log().get_log().log
.rbegin();
14471 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> to
)
14473 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> from
) {
14474 hit_set
->insert(p
->soid
);
14481 void PrimaryLogPG::hit_set_persist()
14483 dout(10) << __func__
<< dendl
;
14485 unsigned max
= pool
.info
.hit_set_count
;
14487 utime_t now
= ceph_clock_now();
14490 // If any archives are degraded we skip this persist request
14491 // account for the additional entry being added below
14492 for (auto p
= info
.hit_set
.history
.begin();
14493 p
!= info
.hit_set
.history
.end();
14495 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14497 // Once we hit a degraded object just skip further trim
14498 if (is_degraded_or_backfilling_object(aoid
))
14500 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14504 // If backfill is in progress and we could possibly overlap with the
14505 // hit_set_* objects, back off. Since these all have
14506 // hobject_t::hash set to pgid.ps(), and those sort first, we can
14507 // look just at that. This is necessary because our transactions
14508 // may include a modify of the new hit_set *and* a delete of the
14509 // old one, and this may span the backfill boundary.
14510 for (set
<pg_shard_t
>::const_iterator p
= get_backfill_targets().begin();
14511 p
!= get_backfill_targets().end();
14513 const pg_info_t
& pi
= recovery_state
.get_peer_info(*p
);
14514 if (pi
.last_backfill
== hobject_t() ||
14515 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
14516 dout(10) << __func__
<< " backfill target osd." << *p
14517 << " last_backfill has not progressed past pgid ps"
14524 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
14525 new_hset
.begin
= hit_set_start_stamp
;
14526 new_hset
.end
= now
;
14527 oid
= get_hit_set_archive_object(
14530 new_hset
.using_gmt
);
14532 // If the current object is degraded we skip this persist request
14533 if (m_scrubber
->write_blocked_by_scrub(oid
))
14537 encode(*hit_set
, bl
);
14538 dout(20) << __func__
<< " archive " << oid
<< dendl
;
14541 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
14542 uint32_t size
= agent_state
->hit_set_map
.size();
14543 if (size
>= pool
.info
.hit_set_count
) {
14544 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
14546 hit_set_in_memory_trim(size
);
14549 ObjectContextRef obc
= get_object_context(oid
, true);
14550 OpContextUPtr ctx
= simple_opc_create(obc
);
14552 ctx
->at_version
= get_next_version();
14553 ctx
->updated_hset_history
= info
.hit_set
;
14554 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
14556 updated_hit_set_hist
.current_last_update
= info
.last_update
;
14557 new_hset
.version
= ctx
->at_version
;
14559 updated_hit_set_hist
.history
.push_back(new_hset
);
14562 // fabricate an object_info_t and SnapSet
14563 obc
->obs
.oi
.version
= ctx
->at_version
;
14564 obc
->obs
.oi
.mtime
= now
;
14565 obc
->obs
.oi
.size
= bl
.length();
14566 obc
->obs
.exists
= true;
14567 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
14569 ctx
->new_obs
= obc
->obs
;
14571 ctx
->new_snapset
= obc
->ssc
->snapset
;
14573 ctx
->delta_stats
.num_objects
++;
14574 ctx
->delta_stats
.num_objects_hit_set_archive
++;
14576 ctx
->delta_stats
.num_bytes
+= bl
.length();
14577 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
14580 encode(ctx
->new_snapset
, bss
);
14581 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
14582 encode(ctx
->new_obs
.oi
, boi
,
14583 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
14585 ctx
->op_t
->create(oid
);
14587 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
14588 write_update_size_and_usage(ctx
->delta_stats
, obc
->obs
.oi
, ctx
->modified_ranges
,
14590 ctx
->clean_regions
.mark_data_region_dirty(0, bl
.length());
14592 map
<string
, bufferlist
, std::less
<>> attrs
= {
14593 {OI_ATTR
, std::move(boi
)},
14594 {SS_ATTR
, std::move(bss
)}
14596 setattrs_maybe_cache(ctx
->obc
, ctx
->op_t
.get(), attrs
);
14597 ctx
->log
.push_back(
14599 pg_log_entry_t::MODIFY
,
14608 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
14610 hit_set_trim(ctx
, max
);
14612 simple_opc_submit(std::move(ctx
));
14615 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
14617 ceph_assert(ctx
->updated_hset_history
);
14618 pg_hit_set_history_t
&updated_hit_set_hist
=
14619 *(ctx
->updated_hset_history
);
14620 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
14621 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
14622 ceph_assert(p
!= updated_hit_set_hist
.history
.end());
14623 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14625 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14627 dout(20) << __func__
<< " removing " << oid
<< dendl
;
14628 ++ctx
->at_version
.version
;
14629 ctx
->log
.push_back(
14630 pg_log_entry_t(pg_log_entry_t::DELETE
,
14639 ctx
->op_t
->remove(oid
);
14640 updated_hit_set_hist
.history
.pop_front();
14642 ObjectContextRef obc
= get_object_context(oid
, false);
14644 --ctx
->delta_stats
.num_objects
;
14645 --ctx
->delta_stats
.num_objects_hit_set_archive
;
14646 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
14647 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
14651 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
14653 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
14654 agent_state
->remove_oldest_hit_set();
14659 // =======================================
14662 void PrimaryLogPG::agent_setup()
14664 ceph_assert(is_locked());
14665 if (!is_active() ||
14667 state_test(PG_STATE_PREMERGE
) ||
14668 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
14669 pool
.info
.tier_of
< 0 ||
14670 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
14674 if (!agent_state
) {
14675 agent_state
.reset(new TierAgentState
);
14677 // choose random starting position
14678 agent_state
->position
= hobject_t();
14679 agent_state
->position
.pool
= info
.pgid
.pool();
14680 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
14683 agent_state
->start
= agent_state
->position
;
14685 dout(10) << __func__
<< " allocated new state, position "
14686 << agent_state
->position
<< dendl
;
14688 dout(10) << __func__
<< " keeping existing state" << dendl
;
14691 if (info
.stats
.stats_invalid
) {
14692 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
14695 agent_choose_mode();
14698 void PrimaryLogPG::agent_clear()
14701 agent_state
.reset(NULL
);
14704 // Return false if no objects operated on since start of object hash space
14705 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
14707 std::scoped_lock locker
{*this};
14708 if (!agent_state
) {
14709 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
14713 ceph_assert(!recovery_state
.is_deleting());
14715 if (agent_state
->is_idle()) {
14716 dout(10) << __func__
<< " idle, stopping" << dendl
;
14720 osd
->logger
->inc(l_osd_agent_wake
);
14722 dout(10) << __func__
14723 << " max " << start_max
14724 << ", flush " << agent_state
->get_flush_mode_name()
14725 << ", evict " << agent_state
->get_evict_mode_name()
14726 << ", pos " << agent_state
->position
14728 ceph_assert(is_primary());
14729 ceph_assert(is_active());
14731 agent_load_hit_sets();
14733 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14734 ceph_assert(base_pool
);
14737 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
14739 // list some objects. this conveniently lists clones (oldest to
14740 // newest) before heads... the same order we want to flush in.
14742 // NOTE: do not flush the Sequencer. we will assume that the
14743 // listing we get back is imprecise.
14744 vector
<hobject_t
> ls
;
14746 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
14748 ceph_assert(r
>= 0);
14749 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
14751 for (vector
<hobject_t
>::iterator p
= ls
.begin();
14754 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
14755 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
14756 osd
->logger
->inc(l_osd_agent_skip
);
14759 if (is_degraded_or_backfilling_object(*p
)) {
14760 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
14761 osd
->logger
->inc(l_osd_agent_skip
);
14764 if (is_missing_object(p
->get_head())) {
14765 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
14766 osd
->logger
->inc(l_osd_agent_skip
);
14769 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
14771 // we didn't flush; we may miss something here.
14772 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
14773 osd
->logger
->inc(l_osd_agent_skip
);
14776 if (!obc
->obs
.exists
) {
14777 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
14778 osd
->logger
->inc(l_osd_agent_skip
);
14781 if (m_scrubber
->range_intersects_scrub(obc
->obs
.oi
.soid
,
14782 obc
->obs
.oi
.soid
.get_head())) {
14783 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14784 osd
->logger
->inc(l_osd_agent_skip
);
14787 if (obc
->is_blocked()) {
14788 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14789 osd
->logger
->inc(l_osd_agent_skip
);
14792 if (obc
->is_request_pending()) {
14793 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
14794 osd
->logger
->inc(l_osd_agent_skip
);
14798 // be careful flushing omap to an EC pool.
14799 if (!base_pool
->supports_omap() &&
14800 obc
->obs
.oi
.is_omap()) {
14801 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
14802 osd
->logger
->inc(l_osd_agent_skip
);
14806 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
14807 agent_maybe_evict(obc
, false))
14809 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
14810 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
14812 --agent_flush_quota
;
14814 if (started
>= start_max
) {
14815 // If finishing early, set "next" to the next object
14816 if (++p
!= ls
.end())
14822 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
14823 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
14824 agent_state
->hist_age
= 0;
14825 agent_state
->temp_hist
.decay();
14828 // Total objects operated on so far
14829 int total_started
= agent_state
->started
+ started
;
14830 bool need_delay
= false;
14832 dout(20) << __func__
<< " start pos " << agent_state
->position
14833 << " next start pos " << next
14834 << " started " << total_started
<< dendl
;
14836 // See if we've made a full pass over the object hash space
14837 // This might check at most ls_max objects a second time to notice that
14838 // we've checked every objects at least once.
14839 if (agent_state
->position
< agent_state
->start
&&
14840 next
>= agent_state
->start
) {
14841 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
14842 if (total_started
== 0)
14846 agent_state
->start
= next
;
14848 agent_state
->started
= total_started
;
14850 // See if we are starting from beginning
14852 agent_state
->position
= hobject_t();
14854 agent_state
->position
= next
;
14856 // Discard old in memory HitSets
14857 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
14860 ceph_assert(agent_state
->delaying
== false);
14864 agent_choose_mode();
14868 void PrimaryLogPG::agent_load_hit_sets()
14870 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
14874 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
14875 dout(10) << __func__
<< dendl
;
14876 for (auto p
= info
.hit_set
.history
.begin();
14877 p
!= info
.hit_set
.history
.end(); ++p
) {
14878 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
14879 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
14880 << p
->end
<< dendl
;
14881 if (!pool
.info
.is_replicated()) {
14882 // FIXME: EC not supported here yet
14883 derr
<< __func__
<< " on non-replicated pool" << dendl
;
14887 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14888 if (is_unreadable_object(oid
)) {
14889 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
14893 ObjectContextRef obc
= get_object_context(oid
, false);
14895 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
14901 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
14902 ceph_assert(r
>= 0);
14904 HitSetRef
hs(new HitSet
);
14905 bufferlist::const_iterator pbl
= bl
.begin();
14907 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
14913 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
14915 if (!obc
->obs
.oi
.is_dirty()) {
14916 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
14917 osd
->logger
->inc(l_osd_agent_skip
);
14920 if (obc
->obs
.oi
.is_cache_pinned()) {
14921 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14922 osd
->logger
->inc(l_osd_agent_skip
);
14926 utime_t now
= ceph_clock_now();
14927 utime_t ob_local_mtime
;
14928 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14929 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14931 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14933 bool evict_mode_full
=
14934 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
14935 if (!evict_mode_full
&&
14936 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
14937 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
14938 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14939 osd
->logger
->inc(l_osd_agent_skip
);
14943 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
14944 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
14945 osd
->logger
->inc(l_osd_agent_skip
);
14949 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
14951 // FIXME: flush anything dirty, regardless of what distribution of
14954 hobject_t oid
= obc
->obs
.oi
.soid
;
14955 osd
->agent_start_op(oid
);
14956 // no need to capture a pg ref, can't outlive fop or ctx
14957 std::function
<void()> on_flush
= [this, oid
]() {
14958 osd
->agent_finish_op(oid
);
14961 int result
= start_flush(
14962 OpRequestRef(), obc
, false, NULL
,
14964 if (result
!= -EINPROGRESS
) {
14966 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
14967 << " with " << result
<< dendl
;
14968 osd
->logger
->inc(l_osd_agent_skip
);
14972 osd
->logger
->inc(l_osd_agent_flush
);
14976 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
14978 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
14979 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
14980 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
14983 // This is already checked by agent_work() which passes after_flush = false
14984 if (after_flush
&& m_scrubber
->range_intersects_scrub(soid
, soid
.get_head())) {
14985 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14988 if (!obc
->obs
.oi
.watchers
.empty()) {
14989 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
14992 if (obc
->is_blocked()) {
14993 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14996 if (obc
->obs
.oi
.is_cache_pinned()) {
14997 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
15001 if (soid
.snap
== CEPH_NOSNAP
) {
15002 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
15004 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
15009 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
15010 // is this object old than cache_min_evict_age?
15011 utime_t now
= ceph_clock_now();
15012 utime_t ob_local_mtime
;
15013 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
15014 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
15016 ob_local_mtime
= obc
->obs
.oi
.mtime
;
15018 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
15019 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
15020 osd
->logger
->inc(l_osd_agent_skip
);
15023 // is this object old and/or cold enough?
15025 uint64_t temp_upper
= 0, temp_lower
= 0;
15027 agent_estimate_temp(soid
, &temp
);
15028 agent_state
->temp_hist
.add(temp
);
15029 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
15031 dout(20) << __func__
15032 << " temp " << temp
15033 << " pos " << temp_lower
<< "-" << temp_upper
15034 << ", evict_effort " << agent_state
->evict_effort
15036 dout(30) << "agent_state:\n";
15037 Formatter
*f
= Formatter::create("");
15038 f
->open_object_section("agent_state");
15039 agent_state
->dump(f
);
15040 f
->close_section();
15045 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
15049 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
15050 OpContextUPtr ctx
= simple_opc_create(obc
);
15052 auto null_op_req
= OpRequestRef();
15053 if (!ctx
->lock_manager
.get_lock_type(
15058 close_op_ctx(ctx
.release());
15059 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
15063 osd
->agent_start_evict_op();
15064 ctx
->register_on_finish(
15066 osd
->agent_finish_evict_op();
15069 ctx
->at_version
= get_next_version();
15070 ceph_assert(ctx
->new_obs
.exists
);
15071 int r
= _delete_oid(ctx
.get(), true, false);
15072 if (obc
->obs
.oi
.is_omap())
15073 ctx
->delta_stats
.num_objects_omap
--;
15074 ctx
->delta_stats
.num_evict
++;
15075 ctx
->delta_stats
.num_evict_kb
+= shift_round_up(obc
->obs
.oi
.size
, 10);
15076 if (obc
->obs
.oi
.is_dirty())
15077 --ctx
->delta_stats
.num_objects_dirty
;
15078 ceph_assert(r
== 0);
15079 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
);
15080 simple_opc_submit(std::move(ctx
));
15081 osd
->logger
->inc(l_osd_tier_evict
);
15082 osd
->logger
->inc(l_osd_agent_evict
);
15086 void PrimaryLogPG::agent_stop()
15088 dout(20) << __func__
<< dendl
;
15089 if (agent_state
&& !agent_state
->is_idle()) {
15090 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
15091 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
15092 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
15096 void PrimaryLogPG::agent_delay()
15098 dout(20) << __func__
<< dendl
;
15099 if (agent_state
&& !agent_state
->is_idle()) {
15100 ceph_assert(agent_state
->delaying
== false);
15101 agent_state
->delaying
= true;
15102 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
15106 void PrimaryLogPG::agent_choose_mode_restart()
15108 dout(20) << __func__
<< dendl
;
15109 std::scoped_lock locker
{*this};
15110 if (agent_state
&& agent_state
->delaying
) {
15111 agent_state
->delaying
= false;
15112 agent_choose_mode(true);
15116 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
15118 bool requeued
= false;
15119 // Let delay play out
15120 if (agent_state
->delaying
) {
15121 dout(20) << __func__
<< " " << this << " delaying, ignored" << dendl
;
15125 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
15126 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
15127 unsigned evict_effort
= 0;
15129 if (info
.stats
.stats_invalid
) {
15130 // idle; stats can't be trusted until we scrub.
15131 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
15136 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
15137 ceph_assert(divisor
> 0);
15139 // adjust (effective) user objects down based on the number
15140 // of HitSet objects, which should not count toward our total since
15141 // they cannot be flushed.
15142 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
15144 // also exclude omap objects if ec backing pool
15145 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
15146 ceph_assert(base_pool
);
15147 if (!base_pool
->supports_omap())
15148 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
15150 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
15151 if (num_user_objects
> unflushable
)
15152 num_user_objects
-= unflushable
;
15154 num_user_objects
= 0;
15156 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
15157 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
15158 num_user_bytes
-= unflushable_bytes
;
15159 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
15160 num_user_bytes
+= num_overhead_bytes
;
15162 // also reduce the num_dirty by num_objects_omap
15163 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
15164 if (!base_pool
->supports_omap()) {
15165 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
15166 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
15171 dout(10) << __func__
15173 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
15175 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
15176 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
15177 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
15178 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
15179 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
15180 << " num_dirty: " << num_dirty
15181 << " num_user_objects: " << num_user_objects
15182 << " num_user_bytes: " << num_user_bytes
15183 << " num_overhead_bytes: " << num_overhead_bytes
15184 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
15185 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
15188 // get dirty, full ratios
15189 uint64_t dirty_micro
= 0;
15190 uint64_t full_micro
= 0;
15191 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
15192 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
15194 num_dirty
* avg_size
* 1000000 /
15195 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
15197 num_user_objects
* avg_size
* 1000000 /
15198 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
15200 if (pool
.info
.target_max_objects
> 0) {
15201 uint64_t dirty_objects_micro
=
15202 num_dirty
* 1000000 /
15203 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
15204 if (dirty_objects_micro
> dirty_micro
)
15205 dirty_micro
= dirty_objects_micro
;
15206 uint64_t full_objects_micro
=
15207 num_user_objects
* 1000000 /
15208 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
15209 if (full_objects_micro
> full_micro
)
15210 full_micro
= full_objects_micro
;
15212 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
15213 << " full " << ((float)full_micro
/ 1000000.0)
15217 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
15218 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
15219 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
15220 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
15221 flush_target
+= flush_slop
;
15222 flush_high_target
+= flush_slop
;
15224 flush_target
-= std::min(flush_target
, flush_slop
);
15225 flush_high_target
-= std::min(flush_high_target
, flush_slop
);
15228 if (dirty_micro
> flush_high_target
) {
15229 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
15230 } else if (dirty_micro
> flush_target
|| (!flush_target
&& num_dirty
> 0)) {
15231 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
15235 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
15236 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
15237 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
15238 evict_target
+= evict_slop
;
15240 evict_target
-= std::min(evict_target
, evict_slop
);
15242 if (full_micro
> 1000000) {
15243 // evict anything clean
15244 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
15245 evict_effort
= 1000000;
15246 } else if (full_micro
> evict_target
) {
15247 // set effort in [0..1] range based on where we are between
15248 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
15249 uint64_t over
= full_micro
- evict_target
;
15250 uint64_t span
= 1000000 - evict_target
;
15251 evict_effort
= std::max(over
* 1000000 / span
,
15252 uint64_t(1000000.0 *
15253 cct
->_conf
->osd_agent_min_evict_effort
));
15255 // quantize effort to avoid too much reordering in the agent_queue.
15256 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
15257 ceph_assert(inc
> 0);
15258 uint64_t was
= evict_effort
;
15259 evict_effort
-= evict_effort
% inc
;
15260 if (evict_effort
< inc
)
15261 evict_effort
= inc
;
15262 ceph_assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
15263 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
15268 bool old_idle
= agent_state
->is_idle();
15269 if (flush_mode
!= agent_state
->flush_mode
) {
15270 dout(5) << __func__
<< " flush_mode "
15271 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
15273 << TierAgentState::get_flush_mode_name(flush_mode
)
15275 recovery_state
.update_stats(
15276 [=](auto &history
, auto &stats
) {
15277 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
15278 osd
->agent_inc_high_count();
15279 stats
.stats
.sum
.num_flush_mode_high
= 1;
15280 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
15281 stats
.stats
.sum
.num_flush_mode_low
= 1;
15283 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
15284 osd
->agent_dec_high_count();
15285 stats
.stats
.sum
.num_flush_mode_high
= 0;
15286 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
15287 stats
.stats
.sum
.num_flush_mode_low
= 0;
15291 agent_state
->flush_mode
= flush_mode
;
15293 if (evict_mode
!= agent_state
->evict_mode
) {
15294 dout(5) << __func__
<< " evict_mode "
15295 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
15297 << TierAgentState::get_evict_mode_name(evict_mode
)
15299 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
15303 requeue_ops(waiting_for_flush
);
15304 requeue_ops(waiting_for_active
);
15305 requeue_ops(waiting_for_readable
);
15306 requeue_ops(waiting_for_scrub
);
15307 requeue_ops(waiting_for_cache_not_full
);
15308 objects_blocked_on_cache_full
.clear();
15311 recovery_state
.update_stats(
15312 [=](auto &history
, auto &stats
) {
15313 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
15314 stats
.stats
.sum
.num_evict_mode_some
= 1;
15315 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
15316 stats
.stats
.sum
.num_evict_mode_full
= 1;
15318 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
15319 stats
.stats
.sum
.num_evict_mode_some
= 0;
15320 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
15321 stats
.stats
.sum
.num_evict_mode_full
= 0;
15325 agent_state
->evict_mode
= evict_mode
;
15327 uint64_t old_effort
= agent_state
->evict_effort
;
15328 if (evict_effort
!= agent_state
->evict_effort
) {
15329 dout(5) << __func__
<< " evict_effort "
15330 << ((float)agent_state
->evict_effort
/ 1000000.0)
15332 << ((float)evict_effort
/ 1000000.0)
15334 agent_state
->evict_effort
= evict_effort
;
15337 // NOTE: we are using evict_effort as a proxy for *all* agent effort
15338 // (including flush). This is probably fine (they should be
15339 // correlated) but it is not precisely correct.
15340 if (agent_state
->is_idle()) {
15341 if (!restart
&& !old_idle
) {
15342 osd
->agent_disable_pg(this, old_effort
);
15345 if (restart
|| old_idle
) {
15346 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
15347 } else if (old_effort
!= agent_state
->evict_effort
) {
15348 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
15354 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
15356 ceph_assert(hit_set
);
15359 if (hit_set
->contains(oid
))
15362 int last_n
= pool
.info
.hit_set_search_last_n
;
15363 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
15364 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
15365 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
15366 if (p
->second
->contains(oid
)) {
15367 *temp
+= pool
.info
.get_grade(i
);
15373 // Dup op detection
15375 bool PrimaryLogPG::already_complete(eversion_t v
)
15377 dout(20) << __func__
<< ": " << v
<< dendl
;
15378 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
15381 dout(20) << __func__
<< ": " << **i
<< dendl
;
15382 // skip copy from temp object ops
15383 if ((*i
)->v
== eversion_t()) {
15384 dout(20) << __func__
<< ": " << **i
15385 << " version is empty" << dendl
;
15389 dout(20) << __func__
<< ": " << **i
15390 << " (*i)->v past v" << dendl
;
15393 if (!(*i
)->all_committed
) {
15394 dout(20) << __func__
<< ": " << **i
15395 << " not committed, returning false"
15400 dout(20) << __func__
<< ": returning true" << dendl
;
15405 // ==========================================================================================
15408 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op
)
15410 dout(15) << __func__
<< " is scrub active? " << is_scrub_active() << dendl
;
15411 op
->mark_started();
15413 if (!is_scrub_active()) {
15414 dout(10) << __func__
<< " scrub isn't active" << dendl
;
15417 m_scrubber
->map_from_replica(op
);
15420 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t
& begin
,
15421 const hobject_t
& end
)
15423 pair
<hobject_t
, ObjectContextRef
> next
;
15424 next
.second
= object_contexts
.lookup(begin
);
15425 next
.first
= begin
;
15427 while (more
&& next
.first
< end
) {
15428 if (next
.second
&& next
.second
->is_blocked()) {
15429 next
.second
->requeue_scrub_on_unblock
= true;
15430 dout(10) << __func__
<< ": scrub delayed, "
15431 << next
.first
<< " is blocked"
15435 more
= object_contexts
.get_next(next
.first
, &next
);
15441 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpContext
*ctx
)
15443 OpRequestRef op
= ctx
->op
;
15444 // Only supports replicated pools
15445 ceph_assert(!pool
.info
.is_erasure());
15446 ceph_assert(is_primary());
15448 dout(10) << __func__
<< " " << soid
15449 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl
;
15452 block_for_clean(soid
, op
);
15456 ceph_assert(!recovery_state
.get_pg_log().get_missing().is_missing(soid
));
15457 auto& oi
= ctx
->new_obs
.oi
;
15458 eversion_t v
= oi
.version
;
15460 if (primary_error(soid
, v
)) {
15461 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
15462 // XXX: If we knew that there is no down osd which could include this
15463 // object, it would be nice if we could return EIO here.
15464 // If a "never fail" flag was available, that could be used
15465 // for rbd to NOT return EIO until object marked lost.
15467 // Drop through to save this op in case an osd comes up with the object.
15470 // Restart the op after object becomes readable again
15471 waiting_for_unreadable_object
[soid
].push_back(op
);
15472 op
->mark_delayed("waiting for missing object");
15474 ceph_assert(is_clean());
15475 state_set(PG_STATE_REPAIR
);
15476 state_clear(PG_STATE_CLEAN
);
15477 queue_peering_event(
15479 std::make_shared
<PGPeeringEvent
>(
15480 get_osdmap_epoch(),
15481 get_osdmap_epoch(),
15482 PeeringState::DoRecovery())));
15487 /*---SnapTrimmer Logging---*/
15489 #define dout_prefix pg->gen_prefix(*_dout)
15491 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
15493 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
15496 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
15498 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
15501 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15504 !pg
->is_scrub_queued_or_active() &&
15505 !pg
->snap_trimq
.empty();
15508 /*---SnapTrimmer states---*/
15510 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15511 << "SnapTrimmer state<" << get_state_name() << ">: ")
15514 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
15516 NamedState(nullptr, "NotTrimming")
15518 context
< SnapTrimmer
>().log_enter(state_name
);
15521 void PrimaryLogPG::NotTrimming::exit()
15523 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
15526 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
15528 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15529 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
15531 if (!(pg
->is_primary() && pg
->is_active())) {
15532 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
15533 return discard_event();
15535 if (!pg
->is_clean() ||
15536 pg
->snap_trimq
.empty()) {
15537 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
15538 return discard_event();
15540 if (pg
->is_scrub_queued_or_active()) {
15541 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
15542 return transit
< WaitScrub
>();
15544 return transit
< Trimming
>();
15548 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
15550 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15551 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
15554 if (!context
< SnapTrimmer
>().can_trim()) {
15555 post_event(KickTrim());
15556 return transit
< NotTrimming
>();
15559 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
15560 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
15561 << pg
->snap_trimq
.range_start()
15563 return transit
< AwaitAsyncWork
>();
15566 /* AwaitAsyncWork */
15567 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
15569 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15571 auto *pg
= context
< SnapTrimmer
>().pg
;
15572 context
< SnapTrimmer
>().log_enter(state_name
);
15573 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
15574 pg
->state_set(PG_STATE_SNAPTRIM
);
15575 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
15576 pg
->publish_stats_to_osd();
15579 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
15581 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
15582 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
15583 auto &in_flight
= context
<Trimming
>().in_flight
;
15584 ceph_assert(in_flight
.empty());
15586 ceph_assert(pg
->is_primary() && pg
->is_active());
15587 if (!context
< SnapTrimmer
>().can_trim()) {
15588 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
15589 post_event(KickTrim());
15590 return transit
< NotTrimming
>();
15593 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
15595 vector
<hobject_t
> to_trim
;
15596 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
15597 to_trim
.reserve(max
);
15598 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
15602 if (r
!= 0 && r
!= -ENOENT
) {
15603 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
15604 << cpp_strerror(r
) << dendl
;
15605 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15606 } else if (r
== -ENOENT
) {
15608 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
15610 pg
->snap_trimq
.erase(snap_to_trim
);
15612 if (pg
->snap_trimq_repeat
.count(snap_to_trim
)) {
15613 ldout(pg
->cct
, 10) << " removing from snap_trimq_repeat" << dendl
;
15614 pg
->snap_trimq_repeat
.erase(snap_to_trim
);
15616 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
15617 << " to purged_snaps"
15619 ObjectStore::Transaction t
;
15620 pg
->recovery_state
.adjust_purged_snaps(
15621 [snap_to_trim
](auto &purged_snaps
) {
15622 purged_snaps
.insert(snap_to_trim
);
15624 pg
->write_if_dirty(t
);
15626 ldout(pg
->cct
, 10) << "purged_snaps now "
15627 << pg
->info
.purged_snaps
<< ", snap_trimq now "
15628 << pg
->snap_trimq
<< dendl
;
15630 int tr
= pg
->osd
->store
->queue_transaction(pg
->ch
, std::move(t
), NULL
);
15631 ceph_assert(tr
== 0);
15633 pg
->recovery_state
.share_pg_info();
15635 post_event(KickTrim());
15636 return transit
< NotTrimming
>();
15638 ceph_assert(!to_trim
.empty());
15640 for (auto &&object
: to_trim
) {
15642 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
15644 int error
= pg
->trim_object(in_flight
.empty(), object
, snap_to_trim
, &ctx
);
15646 if (error
== -ENOLCK
) {
15647 ldout(pg
->cct
, 10) << "could not get write lock on obj "
15648 << object
<< dendl
;
15650 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
15651 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
15653 if (!in_flight
.empty()) {
15654 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
15655 return transit
< WaitRepops
>();
15657 if (error
== -ENOLCK
) {
15658 ldout(pg
->cct
, 10) << "waiting for it to clear"
15660 return transit
< WaitRWLock
>();
15662 return transit
< NotTrimming
>();
15666 in_flight
.insert(object
);
15667 ctx
->register_on_success(
15668 [pg
, object
, &in_flight
]() {
15669 ceph_assert(in_flight
.find(object
) != in_flight
.end());
15670 in_flight
.erase(object
);
15671 if (in_flight
.empty()) {
15672 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
15673 pg
->snap_trimmer_machine
.process_event(Reset());
15675 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
15680 pg
->simple_opc_submit(std::move(ctx
));
15683 return transit
< WaitRepops
>();
15686 void PrimaryLogPG::setattr_maybe_cache(
15687 ObjectContextRef obc
,
15692 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
15695 void PrimaryLogPG::setattrs_maybe_cache(
15696 ObjectContextRef obc
,
15698 map
<string
, bufferlist
, less
<>> &attrs
)
15700 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
15703 void PrimaryLogPG::rmattr_maybe_cache(
15704 ObjectContextRef obc
,
15708 t
->rmattr(obc
->obs
.oi
.soid
, key
);
15711 int PrimaryLogPG::getattr_maybe_cache(
15712 ObjectContextRef obc
,
15716 if (pool
.info
.is_erasure()) {
15717 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
15718 if (i
!= obc
->attr_cache
.end()) {
15726 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
15729 int PrimaryLogPG::getattrs_maybe_cache(
15730 ObjectContextRef obc
,
15731 map
<string
, bufferlist
, less
<>> *out
)
15735 if (pool
.info
.is_erasure()) {
15736 *out
= obc
->attr_cache
;
15738 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
15740 map
<string
, bufferlist
, less
<>> tmp
;
15741 for (auto& [key
, val
]: *out
) {
15742 if (key
.size() > 1 && key
[0] == '_') {
15743 tmp
[key
.substr(1, key
.size())] = std::move(val
);
15750 bool PrimaryLogPG::check_failsafe_full() {
15751 return osd
->check_failsafe_full(get_dpp());
15754 bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t
& oid
)
15756 return m_scrubber
->write_blocked_by_scrub(oid
);
15759 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
15760 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
15762 #ifdef PG_DEBUG_REFS
15763 uint64_t get_with_id(PrimaryLogPG
*pg
) { return pg
->get_with_id(); }
15764 void put_with_id(PrimaryLogPG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
15767 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
15768 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }