// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#include <boost/intrusive_ptr.hpp>
#include <boost/tuple/tuple.hpp>

#include "PrimaryLogPG.h"

#include "cls/cas/cls_cas_ops.h"
#include "common/CDC.h"
#include "common/EventTrace.h"
#include "common/ceph_crypto.h"
#include "common/config.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/scrub_types.h"
#include "include/compat.h"
#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_value.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDScrubReserve.h"
#include "mon/MonClient.h"
#include "objclass/objclass.h"
#include "osd/ClassHandler.h"
#include "osdc/Objecter.h"
#include "osd/scrubber/PrimaryLogScrub.h"
#include "osd/scrubber/ScrubStore.h"
#include "osd/scrubber/pg_scrubber.h"

#include "OpRequest.h"

// required includes order:
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"
71 #include "tracing/osd.h"
73 #define tracepoint(...)
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
80 #define dout_prefix _prefix(_dout, this)
82 #include "osd_tracer.h"
84 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
91 using std::make_unique
;
93 using std::ostringstream
;
96 using std::string_view
;
97 using std::stringstream
;
98 using std::unique_ptr
;
101 using ceph::bufferlist
;
102 using ceph::bufferptr
;
103 using ceph::Formatter
;
105 using ceph::decode_noclear
;
107 using ceph::encode_destructively
;
109 using namespace ceph::osd::scheduler
;
110 using TOPNSPC::common::cmd_getval
;
111 using TOPNSPC::common::cmd_getval_or
;
113 template <typename T
>
114 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
115 return pg
->gen_prefix(*_dout
);
119 * The CopyCallback class defines an interface for completions to the
120 * copy_start code. Users of the copy infrastructure must implement
121 * one and give an instance of the class to start_copy.
123 * The implementer is responsible for making sure that the CopyCallback
124 * can associate itself with the correct copy operation.
126 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
130 * results.get<0>() is the return code: 0 for success; -ECANCELED if
131 * the operation was cancelled by the local OSD; -errno for other issues.
132 * results.get<1>() is a pointer to a CopyResults object, which you are
133 * responsible for deleting.
135 void finish(CopyCallbackResults results_
) override
= 0;
138 /// Provide the final size of the copied object to the CopyCallback
139 ~CopyCallback() override
{}
142 template <typename T
>
143 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
145 unique_ptr
<GenContext
<T
>> c
;
148 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
149 : pg(pg
), c(c
), e(e
) {}
150 void finish(T t
) override
{
151 std::scoped_lock locker
{*pg
};
152 if (pg
->pg_has_reset_since(e
))
155 c
.release()->complete(t
);
157 bool sync_finish(T t
) {
158 // we assume here all blessed/wrapped Contexts can complete synchronously.
159 c
.release()->complete(t
);
164 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
165 GenContext
<ThreadPool::TPHandle
&> *c
) {
166 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
167 this, c
, get_osdmap_epoch());
170 template <typename T
>
171 class PrimaryLogPG::UnlockedBlessedGenContext
: public GenContext
<T
> {
173 unique_ptr
<GenContext
<T
>> c
;
176 UnlockedBlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
177 : pg(pg
), c(c
), e(e
) {}
178 void finish(T t
) override
{
179 if (pg
->pg_has_reset_since(e
))
182 c
.release()->complete(t
);
184 bool sync_finish(T t
) {
185 // we assume here all blessed/wrapped Contexts can complete synchronously.
186 c
.release()->complete(t
);
191 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_unlocked_gencontext(
192 GenContext
<ThreadPool::TPHandle
&> *c
) {
193 return new UnlockedBlessedGenContext
<ThreadPool::TPHandle
&>(
194 this, c
, get_osdmap_epoch());
197 class PrimaryLogPG::BlessedContext
: public Context
{
199 unique_ptr
<Context
> c
;
202 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
203 : pg(pg
), c(c
), e(e
) {}
204 void finish(int r
) override
{
205 std::scoped_lock locker
{*pg
};
206 if (pg
->pg_has_reset_since(e
))
209 c
.release()->complete(r
);
211 bool sync_finish(int r
) override
{
212 // we assume here all blessed/wrapped Contexts can complete synchronously.
213 c
.release()->complete(r
);
218 Context
*PrimaryLogPG::bless_context(Context
*c
) {
219 return new BlessedContext(this, c
, get_osdmap_epoch());
222 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
226 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
228 void finish(int r
) override
{
229 pg
->object_context_destructor_callback(obc
);
233 struct OnReadComplete
: public Context
{
235 PrimaryLogPG::OpContext
*opcontext
;
238 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
239 void finish(int r
) override
{
240 opcontext
->finish_read(pg
);
242 ~OnReadComplete() override
{}
245 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
247 ObjectContextRef obc
;
249 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
251 bool sync_finish(int r
) override
{
252 pg
->_applied_recovered_object(obc
);
255 void finish(int r
) override
{
256 std::scoped_lock locker
{*pg
};
257 pg
->_applied_recovered_object(obc
);
261 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
264 eversion_t last_complete
;
266 C_OSD_CommittedPushedObject(
267 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
268 pg(p
), epoch(epoch
), last_complete(lc
) {
270 void finish(int r
) override
{
271 pg
->_committed_pushed_object(epoch
, last_complete
);
275 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
278 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
280 bool sync_finish(int r
) override
{
281 pg
->_applied_recovered_object_replica();
284 void finish(int r
) override
{
285 std::scoped_lock locker
{*pg
};
286 pg
->_applied_recovered_object_replica();
/// Kick off all queued async reads for this op context; OnReadComplete will
/// re-enter finish_read() when the backend completes them.
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  // NOTE(review): the first two arguments were dropped by the extraction and
  // reconstructed (target soid + the swapped read list) — verify upstream.
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
302 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
304 ceph_assert(inflightreads
> 0);
306 if (async_reads_complete()) {
307 ceph_assert(pg
->in_progress_async_reads
.size());
308 ceph_assert(pg
->in_progress_async_reads
.front().second
== this);
309 pg
->in_progress_async_reads
.pop_front();
311 // Restart the op context now that all reads have been
312 // completed. Read failures will be handled by the op finisher
313 pg
->execute_ctx(this);
317 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
319 PrimaryLogPG::CopyResults
*results
= nullptr;
320 PrimaryLogPG::OpContext
*ctx
;
322 uint32_t truncate_seq
;
323 uint64_t truncate_size
;
324 bool have_truncate
= false;
326 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
327 : ctx(ctx
), osd_op(osd_op
) {
329 ~CopyFromCallback() override
{}
331 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
332 results
= results_
.get
<1>();
333 int r
= results_
.get
<0>();
335 // Only use truncate_{seq,size} from the original object if the client
336 // did not sent us these parameters
337 if (!have_truncate
) {
338 truncate_seq
= results
->truncate_seq
;
339 truncate_size
= results
->truncate_size
;
342 // for finish_copyfrom
343 ctx
->user_at_version
= results
->user_version
;
346 ctx
->pg
->execute_ctx(ctx
);
348 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
350 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
351 } else if (results
->should_requeue
) {
353 ctx
->pg
->requeue_op(ctx
->op
);
355 ctx
->pg
->close_op_ctx(ctx
);
359 bool is_temp_obj_used() {
360 return results
->started_temp_obj
;
362 uint64_t get_data_size() {
363 return results
->object_size
;
365 void set_truncate(uint32_t seq
, uint64_t size
) {
367 truncate_size
= size
;
368 have_truncate
= true;
372 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
373 CopyFromCallback
*copy_from_callback
;
375 explicit CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
376 : copy_from_callback(copy_from_callback
) {
379 int execute() override
{
380 // instance will be destructed after this method completes
381 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
386 // ======================
387 // PGBackend::Listener
389 void PrimaryLogPG::on_local_recover(
390 const hobject_t
&hoid
,
391 const ObjectRecoveryInfo
&_recovery_info
,
392 ObjectContextRef obc
,
394 ObjectStore::Transaction
*t
397 dout(10) << __func__
<< ": " << hoid
<< dendl
;
399 ObjectRecoveryInfo
recovery_info(_recovery_info
);
400 clear_object_snap_mapping(t
, hoid
);
401 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
402 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
404 dout(20) << " snapset " << recovery_info
.ss
<< dendl
;
405 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
406 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
407 snaps
.insert(p
->second
.begin(), p
->second
.end());
408 dout(20) << " snaps " << snaps
<< dendl
;
414 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
417 if (!is_delete
&& recovery_state
.get_pg_log().get_missing().is_missing(recovery_info
.soid
) &&
418 recovery_state
.get_pg_log().get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
419 ceph_assert(is_primary());
420 const pg_log_entry_t
*latest
= recovery_state
.get_pg_log().get_log().objects
.find(recovery_info
.soid
)->second
;
421 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
422 latest
->reverting_to
== recovery_info
.version
) {
423 dout(10) << " got old revert version " << recovery_info
.version
424 << " for " << *latest
<< dendl
;
425 recovery_info
.version
= latest
->version
;
426 // update the attr to the revert event version
427 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
428 recovery_info
.oi
.version
= latest
->version
;
430 encode(recovery_info
.oi
, bl
,
431 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
432 ceph_assert(!pool
.info
.is_erasure());
433 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
435 obc
->attr_cache
[OI_ATTR
] = bl
;
439 // keep track of active pushes for scrub
442 recovery_state
.recover_got(
444 recovery_info
.version
,
450 obc
->obs
.exists
= true;
452 bool got
= obc
->get_recovery_read();
455 ceph_assert(recovering
.count(obc
->obs
.oi
.soid
));
456 recovering
[obc
->obs
.oi
.soid
] = obc
;
457 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
460 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
462 publish_stats_to_osd();
463 release_backoffs(hoid
);
464 if (!is_unreadable_object(hoid
)) {
465 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
466 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
467 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
468 requeue_ops(unreadable_object_entry
->second
);
469 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
473 t
->register_on_applied(
474 new C_OSD_AppliedRecoveredObjectReplica(this));
478 t
->register_on_commit(
479 new C_OSD_CommittedPushedObject(
482 info
.last_complete
));
485 void PrimaryLogPG::on_global_recover(
486 const hobject_t
&soid
,
487 const object_stat_sum_t
&stat_diff
,
490 recovery_state
.object_recovered(soid
, stat_diff
);
491 publish_stats_to_osd();
492 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
493 auto i
= recovering
.find(soid
);
494 ceph_assert(i
!= recovering
.end());
496 if (i
->second
&& i
->second
->rwstate
.recovery_read_marker
) {
497 // recover missing won't have had an obc, but it gets filled in
498 // during on_local_recover
499 ceph_assert(i
->second
);
500 list
<OpRequestRef
> requeue_list
;
501 i
->second
->drop_recovery_read(&requeue_list
);
502 requeue_ops(requeue_list
);
505 backfills_in_flight
.erase(soid
);
508 finish_recovery_op(soid
);
509 release_backoffs(soid
);
510 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
511 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
512 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
513 requeue_ops(degraded_object_entry
->second
);
514 waiting_for_degraded_object
.erase(degraded_object_entry
);
516 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
517 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
518 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
519 requeue_ops(unreadable_object_entry
->second
);
520 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
522 finish_degraded_object(soid
);
525 void PrimaryLogPG::schedule_recovery_work(
526 GenContext
<ThreadPool::TPHandle
&> *c
,
529 osd
->queue_recovery_context(
531 recovery_state
.get_recovery_op_priority());
534 void PrimaryLogPG::replica_clear_repop_obc(
535 const vector
<pg_log_entry_t
> &logv
,
536 ObjectStore::Transaction
&t
)
538 for (auto &&e
: logv
) {
539 /* Have to blast all clones, they share a snapset */
540 object_contexts
.clear_range(
541 e
.soid
.get_object_boundary(), e
.soid
.get_head());
543 snapset_contexts
.find(e
.soid
.get_head()) ==
544 snapset_contexts
.end());
548 bool PrimaryLogPG::should_send_op(
550 const hobject_t
&hoid
) {
551 if (peer
== get_primary())
553 ceph_assert(recovery_state
.has_peer_info(peer
));
555 hoid
.pool
!= (int64_t)info
.pgid
.pool() ||
556 hoid
<= last_backfill_started
||
557 hoid
<= recovery_state
.get_peer_info(peer
).last_backfill
;
559 ceph_assert(is_backfill_target(peer
));
560 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
561 << ", object " << hoid
562 << " beyond std::max(last_backfill_started "
563 << ", peer_info[peer].last_backfill "
564 << recovery_state
.get_peer_info(peer
).last_backfill
568 if (is_async_recovery_target(peer
) &&
569 recovery_state
.get_peer_missing(peer
).is_missing(hoid
)) {
571 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
572 << ", object " << hoid
573 << " which is pending recovery in async_recovery_targets" << dendl
;
579 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
580 int peer
, epoch_t from_epoch
)
582 return osd
->get_con_osd_cluster(peer
, from_epoch
);
585 PerfCounters
*PrimaryLogPG::get_logger()
591 // ====================
594 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
596 return recovery_state
.get_pg_log().get_missing().get_items().count(soid
);
599 void PrimaryLogPG::maybe_kick_recovery(
600 const hobject_t
&soid
)
603 bool work_started
= false;
604 if (!recovery_state
.get_missing_loc().needs_recovery(soid
, &v
))
607 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
608 if (p
!= recovering
.end()) {
609 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
610 } else if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
611 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
613 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
614 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
615 if (is_missing_object(soid
)) {
616 recover_missing(soid
, v
, CEPH_MSG_PRIO_HIGH
, h
);
617 } else if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
618 prep_object_replica_deletes(soid
, v
, h
, &work_started
);
620 prep_object_replica_pushes(soid
, v
, h
, &work_started
);
622 pgbackend
->run_recovery_op(h
, CEPH_MSG_PRIO_HIGH
);
626 void PrimaryLogPG::wait_for_unreadable_object(
627 const hobject_t
& soid
, OpRequestRef op
)
629 ceph_assert(is_unreadable_object(soid
));
630 maybe_kick_recovery(soid
);
631 waiting_for_unreadable_object
[soid
].push_back(op
);
632 op
->mark_delayed("waiting for missing object");
633 osd
->logger
->inc(l_osd_op_delayed_unreadable
);
636 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
638 /* The conditions below may clear (on_local_recover, before we queue
639 * the transaction) before we actually requeue the degraded waiters
640 * in on_global_recover after the transaction completes.
642 if (waiting_for_degraded_object
.count(soid
))
644 if (recovery_state
.get_pg_log().get_missing().get_items().count(soid
))
646 ceph_assert(!get_acting_recovery_backfill().empty());
647 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
648 i
!= get_acting_recovery_backfill().end();
650 if (*i
== get_primary()) continue;
651 pg_shard_t peer
= *i
;
652 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(peer
);
653 // If an object is missing on an async_recovery_target, return false.
654 // This will not block the op and the object is async recovered later.
655 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
656 peer_missing_entry
->second
.get_items().count(soid
)) {
657 if (is_async_recovery_target(peer
))
662 // Object is degraded if after last_backfill AND
663 // we are backfilling it
664 if (is_backfill_target(peer
) &&
665 recovery_state
.get_peer_info(peer
).last_backfill
<= soid
&&
666 last_backfill_started
>= soid
&&
667 backfills_in_flight
.count(soid
))
673 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t
& soid
)
675 for (auto &i
: get_async_recovery_targets()) {
676 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(i
);
677 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
678 peer_missing_entry
->second
.get_items().count(soid
)) {
679 dout(30) << __func__
<< " " << soid
<< dendl
;
686 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
688 ceph_assert(is_degraded_or_backfilling_object(soid
) || is_degraded_on_async_recovery_target(soid
));
690 maybe_kick_recovery(soid
);
691 waiting_for_degraded_object
[soid
].push_back(op
);
692 op
->mark_delayed("waiting for degraded object");
693 osd
->logger
->inc(l_osd_op_delayed_degraded
);
696 void PrimaryLogPG::block_write_on_full_cache(
697 const hobject_t
& _oid
, OpRequestRef op
)
699 const hobject_t oid
= _oid
.get_head();
700 dout(20) << __func__
<< ": blocking object " << oid
701 << " on full cache" << dendl
;
702 objects_blocked_on_cache_full
.insert(oid
);
703 waiting_for_cache_not_full
.push_back(op
);
704 op
->mark_delayed("waiting for cache not full");
707 void PrimaryLogPG::block_for_clean(
708 const hobject_t
& oid
, OpRequestRef op
)
710 dout(20) << __func__
<< ": blocking object " << oid
711 << " on primary repair" << dendl
;
712 waiting_for_clean_to_primary_repair
.push_back(op
);
713 op
->mark_delayed("waiting for clean to repair");
716 void PrimaryLogPG::block_write_on_snap_rollback(
717 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
719 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
720 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
721 // otherwise, we'd have blocked in do_op
722 ceph_assert(oid
.is_head());
723 ceph_assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
725 * We block the head object here.
727 * Let's assume that there is racing read When the head object is being rollbacked.
728 * Since the two different ops can trigger promote_object() with the same source,
729 * infinite loop happens by canceling ops each other.
730 * To avoid this, we block the head object during rollback.
731 * So, the racing read will be blocked until the rollback is completed.
732 * see also: https://tracker.ceph.com/issues/49726
734 ObjectContextRef head_obc
= get_object_context(oid
, false);
735 head_obc
->start_block();
736 objects_blocked_on_snap_promotion
[oid
] = obc
;
737 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
740 void PrimaryLogPG::block_write_on_degraded_snap(
741 const hobject_t
& snap
, OpRequestRef op
)
743 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
744 << " on degraded snap " << snap
<< dendl
;
745 // otherwise, we'd have blocked in do_op
746 ceph_assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
747 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
748 wait_for_degraded_object(snap
, op
);
751 bool PrimaryLogPG::maybe_await_blocked_head(
752 const hobject_t
&hoid
,
755 ObjectContextRef obc
;
756 obc
= object_contexts
.lookup(hoid
.get_head());
758 if (obc
->is_blocked()) {
759 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
768 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
770 dout(10) << __func__
<< " " << soid
<< " " << *op
->get_req() << dendl
;
771 waiting_for_blocked_object
[soid
].push_back(op
);
772 op
->mark_delayed("waiting for blocked object");
775 void PrimaryLogPG::maybe_force_recovery()
777 // no force if not in degraded/recovery/backfill states
778 if (!is_degraded() &&
779 !state_test(PG_STATE_RECOVERING
|
780 PG_STATE_RECOVERY_WAIT
|
781 PG_STATE_BACKFILLING
|
782 PG_STATE_BACKFILL_WAIT
|
783 PG_STATE_BACKFILL_TOOFULL
))
786 if (recovery_state
.get_pg_log().get_log().approx_size() <
787 cct
->_conf
->osd_max_pg_log_entries
*
788 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
791 // find the oldest missing object
792 version_t min_version
= recovery_state
.get_pg_log().get_log().head
.version
;
794 if (!recovery_state
.get_pg_log().get_missing().get_rmissing().empty()) {
795 min_version
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->first
;
796 soid
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->second
;
798 ceph_assert(!get_acting_recovery_backfill().empty());
799 for (set
<pg_shard_t
>::iterator it
= get_acting_recovery_backfill().begin();
800 it
!= get_acting_recovery_backfill().end();
802 if (*it
== get_primary()) continue;
803 pg_shard_t peer
= *it
;
804 auto it_missing
= recovery_state
.get_peer_missing().find(peer
);
805 if (it_missing
!= recovery_state
.get_peer_missing().end() &&
806 !it_missing
->second
.get_rmissing().empty()) {
807 const auto& min_obj
= recovery_state
.get_peer_missing(peer
).get_rmissing().begin();
808 dout(20) << __func__
<< " peer " << peer
<< " min_version " << min_obj
->first
809 << " oid " << min_obj
->second
<< dendl
;
810 if (min_version
> min_obj
->first
) {
811 min_version
= min_obj
->first
;
812 soid
= min_obj
->second
;
818 if (soid
!= hobject_t())
819 maybe_kick_recovery(soid
);
822 bool PrimaryLogPG::check_laggy(OpRequestRef
& op
)
824 assert(HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
826 if (state_test(PG_STATE_WAIT
)) {
827 dout(10) << __func__
<< " PG is WAIT state" << dendl
;
828 } else if (!state_test(PG_STATE_LAGGY
)) {
829 auto mnow
= osd
->get_mnow();
830 auto ru
= recovery_state
.get_readable_until();
837 << " > readable_until " << ru
<< dendl
;
840 osd
->reply_op_error(op
, -EAGAIN
);
845 state_set(PG_STATE_LAGGY
);
846 publish_stats_to_osd();
848 dout(10) << __func__
<< " not readable" << dendl
;
849 waiting_for_readable
.push_back(op
);
850 op
->mark_delayed("waiting for readable");
854 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef
& op
)
856 assert(HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
858 if (!state_test(PG_STATE_WAIT
) && !state_test(PG_STATE_LAGGY
)) {
859 return true; // not laggy
861 dout(10) << __func__
<< " not readable" << dendl
;
862 waiting_for_readable
.push_front(op
);
863 op
->mark_delayed("waiting for readable");
867 void PrimaryLogPG::recheck_readable()
869 if (!is_wait() && !is_laggy()) {
870 dout(20) << __func__
<< " wasn't wait or laggy" << dendl
;
873 auto mnow
= osd
->get_mnow();
876 auto prior_readable_until_ub
= recovery_state
.get_prior_readable_until_ub();
877 if (mnow
< prior_readable_until_ub
) {
878 dout(10) << __func__
<< " still wait (mnow " << mnow
879 << " < prior_readable_until_ub " << prior_readable_until_ub
882 dout(10) << __func__
<< " no longer wait (mnow " << mnow
883 << " >= prior_readable_until_ub " << prior_readable_until_ub
885 state_clear(PG_STATE_WAIT
);
886 recovery_state
.clear_prior_readable_until_ub();
891 auto ru
= recovery_state
.get_readable_until();
892 if (ru
== ceph::signedspan::zero()) {
893 dout(10) << __func__
<< " still laggy (mnow " << mnow
894 << ", readable_until zero)" << dendl
;
895 } else if (mnow
>= ru
) {
896 dout(10) << __func__
<< " still laggy (mnow " << mnow
897 << " >= readable_until " << ru
<< ")" << dendl
;
899 dout(10) << __func__
<< " no longer laggy (mnow " << mnow
900 << " < readable_until " << ru
<< ")" << dendl
;
901 state_clear(PG_STATE_LAGGY
);
906 publish_stats_to_osd();
908 if (!is_laggy() && !is_wait()) {
909 requeue_ops(waiting_for_readable
);
913 bool PrimaryLogPG::pgls_filter(const PGLSFilter
& filter
, const hobject_t
& sobj
)
917 // If filter has expressed an interest in an xattr, load it.
918 if (!filter
.get_xattr().empty()) {
919 int ret
= pgbackend
->objects_get_attr(
923 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
.get_xattr() << ") returned " << ret
<< dendl
;
925 if (ret
!= -ENODATA
|| filter
.reject_empty_xattr()) {
931 return filter
.filter(sobj
, bl
);
934 std::pair
<int, std::unique_ptr
<const PGLSFilter
>>
935 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator
& iter
)
938 // storing non-const PGLSFilter for the sake of ::init()
939 std::unique_ptr
<PGLSFilter
> filter
;
944 catch (ceph::buffer::error
& e
) {
945 return { -EINVAL
, nullptr };
948 if (type
.compare("plain") == 0) {
949 filter
= std::make_unique
<PGLSPlainFilter
>();
951 std::size_t dot
= type
.find('.');
952 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
953 return { -EINVAL
, nullptr };
956 const std::string class_name
= type
.substr(0, dot
);
957 const std::string filter_name
= type
.substr(dot
+ 1);
958 ClassHandler::ClassData
*cls
= NULL
;
959 int r
= ClassHandler::get_instance().open_class(class_name
, &cls
);
961 derr
<< "Error opening class '" << class_name
<< "': "
962 << cpp_strerror(r
) << dendl
;
963 if (r
!= -EPERM
) // propagate permission error
965 return { r
, nullptr };
970 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
971 if (class_filter
== NULL
) {
972 derr
<< "Error finding filter '" << filter_name
<< "' in class "
973 << class_name
<< dendl
;
974 return { -EINVAL
, nullptr };
976 filter
.reset(class_filter
->fn());
978 // Object classes are obliged to return us something, but let's
979 // give an error rather than asserting out.
980 derr
<< "Buggy class " << class_name
<< " failed to construct "
981 "filter " << filter_name
<< dendl
;
982 return { -EINVAL
, nullptr };
987 int r
= filter
->init(iter
);
989 derr
<< "Error initializing filter " << type
<< ": "
990 << cpp_strerror(r
) << dendl
;
991 return { -EINVAL
, nullptr };
993 // Successfully constructed and initialized, return it.
994 return std::make_pair(0, std::move(filter
));
999 // ==========================================================
1001 void PrimaryLogPG::do_command(
1002 const string_view
& orig_prefix
,
1003 const cmdmap_t
& cmdmap
,
1004 const bufferlist
& idata
,
1005 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
1008 cmd_getval(cmdmap
, "format", format
);
1009 auto f(Formatter::create_unique(format
, "json-pretty", "json-pretty"));
1011 stringstream ss
; // stderr error message stream
1012 bufferlist outbl
; // if empty at end, we'll dump formatter as output
1014 // get final prefix:
1015 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
1016 // - ceph tell <pgid> foo -> prefix=foo
1017 string
prefix(orig_prefix
);
1019 cmd_getval(cmdmap
, "cmd", command
);
1020 if (command
.size()) {
1024 if (prefix
== "query") {
1025 f
->open_object_section("pg");
1026 f
->dump_stream("snap_trimq") << snap_trimq
;
1027 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
1028 recovery_state
.dump_peering_state(f
.get());
1030 f
->open_array_section("recovery_state");
1031 handle_query_state(f
.get());
1034 if (is_primary() && is_active() && m_scrubber
) {
1035 m_scrubber
->dump_scrubber(f
.get(), m_planned_scrub
);
1038 f
->open_object_section("agent_state");
1040 agent_state
->dump(f
.get());
1045 else if (prefix
== "log") {
1047 f
->open_object_section("op_log");
1048 f
->open_object_section("pg_log_t");
1049 recovery_state
.get_pg_log().get_log().dump(f
.get());
1053 else if (prefix
== "mark_unfound_lost") {
1055 cmd_getval(cmdmap
, "mulcmd", mulcmd
);
1057 if (mulcmd
== "revert") {
1058 if (pool
.info
.is_erasure()) {
1059 ss
<< "mode must be 'delete' for ec pool";
1063 mode
= pg_log_entry_t::LOST_REVERT
;
1064 } else if (mulcmd
== "delete") {
1065 mode
= pg_log_entry_t::LOST_DELETE
;
1067 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1071 ceph_assert(mode
== pg_log_entry_t::LOST_REVERT
||
1072 mode
== pg_log_entry_t::LOST_DELETE
);
1074 if (!is_primary()) {
1075 ss
<< "not primary";
1080 uint64_t unfound
= recovery_state
.get_missing_loc().num_unfound();
1082 ss
<< "pg has no unfound objects";
1083 goto out
; // make command idempotent
1086 if (!recovery_state
.all_unfound_are_queried_or_lost(get_osdmap())) {
1087 ss
<< "pg has " << unfound
1088 << " unfound objects but we haven't probed all sources, not marking lost";
1093 mark_all_unfound_lost(mode
, on_finish
);
1097 else if (prefix
== "list_unfound") {
1100 bool show_offset
= false;
1101 if (cmd_getval(cmdmap
, "offset", offset_json
)) {
1102 json_spirit::Value v
;
1104 if (!json_spirit::read(offset_json
, v
))
1105 throw std::runtime_error("bad json");
1107 } catch (std::runtime_error
& e
) {
1108 ss
<< "error parsing offset: " << e
.what();
1114 f
->open_object_section("missing");
1116 f
->open_object_section("offset");
1117 offset
.dump(f
.get());
1120 auto &needs_recovery_map
= recovery_state
.get_missing_loc()
1121 .get_needs_recovery();
1122 f
->dump_int("num_missing", needs_recovery_map
.size());
1123 f
->dump_int("num_unfound", get_num_unfound());
1124 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1125 needs_recovery_map
.upper_bound(offset
);
1127 f
->open_array_section("objects");
1129 for (; p
!= needs_recovery_map
.end() &&
1130 num
< cct
->_conf
->osd_command_max_records
;
1132 if (recovery_state
.get_missing_loc().is_unfound(p
->first
)) {
1133 f
->open_object_section("object");
1135 f
->open_object_section("oid");
1136 p
->first
.dump(f
.get());
1139 p
->second
.dump(f
.get()); // have, need keys
1141 f
->open_array_section("locations");
1142 for (auto &&r
: recovery_state
.get_missing_loc().get_locations(
1144 f
->dump_stream("shard") << r
;
1154 // Get possible locations of missing objects from pg information
1155 PeeringState::QueryUnfound
q(f
.get());
1156 recovery_state
.handle_event(q
, 0);
1157 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1161 else if (prefix
== "scrub" ||
1162 prefix
== "deep_scrub") {
1163 bool deep
= (prefix
== "deep_scrub");
1164 int64_t time
= cmd_getval_or
<int64_t>(cmdmap
, "time", 0);
1167 const pg_pool_t
*p
= &pool
.info
;
1168 double pool_scrub_max_interval
= 0;
1169 double scrub_max_interval
;
1171 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
1172 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1173 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
1175 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
1176 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1177 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
1179 // Instead of marking must_scrub force a schedule scrub
1180 utime_t stamp
= ceph_clock_now();
1182 stamp
-= scrub_max_interval
;
1184 stamp
-= (float)time
;
1185 stamp
-= 100.0; // push back last scrub more for good measure
1187 set_last_deep_scrub_stamp(stamp
);
1189 set_last_scrub_stamp(stamp
); // for 'deep' as well, as we use this value to order scrubs
1190 f
->open_object_section("result");
1191 f
->dump_bool("deep", deep
);
1192 f
->dump_stream("stamp") << stamp
;
1195 ss
<< "Not primary";
1198 outbl
.append(ss
.str());
1201 else if (prefix
== "block" || prefix
== "unblock" || prefix
== "set" ||
1202 prefix
== "unset") {
1204 cmd_getval(cmdmap
, "value", value
);
1207 ret
= m_scrubber
->asok_debug(prefix
, value
, f
.get(), ss
);
1208 f
->open_object_section("result");
1209 f
->dump_bool("success", true);
1212 ss
<< "Not primary";
1215 outbl
.append(ss
.str());
1219 ss
<< "prefix '" << prefix
<< "' not implemented";
1223 if (ret
>= 0 && outbl
.length() == 0) {
1226 on_finish(ret
, ss
.str(), outbl
);
1230 // ==========================================================
1232 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1234 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1235 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1236 dout(10) << "do_pg_op " << *m
<< dendl
;
1241 string cname
, mname
;
1243 snapid_t snapid
= m
->get_snapid();
1245 vector
<OSDOp
> ops
= m
->ops
;
1247 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1248 std::unique_ptr
<const PGLSFilter
> filter
;
1250 auto bp
= p
->indata
.cbegin();
1252 case CEPH_OSD_OP_PGNLS_FILTER
:
1257 catch (const ceph::buffer::error
& e
) {
1258 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1262 std::tie(result
, filter
) = get_pgls_filter(bp
);
1266 ceph_assert(filter
);
1270 case CEPH_OSD_OP_PGNLS
:
1271 if (snapid
!= CEPH_NOSNAP
) {
1275 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1276 dout(10) << " pgnls pg=" << m
->get_pg()
1277 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1278 << " != " << info
.pgid
<< dendl
;
1281 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1284 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
1286 // read into a buffer
1287 vector
<hobject_t
> sentries
;
1288 pg_nls_response_t response
;
1290 decode(response
.handle
, bp
);
1292 catch (const ceph::buffer::error
& e
) {
1293 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1299 hobject_t lower_bound
= response
.handle
;
1300 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1301 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1302 dout(10) << " pgnls lower_bound " << lower_bound
1303 << " pg_end " << pg_end
<< dendl
;
1304 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1305 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1306 // this should only happen with a buggy client.
1307 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1313 hobject_t current
= lower_bound
;
1314 int r
= pgbackend
->objects_list_partial(
1325 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1326 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1327 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1328 hobject_t _max
= hobject_t::get_max();
1330 const hobject_t
&mcand
=
1331 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1333 missing_iter
->first
;
1334 const hobject_t
&lcand
=
1335 ls_iter
== sentries
.end() ?
1339 hobject_t candidate
;
1340 if (mcand
== lcand
) {
1342 if (!mcand
.is_max()) {
1346 } else if (mcand
< lcand
) {
1348 ceph_assert(!mcand
.is_max());
1352 ceph_assert(!lcand
.is_max());
1356 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1357 << " vs lower bound 0x" << lower_bound
.get_hash()
1358 << std::dec
<< dendl
;
1360 if (candidate
>= next
) {
1364 if (response
.entries
.size() == list_size
) {
1369 if (candidate
.snap
!= CEPH_NOSNAP
)
1372 // skip internal namespace
1373 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1376 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1379 // skip wrong namespace
1380 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1381 candidate
.get_namespace() != m
->get_hobj().nspace
)
1384 if (filter
&& !pgls_filter(*filter
, candidate
))
1387 dout(20) << "pgnls item 0x" << std::hex
1388 << candidate
.get_hash()
1389 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1391 << candidate
.oid
.name
<< dendl
;
1393 librados::ListObjectImpl item
;
1394 item
.nspace
= candidate
.get_namespace();
1395 item
.oid
= candidate
.oid
.name
;
1396 item
.locator
= candidate
.get_key();
1397 response
.entries
.push_back(item
);
1400 if (next
.is_max() &&
1401 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1402 ls_iter
== sentries
.end()) {
1405 // Set response.handle to the start of the next PG according
1406 // to the object sort order.
1407 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1409 response
.handle
= next
;
1411 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1412 encode(response
, osd_op
.outdata
);
1413 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1414 << osd_op
.outdata
.length() << dendl
;
1418 case CEPH_OSD_OP_PGLS_FILTER
:
1423 catch (const ceph::buffer::error
& e
) {
1424 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1428 std::tie(result
, filter
) = get_pgls_filter(bp
);
1432 ceph_assert(filter
);
1436 case CEPH_OSD_OP_PGLS
:
1437 if (snapid
!= CEPH_NOSNAP
) {
1441 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1442 dout(10) << " pgls pg=" << m
->get_pg()
1443 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1444 << " != " << info
.pgid
<< dendl
;
1447 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1450 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1451 // read into a buffer
1452 vector
<hobject_t
> sentries
;
1453 pg_ls_response_t response
;
1455 decode(response
.handle
, bp
);
1457 catch (const ceph::buffer::error
& e
) {
1458 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1464 hobject_t current
= response
.handle
;
1465 int r
= pgbackend
->objects_list_partial(
1476 ceph_assert(snapid
== CEPH_NOSNAP
|| recovery_state
.get_pg_log().get_missing().get_items().empty());
1478 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1479 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1480 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1481 hobject_t _max
= hobject_t::get_max();
1483 const hobject_t
&mcand
=
1484 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1486 missing_iter
->first
;
1487 const hobject_t
&lcand
=
1488 ls_iter
== sentries
.end() ?
1492 hobject_t candidate
;
1493 if (mcand
== lcand
) {
1495 if (!mcand
.is_max()) {
1499 } else if (mcand
< lcand
) {
1501 ceph_assert(!mcand
.is_max());
1505 ceph_assert(!lcand
.is_max());
1509 if (candidate
>= next
) {
1513 if (response
.entries
.size() == list_size
) {
1518 if (candidate
.snap
!= CEPH_NOSNAP
)
1521 // skip wrong namespace
1522 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1525 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1528 if (filter
&& !pgls_filter(*filter
, candidate
))
1531 response
.entries
.push_back(make_pair(candidate
.oid
,
1532 candidate
.get_key()));
1534 if (next
.is_max() &&
1535 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1536 ls_iter
== sentries
.end()) {
1539 response
.handle
= next
;
1540 encode(response
, osd_op
.outdata
);
1541 dout(10) << " pgls result=" << result
<< " outdata.length()="
1542 << osd_op
.outdata
.length() << dendl
;
1546 case CEPH_OSD_OP_PG_HITSET_LS
:
1548 list
< pair
<utime_t
,utime_t
> > ls
;
1549 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1550 p
!= info
.hit_set
.history
.end();
1552 ls
.push_back(make_pair(p
->begin
, p
->end
));
1554 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1555 encode(ls
, osd_op
.outdata
);
1559 case CEPH_OSD_OP_PG_HITSET_GET
:
1561 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1562 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1563 // read the current in-memory HitSet, not the version we've
1569 encode(*hit_set
, osd_op
.outdata
);
1570 result
= osd_op
.outdata
.length();
1572 // read an archived HitSet.
1574 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1575 p
!= info
.hit_set
.history
.end();
1577 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1578 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1582 if (oid
== hobject_t()) {
1586 if (!pool
.info
.is_replicated()) {
1587 // FIXME: EC not supported yet
1588 result
= -EOPNOTSUPP
;
1591 if (is_unreadable_object(oid
)) {
1592 wait_for_unreadable_object(oid
, op
);
1595 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1600 case CEPH_OSD_OP_SCRUBLS
:
1601 result
= do_scrub_ls(m
, &osd_op
);
1614 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(),
1615 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1617 reply
->claim_op_out_data(ops
);
1618 reply
->set_result(result
);
1619 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1620 osd
->send_message_osd_client(reply
, m
->get_connection());
1623 int PrimaryLogPG::do_scrub_ls(const MOSDOp
*m
, OSDOp
*osd_op
)
1625 if (m
->get_pg() != info
.pgid
.pgid
) {
1626 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1627 return -EINVAL
; // hmm?
1629 auto bp
= osd_op
->indata
.cbegin();
1633 } catch (ceph::buffer::error
&) {
1634 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1639 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1641 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1644 bool store_queried
= m_scrubber
&& m_scrubber
->get_store_errors(arg
, result
);
1645 if (store_queried
) {
1646 encode(result
, osd_op
->outdata
);
1648 // the scrubber's store is not initialized
1657 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
1659 * @param ctx [in,out] ctx to get locks for
1660 * @return true on success, false if we are queued
1662 bool PrimaryLogPG::get_rw_locks(bool write_ordered
, OpContext
*ctx
)
1664 /* If head_obc, !obc->obs->exists and we will always take the
1665 * snapdir lock *before* the head lock. Since all callers will do
1666 * this (read or write) if we get the first we will be guaranteed
1667 * to get the second.
1669 if (write_ordered
&& ctx
->op
->may_read()) {
1670 ctx
->lock_type
= RWState::RWEXCL
;
1671 } else if (write_ordered
) {
1672 ctx
->lock_type
= RWState::RWWRITE
;
1674 ceph_assert(ctx
->op
->may_read());
1675 ctx
->lock_type
= RWState::RWREAD
;
1678 if (ctx
->head_obc
) {
1679 ceph_assert(!ctx
->obc
->obs
.exists
);
1680 if (!ctx
->lock_manager
.get_lock_type(
1682 ctx
->head_obc
->obs
.oi
.soid
,
1685 ctx
->lock_type
= RWState::RWNONE
;
1689 if (ctx
->lock_manager
.get_lock_type(
1691 ctx
->obc
->obs
.oi
.soid
,
1696 ceph_assert(!ctx
->head_obc
);
1697 ctx
->lock_type
= RWState::RWNONE
;
1705 * @param manager [in] manager with locks to release
1707 void PrimaryLogPG::release_object_locks(
1708 ObcLockManager
&lock_manager
) {
1709 std::list
<std::pair
<ObjectContextRef
, std::list
<OpRequestRef
> > > to_req
;
1710 bool requeue_recovery
= false;
1711 bool requeue_snaptrim
= false;
1712 lock_manager
.put_locks(
1716 if (requeue_recovery
)
1718 if (requeue_snaptrim
)
1719 snap_trimmer_machine
.process_event(TrimWriteUnblocked());
1721 if (!to_req
.empty()) {
1722 // requeue at front of scrub blocking queue if we are blocked by scrub
1723 for (auto &&p
: to_req
) {
1724 if (m_scrubber
->write_blocked_by_scrub(p
.first
->obs
.oi
.soid
.get_head())) {
1725 for (auto& op
: p
.second
) {
1726 op
->mark_delayed("waiting for scrub");
1729 waiting_for_scrub
.splice(
1730 waiting_for_scrub
.begin(),
1734 } else if (is_laggy()) {
1735 for (auto& op
: p
.second
) {
1736 op
->mark_delayed("waiting for readable");
1738 waiting_for_readable
.splice(
1739 waiting_for_readable
.begin(),
1744 requeue_ops(p
.second
);
1750 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1751 const PGPool
&_pool
,
1752 const map
<string
,string
>& ec_profile
, spg_t p
) :
1753 PG(o
, curmap
, _pool
, p
),
1755 PGBackend::build_pg_backend(
1756 _pool
.info
, ec_profile
, this, coll_t(p
), ch
, o
->store
, cct
)),
1757 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1758 new_backfill(false),
1760 snap_trimmer_machine(this)
1762 recovery_state
.set_backend_predicates(
1763 pgbackend
->get_is_readable_predicate(),
1764 pgbackend
->get_is_recoverable_predicate());
1765 snap_trimmer_machine
.initiate();
1767 m_scrubber
= make_unique
<PrimaryLogScrub
>(this);
1770 PrimaryLogPG::~PrimaryLogPG()
1775 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1778 if (oloc
.key
.empty())
1779 src_oloc
.key
= oid
.name
;
1782 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1784 auto m
= op
->get_req
<MOSDBackoff
>();
1785 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1788 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1789 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1790 if (begin
< m
->begin
) {
1796 dout(10) << __func__
<< " backoff ack id " << m
->id
1797 << " [" << begin
<< "," << end
<< ")" << dendl
;
1798 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1801 void PrimaryLogPG::do_request(
1803 ThreadPool::TPHandle
&handle
)
1805 if (op
->osd_trace
) {
1806 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1807 op
->pg_trace
.event("do request");
1811 // make sure we have a new enough map
1812 auto p
= waiting_for_map
.find(op
->get_source());
1813 if (p
!= waiting_for_map
.end()) {
1814 // preserve ordering
1815 dout(20) << __func__
<< " waiting_for_map "
1816 << p
->first
<< " not empty, queueing" << dendl
;
1817 p
->second
.push_back(op
);
1818 op
->mark_delayed("waiting_for_map not empty");
1821 if (!have_same_or_newer_map(op
->min_epoch
)) {
1822 dout(20) << __func__
<< " min " << op
->min_epoch
1823 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1824 waiting_for_map
[op
->get_source()].push_back(op
);
1825 op
->mark_delayed("op must wait for map");
1826 osd
->request_osdmap_update(op
->min_epoch
);
1830 if (can_discard_request(op
)) {
1835 const Message
*m
= op
->get_req();
1836 int msg_type
= m
->get_type();
1837 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1838 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1841 if (msg_type
== CEPH_MSG_OSD_OP
) {
1842 if (session
->check_backoff(cct
, info
.pgid
,
1843 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1850 (!is_active() && is_peered());
1851 if (g_conf()->osd_backoff_on_peering
&& !backoff
) {
1857 add_pg_backoff(session
);
1861 // pg backoff acks at pg-level
1862 if (msg_type
== CEPH_MSG_OSD_BACKOFF
) {
1863 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1864 if (ba
->begin
!= ba
->end
) {
1872 // Delay unless PGBackend says it's ok
1873 if (pgbackend
->can_handle_while_inactive(op
)) {
1874 bool handled
= pgbackend
->handle_message(op
);
1875 ceph_assert(handled
);
1878 waiting_for_peered
.push_back(op
);
1879 op
->mark_delayed("waiting for peered");
1884 if (recovery_state
.needs_flush()) {
1885 dout(20) << "waiting for flush on " << *op
->get_req() << dendl
;
1886 waiting_for_flush
.push_back(op
);
1887 op
->mark_delayed("waiting for flush");
1891 ceph_assert(is_peered() && !recovery_state
.needs_flush());
1892 if (pgbackend
->handle_message(op
))
1896 case CEPH_MSG_OSD_OP
:
1897 case CEPH_MSG_OSD_BACKOFF
:
1899 dout(20) << " peered, not active, waiting for active on "
1900 << *op
->get_req() << dendl
;
1901 waiting_for_active
.push_back(op
);
1902 op
->mark_delayed("waiting for active");
1906 case CEPH_MSG_OSD_OP
:
1907 // verify client features
1908 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1909 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1910 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1915 case CEPH_MSG_OSD_BACKOFF
:
1916 // object-level backoff acks handled in osdop context
1922 case MSG_OSD_PG_SCAN
:
1923 do_scan(op
, handle
);
1926 case MSG_OSD_PG_BACKFILL
:
1930 case MSG_OSD_PG_BACKFILL_REMOVE
:
1931 do_backfill_remove(op
);
1934 case MSG_OSD_SCRUB_RESERVE
:
1937 osd
->reply_op_error(op
, -EAGAIN
);
1940 auto m
= op
->get_req
<MOSDScrubReserve
>();
1942 case MOSDScrubReserve::REQUEST
:
1943 m_scrubber
->handle_scrub_reserve_request(op
);
1945 case MOSDScrubReserve::GRANT
:
1946 m_scrubber
->handle_scrub_reserve_grant(op
, m
->from
);
1948 case MOSDScrubReserve::REJECT
:
1949 m_scrubber
->handle_scrub_reserve_reject(op
, m
->from
);
1951 case MOSDScrubReserve::RELEASE
:
1952 m_scrubber
->handle_scrub_reserve_release(op
);
1958 case MSG_OSD_REP_SCRUB
:
1959 replica_scrub(op
, handle
);
1962 case MSG_OSD_REP_SCRUBMAP
:
1963 do_replica_scrub_map(op
);
1966 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1967 do_update_log_missing(op
);
1970 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1971 do_update_log_missing_reply(op
);
1975 ceph_abort_msg("bad message type in do_request");
1979 /** do_op - do an op
1980 * pg lock will be held (if multithreaded)
1981 * osd_lock NOT held.
1983 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1986 // NOTE: take a non-const pointer here; we must be careful not to
1987 // change anything that will break other reads on m (operator<<).
1988 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1989 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1990 if (m
->finish_decode()) {
1991 op
->reset_desc(); // for TrackedOp
1995 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1997 const hobject_t head
= m
->get_hobj().get_head();
1999 if (!info
.pgid
.pgid
.contains(
2000 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
2001 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
2002 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
2003 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
2004 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
2006 ceph_assert(!cct
->_conf
->osd_debug_misdirected_ops
);
2011 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
2012 ceph::ref_t
<Session
> session
;
2014 session
= static_cast<Session
*>(m
->get_connection()->get_priv().get());
2015 if (!session
.get()) {
2016 dout(10) << __func__
<< " no session" << dendl
;
2020 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
2025 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
2027 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
2028 osd
->reply_op_error(op
, -EINVAL
);
2033 int r
= op
->maybe_init_op_info(*get_osdmap());
2035 osd
->reply_op_error(op
, r
);
2040 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
2041 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
2043 !(op
->may_write() || op
->may_cache())) {
2044 // balanced reads; any replica will do
2045 if (!(is_primary() || is_nonprimary())) {
2046 osd
->handle_misdirected_op(this, op
);
2050 // normal case; must be primary
2051 if (!is_primary()) {
2052 osd
->handle_misdirected_op(this, op
);
2057 if (!check_laggy(op
)) {
2061 if (!op_has_sufficient_caps(op
)) {
2062 osd
->reply_op_error(op
, -EPERM
);
2066 if (op
->includes_pg_op()) {
2067 return do_pg_op(op
);
2070 // object name too long?
2071 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
2072 dout(4) << "do_op name is longer than "
2073 << cct
->_conf
->osd_max_object_name_len
2074 << " bytes" << dendl
;
2075 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2078 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
2079 dout(4) << "do_op locator is longer than "
2080 << cct
->_conf
->osd_max_object_name_len
2081 << " bytes" << dendl
;
2082 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2085 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
2086 dout(4) << "do_op namespace is longer than "
2087 << cct
->_conf
->osd_max_object_namespace_len
2088 << " bytes" << dendl
;
2089 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2092 if (m
->get_hobj().oid
.name
.empty()) {
2093 dout(4) << "do_op empty oid name is not allowed" << dendl
;
2094 osd
->reply_op_error(op
, -EINVAL
);
2098 if (int r
= osd
->store
->validate_hobject_key(head
)) {
2099 dout(4) << "do_op object " << head
<< " invalid for backing store: "
2101 osd
->reply_op_error(op
, r
);
2106 if (get_osdmap()->is_blocklisted(m
->get_source_addr())) {
2107 dout(10) << "do_op " << m
->get_source_addr() << " is blocklisted" << dendl
;
2108 osd
->reply_op_error(op
, -EBLOCKLISTED
);
2112 // order this op as a write?
2113 bool write_ordered
= op
->rwordered();
2115 // discard due to cluster full transition? (we discard any op that
2116 // originates before the cluster or pool is marked full; the client
2117 // will resend after the full flag is removed or if they expect the
2118 // op to succeed despite being full). The except is FULL_FORCE and
2119 // FULL_TRY ops, which there is no reason to discard because they
2120 // bypass all full checks anyway. If this op isn't write or
2121 // read-ordered, we skip.
2122 // FIXME: we exclude mds writes for now.
2123 if (write_ordered
&& !(m
->get_source().is_mds() ||
2124 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
2125 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
2126 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
2127 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
2131 // mds should have stopped writing before this point.
2132 // We can't allow OSD to become non-startable even if mds
2133 // could be writing as part of file removals.
2134 if (write_ordered
&& osd
->check_failsafe_full(get_dpp()) &&
2135 !m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
2136 dout(10) << __func__
<< " fail-safe full check failed, dropping request." << dendl
;
2139 int64_t poolid
= get_pgid().pool();
2140 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
2144 if (pi
->has_flag(pg_pool_t::FLAG_EIO
)) {
2145 // drop op on the floor; the client will handle returning EIO
2146 if (m
->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO
)) {
2147 dout(10) << __func__
<< " discarding op due to pool EIO flag" << dendl
;
2149 dout(10) << __func__
<< " replying EIO due to pool EIO flag" << dendl
;
2150 osd
->reply_op_error(op
, -EIO
);
2154 if (op
->may_write()) {
2157 if (m
->get_snapid() != CEPH_NOSNAP
) {
2158 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
2159 osd
->reply_op_error(op
, -EINVAL
);
2164 if (cct
->_conf
->osd_max_write_size
&&
2165 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
2166 // journal can't hold commit!
2167 derr
<< "do_op msg data len " << m
->get_data_len()
2168 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
2169 << " on " << *m
<< dendl
;
2170 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
2175 dout(10) << "do_op " << *m
2176 << (op
->may_write() ? " may_write" : "")
2177 << (op
->may_read() ? " may_read" : "")
2178 << (op
->may_cache() ? " may_cache" : "")
2179 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2180 << " flags " << ceph_osd_flag_string(m
->get_flags())
2185 if (is_unreadable_object(head
)) {
2186 if (!is_primary()) {
2187 osd
->reply_op_error(op
, -EAGAIN
);
2191 (g_conf()->osd_backoff_on_degraded
||
2192 (g_conf()->osd_backoff_on_unfound
&&
2193 recovery_state
.get_missing_loc().is_unfound(head
)))) {
2194 add_backoff(session
, head
, head
);
2195 maybe_kick_recovery(head
);
2197 wait_for_unreadable_object(head
, op
);
2202 if (write_ordered
) {
2204 if (is_degraded_or_backfilling_object(head
)) {
2205 if (can_backoff
&& g_conf()->osd_backoff_on_degraded
) {
2206 add_backoff(session
, head
, head
);
2207 maybe_kick_recovery(head
);
2209 wait_for_degraded_object(head
, op
);
2214 if (m_scrubber
->is_scrub_active() && m_scrubber
->write_blocked_by_scrub(head
)) {
2215 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2216 waiting_for_scrub
.push_back(op
);
2217 op
->mark_delayed("waiting for scrub");
2220 if (!check_laggy_requeue(op
)) {
2225 if (auto blocked_iter
= objects_blocked_on_degraded_snap
.find(head
);
2226 blocked_iter
!= std::end(objects_blocked_on_degraded_snap
)) {
2227 hobject_t
to_wait_on(head
);
2228 to_wait_on
.snap
= blocked_iter
->second
;
2229 wait_for_degraded_object(to_wait_on
, op
);
2232 if (auto blocked_snap_promote_iter
= objects_blocked_on_snap_promotion
.find(head
);
2233 blocked_snap_promote_iter
!= std::end(objects_blocked_on_snap_promotion
)) {
2234 wait_for_blocked_object(blocked_snap_promote_iter
->second
->obs
.oi
.soid
, op
);
2237 if (objects_blocked_on_cache_full
.count(head
)) {
2238 block_write_on_full_cache(head
, op
);
2244 if (op
->may_write() || op
->may_cache()) {
2245 // warning: we will get back *a* request for this reqid, but not
2246 // necessarily the most recent. this happens with flush and
2247 // promote ops, but we can't possible have both in our log where
2248 // the original request is still not stable on disk, so for our
2249 // purposes here it doesn't matter which one we get.
2251 version_t user_version
;
2252 int return_code
= 0;
2253 vector
<pg_log_op_return_item_t
> op_returns
;
2254 bool got
= check_in_progress_op(
2255 m
->get_reqid(), &version
, &user_version
, &return_code
, &op_returns
);
2257 dout(3) << __func__
<< " dup " << m
->get_reqid()
2258 << " version " << version
<< dendl
;
2259 if (already_complete(version
)) {
2260 osd
->reply_op_error(op
, return_code
, version
, user_version
, op_returns
);
2262 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2263 // always queue ondisk waiters, so that we can requeue if needed
2264 waiting_for_ondisk
[version
].emplace_back(op
, user_version
, return_code
,
2266 op
->mark_delayed("waiting for ondisk");
2272 ObjectContextRef obc
;
2273 bool can_create
= op
->may_write();
2274 hobject_t missing_oid
;
2276 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2277 const hobject_t
& oid
=
2278 m
->get_snapid() == CEPH_SNAPDIR
? head
: m
->get_hobj();
2280 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2281 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2284 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
) {
2285 if (m
->get_snapid() != CEPH_SNAPDIR
) {
2286 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2287 osd
->reply_op_error(op
, -EINVAL
);
2291 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2292 dout(10) << "non-LIST_SNAPS on snapdir" << dendl
;
2293 osd
->reply_op_error(op
, -EINVAL
);
2299 // io blocked on obc?
2300 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2301 maybe_await_blocked_head(oid
, op
)) {
2305 if (!is_primary()) {
2306 if (!recovery_state
.can_serve_replica_read(oid
)) {
2307 dout(20) << __func__
2308 << ": unstable write on replica, bouncing to primary "
2310 osd
->reply_op_error(op
, -EAGAIN
);
2313 dout(20) << __func__
<< ": serving replica read on oid " << oid
2317 int r
= find_object_context(
2318 oid
, &obc
, can_create
,
2319 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2322 // LIST_SNAPS needs the ssc too
2324 m
->get_snapid() == CEPH_SNAPDIR
&&
2326 obc
->ssc
= get_snapset_context(oid
, true);
2330 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2331 // we have to wait for the object.
2333 // missing the specific snap we need; requeue and wait.
2334 ceph_assert(!op
->may_write()); // only happens on a read/cache
2335 wait_for_unreadable_object(missing_oid
, op
);
2338 } else if (r
== 0) {
2339 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2340 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2341 << " is unreadable, waiting" << dendl
;
2342 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2346 // degraded object? (the check above was for head; this could be a clone)
2347 if (write_ordered
&&
2348 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2349 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2350 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2351 << " is degraded, waiting" << dendl
;
2352 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2357 bool in_hit_set
= false;
2360 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2363 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2366 if (!op
->hitset_inserted
) {
2367 hit_set
->insert(oid
);
2368 op
->hitset_inserted
= true;
2369 if (hit_set
->is_full() ||
2370 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2377 if (agent_choose_mode(false, op
))
2381 if (obc
.get() && obc
->obs
.exists
) {
2382 if (recover_adjacent_clones(obc
, op
)) {
2385 if (maybe_handle_manifest(op
,
2391 if (maybe_handle_cache(op
,
2400 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2401 // copy the reqids for copy get on ENOENT
2403 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2404 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2407 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2408 if (op
->may_write() &&
2409 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2410 record_write_error(op
, oid
, nullptr, r
);
2412 osd
->reply_op_error(op
, r
);
2417 // make sure locator is consistent
2418 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2419 if (m
->get_object_locator() != oloc
) {
2420 dout(10) << " provided locator " << m
->get_object_locator()
2421 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2422 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2423 << " on object " << oloc
2427 // io blocked on obc?
2428 if (obc
->is_blocked() &&
2429 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2430 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2434 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2436 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2438 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2439 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2440 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2441 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2443 // verify there is in fact a flush in progress
2444 // FIXME: we could make this a stronger test.
2445 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2446 if (p
== flush_ops
.end()) {
2447 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2448 reply_ctx(ctx
, -EINVAL
);
2451 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2452 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2453 op
->mark_delayed("waiting for rw locks");
2457 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2460 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2461 if (op
->may_write() &&
2462 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2463 record_write_error(op
, oid
, nullptr, r
,
2464 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
2466 osd
->reply_op_error(op
, r
);
2472 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2473 ctx
->ignore_cache
= true;
2476 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2477 // This object is lost. Reading from it returns an error.
2478 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2479 << " is lost" << dendl
;
2480 reply_ctx(ctx
, -ENFILE
);
2483 if (!op
->may_write() &&
2485 (!obc
->obs
.exists
||
2486 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2487 obc
->obs
.oi
.is_whiteout()))) {
2488 // copy the reqids for copy get on ENOENT
2489 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2490 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2494 reply_ctx(ctx
, -ENOENT
);
2501 utime_t prepare_latency
= ceph_clock_now();
2502 prepare_latency
-= op
->get_dequeued_time();
2503 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2504 if (op
->may_read() && op
->may_write()) {
2505 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2506 } else if (op
->may_read()) {
2507 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2508 } else if (op
->may_write() || op
->may_cache()) {
2509 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2512 // force recovery of the oldest missing object if too many logs
2513 maybe_force_recovery();
2516 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2519 ObjectContextRef obc
)
2522 dout(20) << __func__
<< ": no obc " << dendl
;
2523 return cache_result_t::NOOP
;
2526 if (!obc
->obs
.oi
.has_manifest()) {
2527 dout(20) << __func__
<< ": " << obc
->obs
.oi
.soid
2528 << " is not manifest object " << dendl
;
2529 return cache_result_t::NOOP
;
2531 if (op
->get_req
<MOSDOp
>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2532 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2533 return cache_result_t::NOOP
;
2536 // if it is write-ordered and blocked, stop now
2537 if (obc
->is_blocked() && write_ordered
) {
2538 // we're already doing something with this object
2539 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2540 return cache_result_t::NOOP
;
2543 vector
<OSDOp
> ops
= op
->get_req
<MOSDOp
>()->ops
;
2544 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2546 ceph_osd_op
& op
= osd_op
.op
;
2547 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
||
2548 op
.op
== CEPH_OSD_OP_SET_CHUNK
||
2549 op
.op
== CEPH_OSD_OP_UNSET_MANIFEST
||
2550 op
.op
== CEPH_OSD_OP_TIER_PROMOTE
||
2551 op
.op
== CEPH_OSD_OP_TIER_FLUSH
||
2552 op
.op
== CEPH_OSD_OP_TIER_EVICT
||
2553 op
.op
== CEPH_OSD_OP_ISDIRTY
) {
2554 return cache_result_t::NOOP
;
2558 switch (obc
->obs
.oi
.manifest
.type
) {
2559 case object_manifest_t::TYPE_REDIRECT
:
2560 if (op
->may_write() || write_ordered
) {
2561 do_proxy_write(op
, obc
);
2564 if (obc
->obs
.oi
.size
!= 0) {
2565 return cache_result_t::NOOP
;
2567 do_proxy_read(op
, obc
);
2569 return cache_result_t::HANDLED_PROXY
;
2570 case object_manifest_t::TYPE_CHUNKED
:
2572 if (can_proxy_chunked_read(op
, obc
)) {
2573 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2574 if (p
!= flush_ops
.end()) {
2575 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, true);
2576 return cache_result_t::HANDLED_PROXY
;
2578 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, write_ordered
);
2579 return cache_result_t::HANDLED_PROXY
;
2582 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2583 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
2584 hobject_t head
= m
->get_hobj();
2586 if (is_degraded_or_backfilling_object(head
)) {
2587 dout(20) << __func__
<< ": " << head
<< " is degraded, waiting" << dendl
;
2588 wait_for_degraded_object(head
, op
);
2589 return cache_result_t::BLOCKED_RECOVERY
;
2592 if (m_scrubber
->write_blocked_by_scrub(head
)) {
2593 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2594 waiting_for_scrub
.push_back(op
);
2595 op
->mark_delayed("waiting for scrub");
2596 return cache_result_t::BLOCKED_RECOVERY
;
2598 if (!check_laggy_requeue(op
)) {
2599 return cache_result_t::BLOCKED_RECOVERY
;
2602 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2603 if (p
.second
.is_missing()) {
2604 auto m
= op
->get_req
<MOSDOp
>();
2605 const object_locator_t oloc
= m
->get_object_locator();
2606 promote_object(obc
, obc
->obs
.oi
.soid
, oloc
, op
, NULL
);
2607 return cache_result_t::BLOCKED_PROMOTE
;
2610 return cache_result_t::NOOP
;
2613 ceph_abort_msg("unrecognized manifest type");
2616 return cache_result_t::NOOP
;
2619 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2620 MOSDOpReply
*orig_reply
, int r
,
2621 OpContext
*ctx_for_op_returns
)
2623 dout(20) << __func__
<< " r=" << r
<< dendl
;
2624 ceph_assert(op
->may_write());
2625 const osd_reqid_t
&reqid
= op
->get_req
<MOSDOp
>()->get_reqid();
2626 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2627 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2628 get_next_version(), eversion_t(), 0,
2629 reqid
, utime_t(), r
));
2630 if (ctx_for_op_returns
) {
2631 entries
.back().set_op_returns(*ctx_for_op_returns
->ops
);
2632 dout(20) << __func__
<< " op_returns=" << entries
.back().op_returns
<< dendl
;
2638 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2643 MOSDOpReply
*orig_reply
,
2646 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2649 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2650 auto m
= op
->get_req
<MOSDOp
>();
2651 MOSDOpReply
*reply
= orig_reply
.detach();
2652 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2653 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2657 ObcLockManager lock_manager
;
2660 std::move(lock_manager
),
2661 std::optional
<std::function
<void(void)> >(
2662 OnComplete(this, op
, orig_reply
, r
)),
2667 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2670 ObjectContextRef obc
,
2671 int r
, hobject_t missing_oid
,
2674 ObjectContextRef
*promote_obc
)
2676 // return quickly if caching is not enabled
2677 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2678 return cache_result_t::NOOP
;
2682 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2683 (op
->get_req
<MOSDOp
>()->get_flags() &
2684 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2685 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2686 return cache_result_t::NOOP
;
2689 must_promote
= must_promote
|| op
->need_promote();
2692 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2693 << (obc
->obs
.exists
? "exists" : "DNE")
2694 << " missing_oid " << missing_oid
2695 << " must_promote " << (int)must_promote
2696 << " in_hit_set " << (int)in_hit_set
2699 dout(25) << __func__
<< " (no obc)"
2700 << " missing_oid " << missing_oid
2701 << " must_promote " << (int)must_promote
2702 << " in_hit_set " << (int)in_hit_set
2705 // if it is write-ordered and blocked, stop now
2706 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2707 // we're already doing something with this object
2708 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2709 return cache_result_t::NOOP
;
2712 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2713 // we know this object is logically absent (e.g., an undefined clone)
2714 return cache_result_t::NOOP
;
2717 if (obc
.get() && obc
->obs
.exists
) {
2718 osd
->logger
->inc(l_osd_op_cache_hit
);
2719 return cache_result_t::NOOP
;
2721 if (!is_primary()) {
2722 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2723 osd
->reply_op_error(op
, -EAGAIN
);
2724 return cache_result_t::REPLIED_WITH_EAGAIN
;
2727 if (missing_oid
== hobject_t() && obc
.get()) {
2728 missing_oid
= obc
->obs
.oi
.soid
;
2731 auto m
= op
->get_req
<MOSDOp
>();
2732 const object_locator_t oloc
= m
->get_object_locator();
2734 if (op
->need_skip_handle_cache()) {
2735 return cache_result_t::NOOP
;
2738 OpRequestRef promote_op
;
2740 switch (pool
.info
.cache_mode
) {
2741 case pg_pool_t::CACHEMODE_WRITEBACK
:
2743 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2744 if (!op
->may_write() && !op
->may_cache() &&
2745 !write_ordered
&& !must_promote
) {
2746 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2748 return cache_result_t::HANDLED_PROXY
;
2750 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2751 block_write_on_full_cache(missing_oid
, op
);
2752 return cache_result_t::BLOCKED_FULL
;
2755 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2756 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2757 return cache_result_t::BLOCKED_PROMOTE
;
2760 if (op
->may_write() || op
->may_cache()) {
2764 if (!op
->need_skip_promote() &&
2765 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2766 pool
.info
.min_write_recency_for_promote
,
2769 return cache_result_t::BLOCKED_PROMOTE
;
2771 return cache_result_t::HANDLED_PROXY
;
2775 // Avoid duplicate promotion
2776 if (obc
.get() && obc
->is_blocked()) {
2779 return cache_result_t::BLOCKED_PROMOTE
;
2783 if (!op
->need_skip_promote()) {
2784 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2785 pool
.info
.min_read_recency_for_promote
,
2786 promote_op
, promote_obc
);
2789 return cache_result_t::HANDLED_PROXY
;
2791 ceph_abort_msg("unreachable");
2792 return cache_result_t::NOOP
;
2794 case pg_pool_t::CACHEMODE_READONLY
:
2795 // TODO: clean this case up
2796 if (!obc
.get() && r
== -ENOENT
) {
2797 // we don't have the object and op's a read
2798 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2799 return cache_result_t::BLOCKED_PROMOTE
;
2801 if (!r
) { // it must be a write
2802 do_cache_redirect(op
);
2803 return cache_result_t::HANDLED_REDIRECT
;
2805 // crap, there was a failure of some kind
2806 return cache_result_t::NOOP
;
2808 case pg_pool_t::CACHEMODE_FORWARD
:
2809 // this mode is deprecated; proxy instead
2810 case pg_pool_t::CACHEMODE_PROXY
:
2811 if (!must_promote
) {
2812 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2814 return cache_result_t::HANDLED_PROXY
;
2817 return cache_result_t::HANDLED_PROXY
;
2820 // ugh, we're forced to promote.
2822 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2823 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2824 block_write_on_full_cache(missing_oid
, op
);
2825 return cache_result_t::BLOCKED_FULL
;
2827 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2828 return cache_result_t::BLOCKED_PROMOTE
;
2830 case pg_pool_t::CACHEMODE_READFORWARD
:
2831 // this mode is deprecated; proxy instead
2832 case pg_pool_t::CACHEMODE_READPROXY
:
2833 // Do writeback to the cache tier for writes
2834 if (op
->may_write() || write_ordered
|| must_promote
) {
2836 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2837 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2838 block_write_on_full_cache(missing_oid
, op
);
2839 return cache_result_t::BLOCKED_FULL
;
2841 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2842 return cache_result_t::BLOCKED_PROMOTE
;
2845 // If it is a read, we can read, we need to proxy it
2847 return cache_result_t::HANDLED_PROXY
;
2850 ceph_abort_msg("unrecognized cache_mode");
2852 return cache_result_t::NOOP
;
2855 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2856 const hobject_t
& missing_oid
,
2857 const object_locator_t
& oloc
,
2860 OpRequestRef promote_op
,
2861 ObjectContextRef
*promote_obc
)
2863 dout(20) << __func__
<< " missing_oid " << missing_oid
2864 << " in_hit_set " << in_hit_set
<< dendl
;
2870 // Check if in the current hit set
2880 unsigned count
= (int)in_hit_set
;
2882 // Check if in other hit sets
2883 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
2884 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2885 agent_state
->hit_set_map
.rbegin();
2886 itor
!= agent_state
->hit_set_map
.rend();
2888 if (!itor
->second
->contains(oid
)) {
2892 if (count
>= recency
) {
2897 if (count
>= recency
) {
2900 return false; // not promoting
2905 if (osd
->promote_throttle()) {
2906 dout(10) << __func__
<< " promote throttled" << dendl
;
2909 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
2913 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2915 auto m
= op
->get_req
<MOSDOp
>();
2916 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
2917 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
, get_osdmap_epoch(),
2919 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2920 reply
->set_redirect(redir
);
2921 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2922 << *op
->get_req() << dendl
;
2923 m
->get_connection()->send_message(reply
);
2927 struct C_ProxyRead
: public Context
{
2930 epoch_t last_peering_reset
;
2932 PrimaryLogPG::ProxyReadOpRef prdop
;
2934 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2935 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2936 : pg(p
), oid(o
), last_peering_reset(lpr
),
2937 tid(0), prdop(prd
), start(ceph_clock_now())
2939 void finish(int r
) override
{
2940 if (prdop
->canceled
)
2942 std::scoped_lock locker
{*pg
};
2943 if (prdop
->canceled
) {
2946 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2947 pg
->finish_proxy_read(oid
, tid
, r
);
2948 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2953 struct C_ProxyChunkRead
: public Context
{
2956 epoch_t last_peering_reset
;
2958 PrimaryLogPG::ProxyReadOpRef prdop
;
2960 ObjectOperation
*obj_op
;
2962 uint64_t req_offset
= 0;
2963 ObjectContextRef obc
;
2964 uint64_t req_total_len
= 0;
2965 C_ProxyChunkRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2966 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2967 : pg(p
), oid(o
), last_peering_reset(lpr
),
2968 tid(0), prdop(prd
), start(ceph_clock_now()), obj_op(NULL
)
2970 void finish(int r
) override
{
2971 if (prdop
->canceled
)
2973 std::scoped_lock locker
{*pg
};
2974 if (prdop
->canceled
) {
2977 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2979 if (!prdop
->ops
[op_index
].outdata
.length()) {
2980 ceph_assert(req_total_len
);
2982 bufferptr
bptr(req_total_len
);
2983 list
.push_back(std::move(bptr
));
2984 prdop
->ops
[op_index
].outdata
.append(list
);
2986 ceph_assert(obj_op
);
2987 uint64_t copy_offset
;
2988 if (req_offset
>= prdop
->ops
[op_index
].op
.extent
.offset
) {
2989 copy_offset
= req_offset
- prdop
->ops
[op_index
].op
.extent
.offset
;
2993 prdop
->ops
[op_index
].outdata
.begin(copy_offset
).copy_in(
2994 obj_op
->ops
[0].outdata
.length(),
2995 obj_op
->ops
[0].outdata
.c_str());
2998 pg
->finish_proxy_read(oid
, tid
, r
);
2999 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
3007 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
3009 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3010 // stash the result in the request's OSDOp vector
3011 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3012 object_locator_t oloc
;
3014 /* extensible tier */
3015 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3016 switch (obc
->obs
.oi
.manifest
.type
) {
3017 case object_manifest_t::TYPE_REDIRECT
:
3018 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3019 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3022 ceph_abort_msg("unrecognized manifest type");
3026 soid
= m
->get_hobj();
3027 oloc
= object_locator_t(m
->get_object_locator());
3028 oloc
.pool
= pool
.info
.tier_of
;
3030 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3032 // pass through some original flags that make sense.
3033 // - leave out redirection and balancing flags since we are
3034 // already proxying through the primary
3035 // - leave off read/write/exec flags that are derived from the op
3036 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3037 CEPH_OSD_FLAG_ORDERSNAP
|
3038 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3039 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3041 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
3043 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
3045 ObjectOperation obj_op
;
3046 obj_op
.dup(prdop
->ops
);
3048 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
3049 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
3050 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
3051 ceph_osd_op op
= obj_op
.ops
[i
].op
;
3053 case CEPH_OSD_OP_READ
:
3054 case CEPH_OSD_OP_SYNC_READ
:
3055 case CEPH_OSD_OP_SPARSE_READ
:
3056 case CEPH_OSD_OP_CHECKSUM
:
3057 case CEPH_OSD_OP_CMPEXT
:
3058 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
3059 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
3064 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
3066 ceph_tid_t tid
= osd
->objecter
->read(
3067 soid
.oid
, oloc
, obj_op
,
3068 m
->get_snapid(), NULL
,
3069 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3070 &prdop
->user_version
,
3071 &prdop
->data_offset
,
3074 prdop
->objecter_tid
= tid
;
3075 proxyread_ops
[tid
] = prdop
;
3076 in_progress_proxy_ops
[soid
].push_back(op
);
3079 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
3081 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3082 << " " << cpp_strerror(r
) << dendl
;
3084 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
3085 if (p
== proxyread_ops
.end()) {
3086 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
3089 ProxyReadOpRef prdop
= p
->second
;
3090 if (tid
!= prdop
->objecter_tid
) {
3091 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
3092 << " tid " << prdop
->objecter_tid
<< dendl
;
3095 if (oid
!= prdop
->soid
) {
3096 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
3097 << " soid " << prdop
->soid
<< dendl
;
3100 proxyread_ops
.erase(tid
);
3102 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
3103 if (q
== in_progress_proxy_ops
.end()) {
3104 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3107 ceph_assert(q
->second
.size());
3108 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
3111 ceph_assert(it
!= q
->second
.end());
3112 OpRequestRef op
= *it
;
3113 q
->second
.erase(it
);
3114 if (q
->second
.size() == 0) {
3115 in_progress_proxy_ops
.erase(oid
);
3116 } else if (std::find(q
->second
.begin(),
3118 prdop
->op
) != q
->second
.end()) {
3119 /* multiple read case */
3120 dout(20) << __func__
<< " " << oid
<< " is not completed " << dendl
;
3124 osd
->logger
->inc(l_osd_tier_proxy_read
);
3126 auto m
= op
->get_req
<MOSDOp
>();
3127 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
3128 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3129 ctx
->user_at_version
= prdop
->user_version
;
3130 ctx
->data_off
= prdop
->data_offset
;
3131 ctx
->ignore_log_op_stats
= true;
3132 complete_read_ctx(r
, ctx
);
3135 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
3137 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
3138 if (p
== in_progress_proxy_ops
.end())
3141 list
<OpRequestRef
>& ls
= p
->second
;
3142 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
3144 in_progress_proxy_ops
.erase(p
);
3147 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
3148 vector
<ceph_tid_t
> *tids
)
3150 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
3151 prdop
->canceled
= true;
3153 // cancel objecter op, if we can
3154 if (prdop
->objecter_tid
) {
3155 tids
->push_back(prdop
->objecter_tid
);
3156 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
3157 prdop
->ops
[i
].outdata
.clear();
3159 proxyread_ops
.erase(prdop
->objecter_tid
);
3160 prdop
->objecter_tid
= 0;
3164 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3166 dout(10) << __func__
<< dendl
;
3168 // cancel proxy reads
3169 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
3170 while (p
!= proxyread_ops
.end()) {
3171 cancel_proxy_read((p
++)->second
, tids
);
3174 // cancel proxy writes
3175 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
3176 while (q
!= proxywrite_ops
.end()) {
3177 cancel_proxy_write((q
++)->second
, tids
);
3181 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
3182 in_progress_proxy_ops
.begin();
3183 while (p
!= in_progress_proxy_ops
.end()) {
3184 list
<OpRequestRef
>& ls
= p
->second
;
3185 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
3186 << " requests" << dendl
;
3188 in_progress_proxy_ops
.erase(p
++);
3191 in_progress_proxy_ops
.clear();
3195 struct C_ProxyWrite_Commit
: public Context
{
3198 epoch_t last_peering_reset
;
3200 PrimaryLogPG::ProxyWriteOpRef pwop
;
3201 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3202 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
3203 : pg(p
), oid(o
), last_peering_reset(lpr
),
3206 void finish(int r
) override
{
3209 std::scoped_lock locker
{*pg
};
3210 if (pwop
->canceled
) {
3213 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3214 pg
->finish_proxy_write(oid
, tid
, r
);
3219 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, ObjectContextRef obc
)
3221 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3222 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3223 object_locator_t oloc
;
3224 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
3226 /* extensible tier */
3227 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3228 switch (obc
->obs
.oi
.manifest
.type
) {
3229 case object_manifest_t::TYPE_REDIRECT
:
3230 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3231 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3234 ceph_abort_msg("unrecognized manifest type");
3238 soid
= m
->get_hobj();
3239 oloc
= object_locator_t(m
->get_object_locator());
3240 oloc
.pool
= pool
.info
.tier_of
;
3243 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3244 if (!(op
->may_write() || op
->may_cache())) {
3245 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3247 if (op
->allows_returnvec()) {
3248 flags
|= CEPH_OSD_FLAG_RETURNVEC
;
3251 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3253 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3254 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3255 pwop
->mtime
= m
->get_mtime();
3257 ObjectOperation obj_op
;
3258 obj_op
.dup(pwop
->ops
);
3260 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3261 this, soid
, get_last_peering_reset(), pwop
);
3262 ceph_tid_t tid
= osd
->objecter
->mutate(
3263 soid
.oid
, oloc
, obj_op
, snapc
,
3264 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3265 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3266 &pwop
->user_version
, pwop
->reqid
);
3268 pwop
->objecter_tid
= tid
;
3269 proxywrite_ops
[tid
] = pwop
;
3270 in_progress_proxy_ops
[soid
].push_back(op
);
3273 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op
, const hobject_t
& missing_oid
,
3274 ObjectContextRef obc
, bool write_ordered
)
3276 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3277 OSDOp
*osd_op
= NULL
;
3278 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3279 osd_op
= &m
->ops
[i
];
3280 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3281 uint64_t op_length
= osd_op
->op
.extent
.offset
+ osd_op
->op
.extent
.length
;
3282 uint64_t chunk_length
= 0, chunk_index
= 0, req_len
= 0;
3283 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3284 map
<uint64_t, map
<uint64_t, uint64_t>> chunk_read
;
3286 while (cursor
< op_length
) {
3289 /* find the right chunk position for cursor */
3290 for (auto &p
: manifest
->chunk_map
) {
3291 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3292 chunk_length
= p
.second
.length
;
3293 chunk_index
= p
.first
;
3298 if (!chunk_index
&& !chunk_length
) {
3299 if (cursor
== osd_op
->op
.extent
.offset
) {
3300 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, this);
3301 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3302 ctx
->data_off
= osd_op
->op
.extent
.offset
;
3303 ctx
->ignore_log_op_stats
= true;
3304 complete_read_ctx(0, ctx
);
3308 uint64_t next_length
= chunk_length
;
3309 /* the size to read -> | op length | */
3311 if (cursor
+ next_length
> op_length
) {
3312 next_length
= op_length
- cursor
;
3314 /* the size to read -> | op length | */
3316 if (cursor
+ next_length
> chunk_index
+ chunk_length
) {
3317 next_length
= chunk_index
+ chunk_length
- cursor
;
3320 chunk_read
[cursor
] = {{chunk_index
, next_length
}};
3321 cursor
+= next_length
;
3324 req_len
= cursor
- osd_op
->op
.extent
.offset
;
3325 for (auto &p
: chunk_read
) {
3326 auto chunks
= p
.second
.begin();
3327 dout(20) << __func__
<< " chunk_index: " << chunks
->first
3328 << " next_length: " << chunks
->second
<< " cursor: "
3329 << p
.first
<< dendl
;
3330 do_proxy_chunked_read(op
, obc
, i
, chunks
->first
, p
.first
, chunks
->second
, req_len
, write_ordered
);
3335 struct RefCountCallback
: public Context
{
3337 PrimaryLogPG::OpContext
*ctx
;
3339 bool requeue
= false;
3341 RefCountCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
3342 : ctx(ctx
), osd_op(osd_op
) {}
3343 void finish(int r
) override
{
3344 // NB: caller must already have pg->lock held
3345 ctx
->obc
->stop_block();
3346 ctx
->pg
->kick_object_context_blocked(ctx
->obc
);
3349 ctx
->pg
->execute_ctx(ctx
);
3351 // on cancel simply toss op out,
3352 // or requeue as requested
3353 if (r
!= -ECANCELED
) {
3355 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
3356 } else if (requeue
) {
3358 ctx
->pg
->requeue_op(ctx
->op
);
3360 ctx
->pg
->close_op_ctx(ctx
);
3363 void set_requeue(bool rq
) {
3368 struct SetManifestFinisher
: public PrimaryLogPG::OpFinisher
{
3371 explicit SetManifestFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
3374 int execute() override
{
3379 struct C_SetManifestRefCountDone
: public Context
{
3384 C_SetManifestRefCountDone(PrimaryLogPG
*p
,
3385 hobject_t soid
, uint64_t offset
) :
3386 pg(p
), soid(soid
), offset(offset
) {}
3387 void finish(int r
) override
{
3388 if (r
== -ECANCELED
)
3390 std::scoped_lock locker
{*pg
};
3391 pg
->finish_set_manifest_refcount(soid
, r
, tid
, offset
);
3395 struct C_SetDedupChunks
: public Context
{
3398 epoch_t last_peering_reset
;
3402 C_SetDedupChunks(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
, uint64_t offset
)
3403 : pg(p
), oid(o
), last_peering_reset(lpr
),
3404 tid(0), offset(offset
)
3406 void finish(int r
) override
{
3407 if (r
== -ECANCELED
)
3409 std::scoped_lock locker
{*pg
};
3410 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
3413 pg
->finish_set_dedup(oid
, r
, tid
, offset
);
3417 void PrimaryLogPG::cancel_manifest_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3419 dout(10) << __func__
<< dendl
;
3420 auto p
= manifest_ops
.begin();
3421 while (p
!= manifest_ops
.end()) {
3422 auto mop
= p
->second
;
3423 // cancel objecter op, if we can
3424 if (mop
->objecter_tid
) {
3425 tids
->push_back(mop
->objecter_tid
);
3426 mop
->objecter_tid
= 0;
3427 } else if (!mop
->tids
.empty()) {
3428 for (auto &p
: mop
->tids
) {
3429 tids
->push_back(p
.second
);
3433 mop
->cb
->set_requeue(requeue
);
3434 mop
->cb
->complete(-ECANCELED
);
3436 manifest_ops
.erase(p
++);
3440 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc
, std::string
& fp_oid
, OpRequestRef op
)
3444 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3445 if (p
.second
.oid
.oid
.name
== fp_oid
) {
3450 SnapSet
& ss
= obc
->ssc
->snapset
;
3451 const OSDMapRef
& osdmap
= get_osdmap();
3452 for (vector
<snapid_t
>::const_reverse_iterator p
= ss
.clones
.rbegin();
3453 p
!= ss
.clones
.rend();
3455 object_ref_delta_t refs
;
3456 ObjectContextRef obc_l
= nullptr;
3457 ObjectContextRef obc_g
= nullptr;
3458 hobject_t clone_oid
= obc
->obs
.oi
.soid
;
3459 clone_oid
.snap
= *p
;
3460 if (osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
3463 if (is_unreadable_object(clone_oid
)) {
3464 dout(10) << __func__
<< ": " << clone_oid
3465 << " is unreadable. Need to wait for recovery" << dendl
;
3466 wait_for_unreadable_object(clone_oid
, op
);
3469 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
3473 if (recover_adjacent_clones(clone_obc
, op
)) {
3476 get_adjacent_clones(clone_obc
, obc_l
, obc_g
);
3477 clone_obc
->obs
.oi
.manifest
.calc_refs_to_inc_on_set(
3478 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr ,
3481 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3482 if (p
->first
.oid
.name
== fp_oid
&& p
->second
> 0) {
3491 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc
, OpRequestRef op
)
3493 if (!obc
->ssc
|| !obc
->ssc
->snapset
.clones
.size()) {
3496 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3497 bool has_manifest_op
= std::any_of(
3500 [](const auto& osd_op
) {
3501 return osd_op
.op
.op
== CEPH_OSD_OP_SET_CHUNK
;
3503 if (!obc
->obs
.oi
.manifest
.is_chunked() && !has_manifest_op
) {
3508 const SnapSet
& snapset
= obc
->ssc
->snapset
;
3509 auto s
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), obc
->obs
.oi
.soid
.snap
);
3510 auto is_unreadable_snap
= [this, obc
, &snapset
, op
](auto iter
) -> bool {
3511 hobject_t cid
= obc
->obs
.oi
.soid
;
3512 cid
.snap
= (iter
== snapset
.clones
.end()) ? snapid_t(CEPH_NOSNAP
) : *iter
;
3513 if (is_unreadable_object(cid
)) {
3514 dout(10) << __func__
<< ": clone " << cid
3515 << " is unreadable, waiting" << dendl
;
3516 wait_for_unreadable_object(cid
, op
);
3521 if (s
!= snapset
.clones
.begin()) {
3522 if (is_unreadable_snap(s
- 1)) {
3526 if (s
!= snapset
.clones
.end()) {
3527 if (is_unreadable_snap(s
+ 1)) {
3534 ObjectContextRef
PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc
)
3536 auto s
= std::find(obc
->ssc
->snapset
.clones
.begin(), obc
->ssc
->snapset
.clones
.end(),
3537 obc
->obs
.oi
.soid
.snap
);
3538 if (s
!= obc
->ssc
->snapset
.clones
.begin()) {
3539 auto s_iter
= s
- 1;
3540 hobject_t cid
= obc
->obs
.oi
.soid
;
3541 object_ref_delta_t refs
;
3543 ObjectContextRef cobc
= get_object_context(cid
, false, NULL
);
3550 void PrimaryLogPG::dec_refcount(const hobject_t
& soid
, const object_ref_delta_t
& refs
)
3552 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3553 int dec_ref_count
= p
->second
;
3554 ceph_assert(dec_ref_count
< 0);
3555 while (dec_ref_count
< 0) {
3556 dout(10) << __func__
<< ": decrement reference on offset oid: " << p
->first
<< dendl
;
3557 refcount_manifest(soid
, p
->first
,
3558 refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
// Populate `_l` and `_g` with the object contexts of the clones adjacent to
// `src_obc` in snapset order: `_l` = next lower (older) clone, `_g` = next
// greater (newer) clone or head. Either output is left untouched when no
// such neighbor exists.
// NOTE(review): lambda return and closing braces are missing from this
// excerpt (gaps after original lines 3575 and 3592).
void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
ObjectContextRef& _l, ObjectContextRef& _g)
const SnapSet& snapset = src_obc->ssc->snapset;
const object_info_t& oi = src_obc->obs.oi;
// helper: map a clone-list iterator to that clone's ObjectContext;
// clones.end() is treated as head (CEPH_NOSNAP)
auto get_context = [this, &oi, &snapset](auto iter)
-> ObjectContextRef {
hobject_t cid = oi.soid;
cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
ObjectContextRef obc = get_object_context(cid, false, NULL);
// check adjacent clones
auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
// We *must* find the clone iff it's not head,
// let s == snapset.clones.end() mean head
ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
if (s != snapset.clones.begin()) {
// an older clone exists
_l = get_context(s - 1);
if (s != snapset.clones.end()) {
// a newer clone (or head) exists
_g = get_context(s + 1);
// For a set-chunk operation, compute the reference deltas implied by the new
// chunk map (relative to the adjacent clones' manifests) and issue the
// increments to the target (CAS) objects before object_info is updated.
// Negative deltas are deferred to commit time. Returns true when at least
// one increment was issued (callers must then wait for the RefCountCallback).
// NOTE(review): this excerpt is missing the tail of the parameter list
// (original lines 3597-3598), several closing braces, and `continue`/lambda
// bodies (gaps in original numbering); tokens preserved as-is.
bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
object_ref_delta_t refs;
ObjectContextRef obc_l, obc_g;
// deltas are computed against the neighboring clones' manifests so shared
// chunks are not double-counted
get_adjacent_clones(ctx->obc, obc_l, obc_g);
set_chunk.calc_refs_to_inc_on_set(
obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
bool need_inc_ref = false;
if (!refs.is_empty()) {
ManifestOpRef mop(std::make_shared<ManifestOp>(ctx->obc, nullptr));
for (auto c : set_chunk.chunk_map) {
auto p = refs.find(c.second.oid);
if (p == refs.end()) {
int inc_ref_count = p->second;
if (inc_ref_count > 0) {
/*
* In set-chunk case, the first thing we should do is to increment
* the reference the targe object has prior to update object_manifest in object_info_t.
* So, call directly refcount_manifest.
*/
auto target_oid = p->first;
auto offset = c.first;
auto length = c.second.length;
auto* fin = new C_SetManifestRefCountDone(this, ctx->obs->oi.soid, offset);
ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, target_oid,
refcount_t::INCREMENT_REF, fin, std::nullopt);
// remember the chunk and the outstanding objecter tid for this offset
mop->chunks[target_oid] = make_pair(offset, length);
mop->tids[offset] = tid;
if (!ctx->obc->is_blocked()) {
dout(15) << fmt::format("{}: blocking object on rc: tid:{}", __func__, tid) << dendl;
// block further ops on the object until the refcount round-trips
ctx->obc->start_block();
need_inc_ref = true;
} else if (inc_ref_count < 0) {
// decrement is safe to defer until the local update commits
hobject_t src = ctx->obs->oi.soid;
hobject_t tgt = p->first;
ctx->register_on_commit(
refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
if (mop->tids.size()) {
// at least one increment is in flight: arrange for the op to resume
// (RefCountCallback) once all acks arrive
mop->cb = new RefCountCallback(ctx, osd_op);
manifest_ops[ctx->obs->oi.soid] = mop;
manifest_ops[ctx->obs->oi.soid]->op = ctx->op;
return need_inc_ref;
// Drop chunk-map entries whose extent has been dirtied by this op; if that
// empties the chunk map, demote the object from manifest to a plain object
// and adjust the manifest-object stats.
// NOTE(review): closing braces are missing from this excerpt (gap after
// original line 3672); tokens preserved as-is.
void PrimaryLogPG::update_chunk_map_by_dirty(OpContext* ctx) {
/*
* We should consider two cases here:
* 1) just modification: This created dirty regions, but didn't update chunk_map.
* 2) rollback: In rollback, head will be converted to the clone the rollback targets.
* Also, rollback already updated chunk_map.
* So, we should do here is to check whether chunk_map is updated and the clean_region has dirty regions.
* In case of the rollback, chunk_map doesn't need to be clear
*/
// iterate the OLD chunk map; erase from the NEW one, so iteration is safe
for (auto& p : ctx->obs->oi.manifest.chunk_map) {
if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
// no chunks left: object is no longer a manifest object
ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
ctx->delta_stats.num_objects_manifest--;
// When a manifest object is modified, compute which chunk references become
// droppable (comparing against the previous clone's manifest so chunks still
// referenced by that clone are kept) and schedule the decrements at commit.
// NOTE(review): the trailing argument of calc_refs_to_drop_on_modify and
// closing braces are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
object_ref_delta_t refs;
ObjectContextRef cobc = nullptr;
ObjectContextRef obc = ctx->obc;
// Look over previous snapshot, then figure out whether updated chunk needs to be deleted
cobc = get_prev_clone_obc(obc);
obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
cobc ? &cobc->obs.oi.manifest : nullptr,
if (!refs.is_empty()) {
hobject_t soid = obc->obs.oi.soid;
// decrements only become valid once the local update is durable
ctx->register_on_commit(
[soid, this, refs](){
dec_refcount(soid, refs);
// On object removal, drop every reference its manifest holds:
// - chunked manifests: compute drops against the adjacent clones (shared
//   chunks survive) and schedule dec_refcount at commit;
// - redirect manifests that hold a reference: decrement the redirect target.
// NOTE(review): several argument tails, lambda bodies and closing braces are
// missing from this excerpt (gaps in original numbering); tokens as-is.
void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
ceph_assert(oi.has_manifest());
ceph_assert(ctx->obc->ssc);
if (oi.manifest.is_chunked()) {
object_ref_delta_t refs;
ObjectContextRef obc_l, obc_g, obc;
/* in trim_object, oi and ctx can have different oid */
obc = get_object_context(oi.soid, false, NULL);
get_adjacent_clones(obc, obc_l, obc_g);
oi.manifest.calc_refs_to_drop_on_removal(
obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
if (!refs.is_empty()) {
/* dec_refcount will use head object anyway */
hobject_t soid = ctx->obc->obs.oi.soid;
// defer the actual decrements until the removal commits
ctx->register_on_commit(
[soid, this, refs](){
dec_refcount(soid, refs);
} else if (oi.manifest.is_redirect() &&
oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
ctx->register_on_commit(
refcount_manifest(oi.soid, oi.manifest.redirect_target,
refcount_t::DECREMENT_REF, NULL, std::nullopt);
// Send a cls_cas refcount mutation (get/put/create-or-get) for chunk
// `tgt_soid` on behalf of `src_soid` via the objecter, bypassing cache
// tiering. `cb` (optional) fires on completion via the objecter finisher;
// `chunk` supplies the data payload for CREATE_OR_GET_REF. Returns the
// objecter tid of the mutation.
// NOTE(review): encode() calls for the first two ops, the else-branches'
// braces, the mutate() tail and the return are missing from this excerpt.
ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
Context* cb, std::optional<bufferlist> chunk)
// operate directly on the base pool and order against writes
unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
CEPH_OSD_FLAG_RWORDERED;
dout(10) << __func__ << " Start refcount from " << src_soid
<< " to " << tgt_soid << dendl;
ObjectOperation obj_op;
if (type == refcount_t::INCREMENT_REF) {
cls_cas_chunk_get_ref_op call;
// the reference is recorded against the head object
call.source = src_soid.get_head();
obj_op.call("cas", "chunk_get_ref", in);
} else if (type == refcount_t::DECREMENT_REF) {
cls_cas_chunk_put_ref_op call;
call.source = src_soid.get_head();
obj_op.call("cas", "chunk_put_ref", in);
} else if (type == refcount_t::CREATE_OR_GET_REF) {
cls_cas_chunk_create_or_get_ref_op get_call;
get_call.source = src_soid.get_head();
// chunk payload is consumed; only valid for CREATE_OR_GET_REF
get_call.data = std::move(*chunk);
::encode(get_call, in);
obj_op.call("cas", "chunk_create_or_get_ref", in);
ceph_abort_msg("unrecognized type");
Context* c = nullptr;
// run the completion on the objecter finisher for this shard
c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
object_locator_t oloc(tgt_soid);
ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
ceph_assert(src_obc);
auto tid = osd->objecter->mutate(
tgt_soid.oid, oloc, obj_op, SnapContext(),
ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
// Proxy one chunk of a client READ of a chunked-manifest object to the chunk's
// backing (CAS) object: translates the requested extent into the chunk
// object's coordinates, issues an objecter read, and tracks it in
// proxyread_ops / in_progress_proxy_ops keyed by the original object.
// NOTE(review): early-return bodies, the C_ProxyChunkRead constructor tail,
// the read() call tail and closing braces are missing from this excerpt.
void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
uint64_t req_total_len, bool write_ordered)
MOSDOp* m = static_cast<MOSDOp*>(op->get_nonconst_req());
object_manifest_t* manifest = &obc->obs.oi.manifest;
// the requested chunk must be present in the manifest
if (!manifest->chunk_map.count(chunk_index)) {
uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
hobject_t soid = manifest->chunk_map[chunk_index].oid;
hobject_t ori_soid = m->get_hobj();
object_locator_t oloc(soid);
unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
if (write_ordered) {
flags |= CEPH_OSD_FLAG_RWORDERED;
// a zero-length or unnamed chunk cannot be proxied
if (!chunk_length || soid == hobject_t()) {
/* same as do_proxy_read() */
flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ENFORCE_SNAPC |
CEPH_OSD_FLAG_MAP_SNAP_CLONE);
dout(10) << __func__ << " Start do chunk proxy read for " << *m
<< " index: " << op_index << " oid: " << soid.oid.name
<< " req_offset: " << req_offset
<< " req_length: " << req_length << dendl;
ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
ObjectOperation* pobj_op = new ObjectOperation;
OSDOp& osd_op = pobj_op->add_op(m->ops[op_index].op.op);
if (chunk_index <= req_offset) {
// rebase the request offset into the chunk object's address space
osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset +
req_offset - chunk_index;
ceph_abort_msg("chunk_index > req_offset");
osd_op.op.extent.length = req_length;
ObjectOperation obj_op;
obj_op.dup(pobj_op->ops);
C_ProxyChunkRead* fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
// stash translation state so the completion can splice the data back
// into the original op's reply
fin->obj_op = pobj_op;
fin->op_index = op_index;
fin->req_offset = req_offset;
fin->req_total_len = req_total_len;
ceph_tid_t tid = osd->objecter->read(
soid.oid, oloc, obj_op,
m->get_snapid(), NULL,
flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
&prdop->user_version,
&prdop->data_offset,
prdop->objecter_tid = tid;
// track by tid and by the ORIGINAL object id, not the chunk's
proxyread_ops[tid] = prdop;
in_progress_proxy_ops[ori_soid].push_back(op);
// Decide whether a client read can be served by proxying to chunk objects:
// every READ/SYNC_READ extent in the message must be fully covered by
// non-missing entries of the object's chunk_map. Returns false when any
// requested range falls outside the chunk map.
// NOTE(review): the switch statement head, several break/return lines and
// closing braces are missing from this excerpt; tokens preserved as-is.
bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
MOSDOp* m = static_cast<MOSDOp*>(op->get_nonconst_req());
OSDOp* osd_op = NULL;
for (unsigned int i = 0; i < m->ops.size(); i++) {
osd_op = &m->ops[i];
ceph_osd_op op = osd_op->op;
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_SYNC_READ: {
// walk the requested extent chunk by chunk
uint64_t cursor = osd_op->op.extent.offset;
uint64_t remain = osd_op->op.extent.length;
/* requested chunks exist in chunk_map ? */
for (auto& p : obc->obs.oi.manifest.chunk_map) {
// does this chunk cover the current cursor position?
if (p.first <= cursor && p.first + p.second.length > cursor) {
if (!p.second.is_missing()) {
if (p.second.length >= remain) {
remain = remain - p.second.length;
cursor += p.second.length;
dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
// Completion path for a proxied write: validate and unlink the tracking
// entries (proxywrite_ops by tid, in_progress_proxy_ops by oid), bump the
// tier proxy-write counter, and send the client its commit reply if it has
// not been sent already.
// NOTE(review): early `return`s after the "not found" logs, the std::find
// third argument, and closing braces are missing from this excerpt.
void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
dout(10) << __func__ << " " << oid << " tid " << tid
<< " " << cpp_strerror(r) << dendl;
map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
if (p == proxywrite_ops.end()) {
// racing cancel may have already removed the entry
dout(10) << __func__ << " no proxywrite_op found" << dendl;
ProxyWriteOpRef pwop = p->second;
ceph_assert(tid == pwop->objecter_tid);
ceph_assert(oid == pwop->soid);
proxywrite_ops.erase(tid);
map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
if (q == in_progress_proxy_ops.end()) {
dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
list<OpRequestRef>& in_progress_op = q->second;
ceph_assert(in_progress_op.size());
list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
in_progress_op.end(),
ceph_assert(it != in_progress_op.end());
in_progress_op.erase(it);
if (in_progress_op.size() == 0) {
// last in-flight proxy op for this object
in_progress_proxy_ops.erase(oid);
} else if (std::find(in_progress_op.begin(),
in_progress_op.end(),
pwop->op) != in_progress_op.end()) {
dout(20) << __func__ << " " << oid << " tid " << tid
<< " in_progress_op size: "
<< in_progress_op.size() << dendl;
osd->logger->inc(l_osd_tier_proxy_write);
auto m = pwop->op->get_req<MOSDOp>();
ceph_assert(m != NULL);
if (!pwop->sent_reply) {
// the proxied write owns the reply; the local ctx must not also have one
assert(pwop->ctx->reply == nullptr);
MOSDOpReply* reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
true /* we claim it below */);
reply->set_reply_versions(eversion_t(), pwop->user_version);
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
reply->claim_op_out_data(pwop->ops);
dout(10) << " sending commit on " << pwop << " " << reply << dendl;
osd->send_message_osd_client(reply, m->get_connection());
pwop->sent_reply = true;
pwop->ctx->op->mark_commit_sent();
// Cancel a proxied write: mark it canceled, hand its objecter tid to the
// caller (for batched objecter cancellation) and drop it from the tracking
// map. NOTE(review): closing braces and any requeue logic after original
// line 3970 are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
vector<ceph_tid_t>* tids)
dout(10) << __func__ << " " << pwop->soid << dendl;
pwop->canceled = true;
// cancel objecter op, if we can
if (pwop->objecter_tid) {
// caller is responsible for actually canceling the collected tids
tids->push_back(pwop->objecter_tid);
proxywrite_ops.erase(pwop->objecter_tid);
pwop->objecter_tid = 0;
// Copy-completion callback for tier/manifest promotion: dispatches to
// finish_promote_manifest() for chunked-manifest objects and to
// finish_promote() otherwise, then records promotion latency.
// NOTE(review): member list (pg, start), constructor init-list head,
// access specifiers and closing braces are missing from this excerpt.
class PromoteCallback : public PrimaryLogPG::CopyCallback {
ObjectContextRef obc;
PromoteCallback(ObjectContextRef obc_, PrimaryLogPG* pg_)
start(ceph_clock_now()) {}
void finish(PrimaryLogPG::CopyCallbackResults results) override {
PrimaryLogPG::CopyResults* results_data = results.get<1>();
int r = results.get<0>();
// chunked manifests have their own promotion finisher
if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
pg->finish_promote_manifest(r, results_data, obc);
pg->finish_promote(r, results_data, obc);
pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
// Copy-completion callback used when promoting via a manifest-aware op
// context: on success of a redirect promotion it re-executes the original op
// context; on failure it replies/requeues/closes depending on the error.
// Stores the copy results so PromoteFinisher can replay them.
// NOTE(review): members (pg, start), constructor init-list head, several
// branch braces and the success/failure structure lines are missing from
// this excerpt; tokens preserved as-is.
class PromoteManifestCallback : public PrimaryLogPG::CopyCallback {
ObjectContextRef obc;
PrimaryLogPG::OpContext* ctx;
PrimaryLogPG::CopyCallbackResults promote_results;
PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG* pg_, PrimaryLogPG::OpContext* ctx)
start(ceph_clock_now()), ctx(ctx) {}
void finish(PrimaryLogPG::CopyCallbackResults results) override {
PrimaryLogPG::CopyResults* results_data = results.get<1>();
int r = results.get<0>();
// keep a copy for PromoteFinisher::execute()
promote_results = results;
if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
ctx->user_at_version = results_data->user_version;
// re-run the original client op now that the object is local
ctx->pg->execute_ctx(ctx);
if (r != -ECANCELED) {
ctx->pg->osd->reply_op_error(ctx->op, r);
} else if (results_data->should_requeue) {
ctx->pg->requeue_op(ctx->op);
ctx->pg->close_op_ctx(ctx);
pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
friend struct PromoteFinisher;
// OpFinisher that replays a stored PromoteManifestCallback result: routes to
// finish_promote() for redirect manifests, finish_promote_manifest() for
// chunked manifests, and aborts on any other manifest type.
// NOTE(review): the constructor body, the `return` of execute() and closing
// braces are missing from this excerpt; tokens preserved as-is.
struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
PromoteManifestCallback* promote_callback;
explicit PromoteFinisher(PromoteManifestCallback* promote_callback)
: promote_callback(promote_callback) {
int execute() override {
if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
promote_callback->promote_results.get<1>(),
promote_callback->obc);
} else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
promote_callback->promote_results.get<1>(),
promote_callback->obc);
// any other manifest type here is a programming error
ceph_abort_msg("unrecognized manifest type");
// Promote an object from the base tier (or a manifest target) into this
// cache/primary pool: pick the copy source from the manifest type, start the
// copy with a PromoteCallback, block the object until the copy completes,
// and count the promotion in the PG stats. If the object is blocked by scrub
// or the OSD is laggy, the op is queued/requeued instead.
// NOTE(review): the `op` parameter, several `return`s, if/else braces and
// declarations of src_hoid/cb are missing from this excerpt; tokens as-is.
void PrimaryLogPG::promote_object(ObjectContextRef obc,
const hobject_t& missing_oid,
const object_locator_t& oloc,
ObjectContextRef* promote_obc)
// either a live context or the oid of the missing object must be given
hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
ceph_assert(hoid != hobject_t());
if (m_scrubber->write_blocked_by_scrub(hoid)) {
dout(10) << __func__ << " " << hoid
<< " blocked by scrub" << dendl;
// park the op until scrub releases the object
waiting_for_scrub.push_back(op);
op->mark_delayed("waiting for scrub");
dout(10) << __func__ << " " << hoid
<< " placing op in waiting_for_scrub" << dendl;
dout(10) << __func__ << " " << hoid
<< " no op, dropping on the floor" << dendl;
if (op && !check_laggy_requeue(op)) {
if (!obc) { // we need to create an ObjectContext
ceph_assert(missing_oid != hobject_t());
obc = get_object_context(missing_oid, true);
/*
* Before promote complete, if there are proxy-reads for the object,
* for this case we don't use DONTNEED.
*/
unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
if (q == in_progress_proxy_ops.end()) {
src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
object_locator_t my_oloc;
if (!obc->obs.oi.has_manifest()) {
// classic cache tiering: copy from the base tier
my_oloc.pool = pool.info.tier_of;
src_hoid = obc->obs.oi.soid;
cb = new PromoteCallback(obc, this);
if (obc->obs.oi.manifest.is_chunked()) {
src_hoid = obc->obs.oi.soid;
cb = new PromoteCallback(obc, this);
} else if (obc->obs.oi.manifest.is_redirect()) {
// redirect: copy from the redirect target object
object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
src_hoid = obc->obs.oi.manifest.redirect_target;
cb = new PromoteCallback(obc, this);
ceph_abort_msg("unrecognized manifest type");
unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
obc->obs.oi.soid.snap == CEPH_NOSNAP,
src_fadvise_flags, 0);
// start_copy must have blocked the object; the op waits on it
ceph_assert(obc->is_blocked());
wait_for_blocked_object(obc->obs.oi.soid, op);
recovery_state.update_stats(
[](auto& history, auto& stats) {
stats.stats.sum.num_promote++;
// Core client-op execution: (re)initializes the op context, resolves the
// snap context for writes, prepares the transaction, builds the client
// reply, and either completes immediately (read / error / log-only) or
// issues replica writes via a RepGather. Designed to be idempotent — it may
// run multiple times for the same ctx (e.g. after a blocked promote).
// NOTE(review): many structural lines are missing from this excerpt (braces,
// `return`s, lambda bodies of the register_on_* callbacks, MOSDOpReply
// constructor tails); tokens below are preserved exactly as extracted.
void PrimaryLogPG::execute_ctx(OpContext* ctx)
dout(10) << __func__ << " " << ctx << dendl;
ctx->reset_obs(ctx->obc);
ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
OpRequestRef op = ctx->op;
auto m = op->get_req<MOSDOp>();
ObjectContextRef obc = ctx->obc;
const hobject_t& soid = obc->obs.oi.soid;
// this method must be idempotent since we may call it several times
// before we finally apply the resulting transaction.
ctx->op_t.reset(new PGTransaction);
if (op->may_write() || op->may_cache()) {
// snap context: pool snaps unless the client enforces its own snapc
if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
pool.info.is_pool_snaps_mode()) {
ctx->snapc = pool.snapc;
// client specified snapc
ctx->snapc.seq = m->get_snap_seq();
ctx->snapc.snaps = m->get_snaps();
filter_snapc(ctx->snapc.snaps);
// ORDERSNAP: reject writes whose snapc is older than the object's snapset
if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
ctx->snapc.seq < obc->ssc->snapset.seq) {
dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
<< " < snapset seq " << obc->ssc->snapset.seq
<< " on " << obc->obs.oi.soid << dendl;
reply_ctx(ctx, -EOLDSNAPC);
ctx->at_version = get_next_version();
ctx->mtime = m->get_mtime();
dout(10) << __func__ << " " << soid << " " << *ctx->ops
<< " ov " << obc->obs.oi.version << " av " << ctx->at_version
<< " snapc " << ctx->snapc
<< " snapset " << obc->ssc->snapset
dout(10) << __func__ << " " << soid << " " << *ctx->ops
<< " ov " << obc->obs.oi.version
if (!ctx->user_at_version)
ctx->user_at_version = obc->obs.oi.user_version;
dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
osd_reqid_t reqid = ctx->op->get_reqid();
tracepoint(osd, prepare_tx_enter, reqid.name._type,
reqid.name._num, reqid.tid, reqid.inc);
// run the per-op handlers and fill the transaction
int result = prepare_transaction(ctx);
osd_reqid_t reqid = ctx->op->get_reqid();
tracepoint(osd, prepare_tx_exit, reqid.name._type,
reqid.name._num, reqid.tid, reqid.inc);
bool pending_async_reads = !ctx->pending_async_reads.empty();
if (result == -EINPROGRESS || pending_async_reads) {
// suspended; will resume later (EC async reads go through this path)
if (pending_async_reads) {
ceph_assert(pool.info.is_erasure());
in_progress_async_reads.push_back(make_pair(op, ctx));
ctx->start_async_reads(this);
if (result == -EAGAIN) {
// clean up after the ctx
bool ignore_out_data = false;
if (!ctx->op_t->empty() &&
// successful update
if (ctx->op->allows_returnvec()) {
// enforce reasonable bound on the return buffer sizes
for (auto& i : *ctx->ops) {
if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
result = -EOVERFLOW; // overall result is overflow
i.rval = -EOVERFLOW;
// legacy behavior -- zero result and return data etc.
ignore_out_data = true;
// prepare the reply
ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
dout(20) << __func__ << " alloc reply " << ctx->reply
<< " result " << result << dendl;
// read-only or failed ops complete here without touching the log
if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
// finish side-effects
do_osd_op_effects(ctx, m->get_connection());
complete_read_ctx(result, ctx);
ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
ceph_assert(op->may_write() || op->may_cache());
recovery_state.update_trim_to();
// verify that we are doing this in order?
if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
!pool.info.is_tier() && !pool.info.has_tiers()) {
map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
ceph_tid_t t = m->get_tid();
client_t n = m->get_source().num();
map<client_t,ceph_tid_t>::iterator p = cm.find(n);
if (p == cm.end()) {
dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
dout(20) << " op order client." << n << " tid " << t
<< " last was " << p->second << dendl;
if (p->second > t) {
derr << "bad op order, already applied " << p->second
<< " > this " << t << dendl;
ceph_abort_msg("out of order op");
if (ctx->update_log_only) {
// error path that still records a pg-log entry for dup detection
do_osd_op_effects(ctx, m->get_connection());
dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
// save just what we need from ctx
MOSDOpReply* reply = ctx->reply;
ctx->reply = nullptr;
reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
if (result == -ENOENT) {
reply->set_enoent_reply_versions(info.last_update,
info.last_user_version);
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
// append to pg log for dup detection - don't save buffers for now
record_write_error(op, soid, reply, result,
ctx->op->allows_returnvec() ? ctx : nullptr);
// no need to capture PG ref, repop cancel will handle that
// Can capture the ctx by pointer, it's owned by the repop
ctx->register_on_commit(
log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
if (m && !ctx->sent_reply) {
MOSDOpReply* reply = ctx->reply;
ctx->reply = nullptr;
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
dout(10) << " sending reply on " << *m << " " << reply << dendl;
osd->send_message_osd_client(reply, m->get_connection());
ctx->sent_reply = true;
ctx->op->mark_commit_sent();
ctx->register_on_success(
ctx->op ? ctx->op->get_req()->get_connection() :
ctx->register_on_finish(
// issue replica writes
ceph_tid_t rep_tid = osd->get_tid();
RepGather* repop = new_repop(ctx, rep_tid);
issue_repop(repop, ctx);
// Tear down an op context: release its object locks and run (then erase)
// every registered on_finish callback. The erase(p++) idiom keeps the
// iterator valid while removing the current element.
// NOTE(review): the loop body and the ctx deletion after original line 4362
// are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::close_op_ctx(OpContext* ctx) {
release_object_locks(ctx->lock_manager);
for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
ctx->on_finish.erase(p++)) {
// Reply to the client with error `r` and dispose of the op context.
// NOTE(review): the close_op_ctx(ctx) call after original line 4371 is
// missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::reply_ctx(OpContext* ctx, int r)
osd->reply_op_error(ctx->op, r);
// Record per-op performance counters: overall op count/bytes/latency, plus
// the rw / read-only / write-only breakdowns (with latency histograms binned
// by byte count), and feed the dynamic per-client perf stats if enabled.
// `latency` is measured from message receive; `process_latency` from dequeue.
// NOTE(review): the `inb` parameter (original line 4376) and several closing
// braces are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::log_op_stats(const OpRequest& op,
const uint64_t outb)
auto m = op.get_req<MOSDOp>();
const utime_t now = ceph_clock_now();
const utime_t latency = now - m->get_recv_stamp();
const utime_t process_latency = now - op.get_dequeued_time();
osd->logger->inc(l_osd_op);
osd->logger->inc(l_osd_op_outb, outb);
osd->logger->inc(l_osd_op_inb, inb);
osd->logger->tinc(l_osd_op_lat, latency);
osd->logger->tinc(l_osd_op_process_lat, process_latency);
if (op.may_read() && op.may_write()) {
// read-modify-write ops
osd->logger->inc(l_osd_op_rw);
osd->logger->inc(l_osd_op_rw_inb, inb);
osd->logger->inc(l_osd_op_rw_outb, outb);
osd->logger->tinc(l_osd_op_rw_lat, latency);
osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
} else if (op.may_read()) {
// pure reads
osd->logger->inc(l_osd_op_r);
osd->logger->inc(l_osd_op_r_outb, outb);
osd->logger->tinc(l_osd_op_r_lat, latency);
osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
} else if (op.may_write() || op.may_cache()) {
// pure writes / cache ops
osd->logger->inc(l_osd_op_w);
osd->logger->inc(l_osd_op_w_inb, inb);
osd->logger->tinc(l_osd_op_w_lat, latency);
osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
dout(15) << "log_op_stats " << *m
<< " lat " << latency << dendl;
if (m_dynamic_perf_stats.is_enabled()) {
m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
// Install the set of OSD perf-metric queries driving dynamic per-client
// performance statistics for this PG.
void PrimaryLogPG::set_dynamic_perf_stats_queries(
const std::list<OSDPerfMetricQuery>& queries)
m_dynamic_perf_stats.set_queries(queries);
// Hand the accumulated dynamic perf stats to the caller and reset the local
// accumulator in one step (swap avoids copying and clears our state).
void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats* stats)
std::swap(m_dynamic_perf_stats, *stats);
// Handle an MOSDPGScan during backfill:
// - OP_SCAN_GET_DIGEST (on a backfill target): refuse if the OSD is too
//   full, otherwise scan a range of objects and reply with OP_SCAN_DIGEST;
// - OP_SCAN_DIGEST (on the primary): record the peer's interval and, once
//   all targets have answered, finish the recovery op.
// NOTE(review): the `op` parameter, the switch head, the scan_range call
// head and several braces are missing from this excerpt; tokens as-is.
void PrimaryLogPG::do_scan(
ThreadPool::TPHandle& handle)
auto m = op->get_req<MOSDPGScan>();
ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
dout(10) << "do_scan " << *m << dendl;
case MOSDPGScan::OP_SCAN_GET_DIGEST:
auto dpp = get_dpp();
if (osd->check_backfill_full(dpp)) {
// this OSD cannot accept backfill data: abort via peering event
dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
queue_peering_event(
std::make_shared<PGPeeringEvent>(
PeeringState::BackfillTooFull())));
BackfillInterval bi;
bi.begin = m->begin;
// No need to flush, there won't be any in progress writes occuring
cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max,
MOSDPGScan* reply = new MOSDPGScan(
MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap_epoch(), m->query_epoch,
spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
encode(bi.objects, reply->get_data());
osd->send_message_osd_cluster(reply, m->get_connection());
case MOSDPGScan::OP_SCAN_DIGEST:
pg_shard_t from = m->from;
// Check that from is in backfill_targets vector
ceph_assert(is_backfill_target(from));
BackfillInterval& bi = peer_backfill_info[from];
bi.begin = m->begin;
auto p = m->get_data().cbegin();
// take care to preserve ordering!
decode_noclear(bi.objects, p);
dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
<< " bi.objects.size()=" << bi.objects.size() << dendl;
if (waiting_on_backfill.erase(from)) {
if (waiting_on_backfill.empty()) {
// all backfill targets have replied with their digest
peer_backfill_info.size() ==
get_backfill_targets().size());
finish_recovery_op(hobject_t::get_max());
// we canceled backfill for a while due to a too full, and this
// is an extra response from a non-too-full peer
dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
// Handle an MOSDPGBackfill message:
// - OP_BACKFILL_FINISH (on a target): ack back to the primary, then queue a
//   peering event; - OP_BACKFILL_PROGRESS: persist updated backfill progress
//   in a transaction; - OP_BACKFILL_FINISH_ACK (on the primary): complete
//   the recovery op. osd_kill_backfill_at asserts are test injection points.
// NOTE(review): switch head, case braces, the progress update lambda body
// and the FINISH reply argument lines are missing from this excerpt.
void PrimaryLogPG::do_backfill(OpRequestRef op)
auto m = op->get_req<MOSDPGBackfill>();
ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
dout(10) << "do_backfill " << *m << dendl;
case MOSDPGBackfill::OP_BACKFILL_FINISH:
ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
MOSDPGBackfill* reply = new MOSDPGBackfill(
MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
spg_t(info.pgid.pgid, get_primary().shard));
reply->set_priority(recovery_state.get_recovery_op_priority());
osd->send_message_osd_cluster(reply, m->get_connection());
queue_peering_event(
std::make_shared<PGPeeringEvent>(
case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
ObjectStore::Transaction t;
recovery_state.update_backfill_progress(
m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
ceph_assert(tr == 0);
case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
// only the primary may receive the final ack
ceph_assert(is_primary());
ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
finish_recovery_op(hobject_t::get_max());
// Remove a list of objects on a backfill target (MOSDPGBackfillRemove).
// While remote-backfilling, also adjust the byte accounting: local bytes
// from on-disk stat(), logical (user) bytes from object_info — scaled by the
// EC data-chunk count for erasure pools. All removals go into one
// transaction that is queued at the end.
// NOTE(review): get_req() argument, `struct stat st` declaration, attribute
// name/bufferlist args of getattr, and result checks are missing from this
// excerpt (gaps in original numbering); tokens preserved as-is.
void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
const MOSDPGBackfillRemove* m = static_cast<const MOSDPGBackfillRemove*>(
ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
dout(7) << __func__ << " " << m->ls << dendl;
ObjectStore::Transaction t;
for (auto& p : m->ls) {
if (is_remote_backfilling()) {
// physical size on this shard
int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
pg_whoami.shard) , &st);
sub_local_num_bytes(st.st_size);
if (pool.info.is_erasure()) {
// logical size = per-shard size scaled by data chunk count
int r = osd->store->getattr(
ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
object_info_t oi(bv);
usersize = oi.size * pgbackend->get_ec_data_chunk_count();
dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
<< " can't get object info" << dendl;
usersize = st.st_size;
sub_num_bytes(usersize);
dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
<< " sub actual data by " << st.st_size
<< " sub num_bytes by " << usersize
remove_snap_mapped_object(t, p.first);
int r = osd->store->queue_transaction(ch, std::move(t), NULL);
ceph_assert(r == 0);
// Trim one clone object (coid) for snap_to_trim: recompute its remaining
// snaps, and either delete the clone (when no snaps remain) or rewrite its
// adjusted clone_snaps.  On success an OpContext describing the transaction
// is handed back through *ctxp.
// NOTE(review): this extract is lossy — several original lines (braces,
// early returns, log-entry construction arguments) are missing; comments
// below describe only what the visible code demonstrates.
4620 int PrimaryLogPG::trim_object(
4621 bool first
, const hobject_t
&coid
, snapid_t snap_to_trim
,
4622 PrimaryLogPG::OpContextUPtr
*ctxp
)
// Look up the clone's object context; without an obc/ssc the object
// cannot be trimmed and a cluster-log error is emitted.
4628 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
4629 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
4630 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4631 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
// Also need the head object's context to update the snapset afterwards.
4635 hobject_t head_oid
= coid
.get_head();
4636 ObjectContextRef head_obc
= get_object_context(head_oid
, false);
4638 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4639 << " repair needed, no snapset obc for " << head_oid
;
4643 SnapSet
& snapset
= obc
->ssc
->snapset
;
4645 object_info_t
&coi
= obc
->obs
.oi
;
// The clone must be listed in the snapset's clone_snaps map.
4646 auto citer
= snapset
.clone_snaps
.find(coid
.snap
);
4647 if (citer
== snapset
.clone_snaps
.end()) {
4648 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
4649 << " for object " << coid
<< "\n";
4652 set
<snapid_t
> old_snaps(citer
->second
.begin(), citer
->second
.end());
4653 if (old_snaps
.empty()) {
4654 osd
->clog
->error() << "No object info snaps for object " << coid
;
4658 dout(10) << coid
<< " old_snaps " << old_snaps
4659 << " old snapset " << snapset
<< dendl
;
4660 if (snapset
.seq
== 0) {
4661 osd
->clog
->error() << "No snapset.seq for object " << coid
;
// new_snaps = snaps the clone still belongs to after dropping
// snap_to_trim and anything already in the removed-snaps queue.
4665 set
<snapid_t
> new_snaps
;
4666 const OSDMapRef
& osdmap
= get_osdmap();
4667 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
4668 i
!= old_snaps
.end();
4670 if (!osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *i
) &&
4671 *i
!= snap_to_trim
) {
4672 new_snaps
.insert(*i
);
4676 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
// If nothing keeps this clone alive, locate it in snapset.clones so it
// can be erased below.
4678 if (new_snaps
.empty()) {
4679 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
4680 if (p
== snapset
.clones
.end()) {
4681 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
// Build an op context and take snaptrimmer write locks on both the
// clone and the head; failure to lock aborts the trim attempt.
4686 OpContextUPtr ctx
= simple_opc_create(obc
);
4687 ctx
->head_obc
= head_obc
;
4689 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4693 close_op_ctx(ctx
.release());
4694 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
4698 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4702 close_op_ctx(ctx
.release());
4703 dout(10) << __func__
<< ": Unable to get a wlock on " << head_oid
<< dendl
;
4707 ctx
->at_version
= get_next_version();
4709 PGTransaction
*t
= ctx
->op_t
.get();
// Remember the object count so the number trimmed can be reported later.
4711 int64_t num_objects_before_trim
= ctx
->delta_stats
.num_objects
;
// Case 1: no snaps remain -> delete the clone entirely.
4713 if (new_snaps
.empty()) {
4715 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
4716 << new_snaps
<< " ... deleting" << dendl
;
4719 ceph_assert(p
!= snapset
.clones
.end());
4721 snapid_t last
= coid
.snap
;
4722 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
4724 if (p
!= snapset
.clones
.begin()) {
4725 // not the oldest... merge overlap into next older clone
4726 vector
<snapid_t
>::iterator n
= p
- 1;
4727 hobject_t prev_coid
= coid
;
4728 prev_coid
.snap
= *n
;
// Only adjust byte accounting for the previous clone if it is
// actually present on this OSD.
4729 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
4731 if (adjust_prev_bytes
)
4732 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
4734 snapset
.clone_overlap
[*n
].intersection_of(
4735 snapset
.clone_overlap
[*p
]);
4737 if (adjust_prev_bytes
)
4738 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
// Decrement the per-PG stats the deleted clone contributed to.
4740 ctx
->delta_stats
.num_objects
--;
4742 ctx
->delta_stats
.num_objects_dirty
--;
4744 ctx
->delta_stats
.num_objects_omap
--;
4745 if (coi
.is_whiteout()) {
4746 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
4747 ctx
->delta_stats
.num_whiteouts
--;
4749 ctx
->delta_stats
.num_object_clones
--;
4750 if (coi
.is_cache_pinned())
4751 ctx
->delta_stats
.num_objects_pinned
--;
4752 if (coi
.has_manifest()) {
4753 dec_all_refcount_manifest(coi
, ctx
.get());
4754 ctx
->delta_stats
.num_objects_manifest
--;
4756 obc
->obs
.exists
= false;
// Remove every snapset record of the deleted clone.
4758 snapset
.clones
.erase(p
);
4759 snapset
.clone_overlap
.erase(last
);
4760 snapset
.clone_size
.erase(last
);
4761 snapset
.clone_snaps
.erase(last
);
4765 pg_log_entry_t::DELETE
,
4768 ctx
->obs
->oi
.version
,
4780 coi
= object_info_t(coid
);
4782 ctx
->at_version
.version
++;
// Case 2: snaps remain -> just rewrite the clone's adjusted snap list.
4784 // save adjusted snaps for this object
4785 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
4786 snapset
.clone_snaps
[coid
.snap
] =
4787 vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
4788 // we still do a 'modify' event on this object just to trigger a
4789 // snapmapper.update ... :(
4791 coi
.prior_version
= coi
.version
;
4792 coi
.version
= ctx
->at_version
;
4794 encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4795 t
->setattr(coid
, OI_ATTR
, bl
);
4799 pg_log_entry_t::MODIFY
,
4808 ctx
->at_version
.version
++;
// Now update (or remove) the head object's snapset.
4816 // save head snapset
4817 dout(10) << coid
<< " new snapset " << snapset
<< " on "
4818 << head_obc
->obs
.oi
<< dendl
;
// If no clones remain and the head is only a whiteout (and not a dirty
// cache-tier object or cache-pinned), remove the head itself.
4819 if (snapset
.clones
.empty() &&
4820 (head_obc
->obs
.oi
.is_whiteout() &&
4821 !(head_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
4822 !head_obc
->obs
.oi
.is_cache_pinned())) {
4823 // NOTE: this arguably constitutes minor interference with the
4824 // tiering agent if this is a cache tier since a snap trim event
4825 // is effectively evicting a whiteout we might otherwise want to
4827 dout(10) << coid
<< " removing " << head_oid
<< dendl
;
4830 pg_log_entry_t::DELETE
,
4833 head_obc
->obs
.oi
.version
,
4839 dout(10) << "removing snap head" << dendl
;
4840 object_info_t
& oi
= head_obc
->obs
.oi
;
// Undo the head object's contribution to the PG stats.
4841 ctx
->delta_stats
.num_objects
--;
4842 if (oi
.is_dirty()) {
4843 ctx
->delta_stats
.num_objects_dirty
--;
4846 ctx
->delta_stats
.num_objects_omap
--;
4847 if (oi
.is_whiteout()) {
4848 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
4849 ctx
->delta_stats
.num_whiteouts
--;
4851 if (oi
.is_cache_pinned()) {
4852 ctx
->delta_stats
.num_objects_pinned
--;
4854 if (oi
.has_manifest()) {
4855 ctx
->delta_stats
.num_objects_manifest
--;
4856 dec_all_refcount_manifest(oi
, ctx
.get());
4858 head_obc
->obs
.exists
= false;
4859 head_obc
->obs
.oi
= object_info_t(head_oid
);
4860 t
->remove(head_oid
);
// Otherwise keep the head and write back the updated snapset/object-info
// attributes; pre-octopus peers still expect SnapSet::snaps filtered.
4862 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
4863 // filter SnapSet::snaps for the benefit of pre-octopus
4864 // peers. This is perhaps overly conservative in that I'm not
4865 // certain they need this, but let's be conservative here.
4866 dout(10) << coid
<< " filtering snapset on " << head_oid
<< dendl
;
4867 snapset
.filter(pool
.info
);
4869 snapset
.snaps
.clear();
4871 dout(10) << coid
<< " writing updated snapset on " << head_oid
4872 << ", snapset is " << snapset
<< dendl
;
4875 pg_log_entry_t::MODIFY
,
4878 head_obc
->obs
.oi
.version
,
4885 head_obc
->obs
.oi
.prior_version
= head_obc
->obs
.oi
.version
;
4886 head_obc
->obs
.oi
.version
= ctx
->at_version
;
4888 map
<string
, bufferlist
, less
<>> attrs
;
4890 encode(snapset
, bl
);
4891 attrs
[SS_ATTR
] = std::move(bl
);
4894 encode(head_obc
->obs
.oi
, bl
,
4895 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4896 attrs
[OI_ATTR
] = std::move(bl
);
4897 t
->setattrs(head_oid
, attrs
);
4900 // Stats reporting - Set number of objects trimmed
4901 if (num_objects_before_trim
> ctx
->delta_stats
.num_objects
) {
4902 int64_t num_objects_trimmed
=
4903 num_objects_before_trim
- ctx
->delta_stats
.num_objects
;
4904 add_objects_trimmed_count(num_objects_trimmed
);
// Hand the prepared op context back to the caller.
4907 *ctxp
= std::move(ctx
);
// Kick the snap-trimmer state machine when this (active, primary) PG has
// snaps queued to trim, unless the cluster-wide nosnaptrim flag is set
// or the PG is pre-merge.
4911 void PrimaryLogPG::kick_snap_trim()
4913 ceph_assert(is_active());
4914 ceph_assert(is_primary());
4916 !state_test(PG_STATE_PREMERGE
) &&
4917 !snap_trimq
.empty()) {
// Respect the osdmap-level NOSNAPTRIM flag: log and do nothing.
4918 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM
)) {
4919 dout(10) << __func__
<< ": nosnaptrim set, not kicking" << dendl
;
4921 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
// Reset trim counters/stamps before driving the state machine.
4922 reset_objects_trimmed();
4923 set_snaptrim_begin_stamp();
4924 snap_trimmer_machine
.process_event(KickTrim());
// Called when a scrub finishes: if trimming work is pending on this clean,
// active primary, requeue the snap trimmer via a ScrubComplete event.
4929 void PrimaryLogPG::snap_trimmer_scrub_complete()
4931 if (is_primary() && is_active() && is_clean() && !snap_trimq
.empty()) {
4932 dout(10) << "scrub finished - requeuing snap_trimmer" << dendl
;
4933 snap_trimmer_machine
.process_event(ScrubComplete());
// Snap-trim work entry point.  Bails out if the PG is being deleted or has
// been reset since the work item was queued (queued epoch), then posts a
// DoSnapWork event to the trimmer state machine.
4937 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
4939 if (recovery_state
.is_deleting() || pg_has_reset_since(queued
)) {
4943 ceph_assert(is_primary());
4945 dout(10) << "snap_trimmer posting" << dendl
;
4946 snap_trimmer_machine
.process_event(DoSnapWork());
4947 dout(10) << "snap_trimmer complete" << dendl
;
// Generic xattr comparison helper: applies the CEPH_OSD_CMPXATTR_OP_*
// relational operator 'op' to lhs/rhs.  Lives in an anonymous namespace
// (closed at original line 4974).
// NOTE(review): the per-case return expressions are missing from this
// extract; only the case labels are visible.
4953 template<typename U
, typename V
>
4954 int do_cmp_xattr(int op
, const U
& lhs
, const V
& rhs
)
4957 case CEPH_OSD_CMPXATTR_OP_EQ
:
4959 case CEPH_OSD_CMPXATTR_OP_NE
:
4961 case CEPH_OSD_CMPXATTR_OP_GT
:
4963 case CEPH_OSD_CMPXATTR_OP_GTE
:
4965 case CEPH_OSD_CMPXATTR_OP_LT
:
4967 case CEPH_OSD_CMPXATTR_OP_LTE
:
4974 } // anonymous namespace
// Compare a caller-supplied u64 (v1) against the numeric value stored in
// an xattr bufferlist, using operator 'op'.  The xattr bytes are parsed
// with std::from_chars (locale-free, non-allocating); a parse failure is
// handled in the (elided) error branch.
4976 int PrimaryLogPG::do_xattr_cmp_u64(int op
, uint64_t v1
, bufferlist
& xattr
)
4980 if (xattr
.length()) {
4981 const char* first
= xattr
.c_str();
4982 if (auto [p
, ec
] = std::from_chars(first
, first
+ xattr
.length(), v2
);
4983 ec
!= std::errc()) {
4989 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
// Delegate the actual relational test to the templated helper.
4990 return do_cmp_xattr(op
, v1
, v2
);
// Compare a caller-supplied string (v1s) against the raw bytes of an xattr
// (viewed without copying via string_view), using operator 'op'.
4993 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4995 string_view
v2s(xattr
.c_str(), xattr
.length());
4996 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
// Delegate the actual relational test to the templated helper.
4997 return do_cmp_xattr(op
, v1s
, v2s
);
// Implement CEPH_OSD_OP_WRITESAME by expanding the repeated data pattern
// into a single ordinary CEPH_OSD_OP_WRITE and re-dispatching it through
// do_osd_ops().
5000 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
5002 ceph_osd_op
& op
= osd_op
.op
;
5003 vector
<OSDOp
> write_ops(1);
5004 OSDOp
& write_op
= write_ops
[0];
5005 uint64_t write_length
= op
.writesame
.length
;
// Validate: data_length must be non-zero and divide the total length...
5011 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
// ...and the supplied pattern must match the declared data_length.
5014 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
5015 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
// Replicate the pattern until the full write length is covered.
5019 while (write_length
) {
5020 write_op
.indata
.append(osd_op
.indata
);
5021 write_length
-= op
.writesame
.data_length
;
// Turn the expanded buffer into a plain WRITE over the same extent.
5024 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
5025 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
5026 write_op
.op
.extent
.length
= op
.writesame
.length
;
5027 result
= do_osd_ops(ctx
, write_ops
);
5029 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
5034 // ========================================================================
5035 // low level osd ops
// Convert a legacy TMAP object to OMAP: read the tmap header/values, then
// issue truncate + omap-set-header + omap-set-vals as sub-ops.
// CEPH_OSD_TMAP2OMAP_NULLOK in 'flags' tolerates a missing tmap (-ENODATA).
5037 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
5039 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
5040 bufferlist header
, vals
;
5041 int r
= _get_tmap(ctx
, &header
, &vals
);
5043 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
// Three sub-ops: wipe the byte data, then install the omap header/values.
5048 vector
<OSDOp
> ops(3);
5050 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
5051 ops
[0].op
.extent
.offset
= 0;
5052 ops
[0].op
.extent
.length
= 0;
5054 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
5055 ops
[1].indata
= std::move(header
);
5057 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
5058 ops
[2].indata
= std::move(vals
);
5060 return do_osd_ops(ctx
, ops
);
// Slow-path TMAPUP: decode the whole tmap ('bl') into an in-memory map,
// apply the update stream 'bp' operation by operation, re-encode, and
// write the result back with a single WRITEFULL.
// NOTE(review): the decode loop bodies and error returns are elided in
// this extract; only the op-code dispatch skeleton is visible.
5063 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
5064 OSDOp
& osd_op
, bufferlist
& bl
)
5068 map
<string
, bufferlist
> m
;
5070 auto p
= bl
.cbegin();
5073 ceph_assert(p
.end());
5083 case CEPH_OSD_TMAP_SET
: // insert key
5091 case CEPH_OSD_TMAP_RM
: // remove key
5093 if (!m
.count(key
)) {
5098 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
5102 case CEPH_OSD_TMAP_HDR
: // update header
// Re-encode header + map and write the whole object back.
5114 encode(header
, obl
);
5118 vector
<OSDOp
> nops(1);
5119 OSDOp
& newop
= nops
[0];
5120 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5121 newop
.op
.extent
.offset
= 0;
5122 newop
.op
.extent
.length
= obl
.length();
5124 do_osd_ops(ctx
, nops
);
// Fast-path TMAPUP: stream-merge the sorted update commands in 'bp' with
// the object's existing (sorted) tmap key data, building the new key data
// incrementally.  Falls back to do_tmapup_slow() when the update stream
// turns out to be unsorted.  Finishes with a WRITEFULL of the merged tmap.
// NOTE(review): this extract is lossy — loop headers, decode statements
// and returns are partially elided.
5128 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::const_iterator
& bp
, OSDOp
& osd_op
)
5130 bufferlist::const_iterator orig_bp
= bp
;
5133 dout(10) << "tmapup is a no-op" << dendl
;
5135 // read the whole object
5136 vector
<OSDOp
> nops(1);
5137 OSDOp
& newop
= nops
[0];
5138 newop
.op
.op
= CEPH_OSD_OP_READ
;
5139 newop
.op
.extent
.offset
= 0;
5140 newop
.op
.extent
.length
= 0;
5141 result
= do_osd_ops(ctx
, nops
);
5143 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
5145 dout(30) << " starting is \n";
5146 newop
.outdata
.hexdump(*_dout
);
5149 auto ip
= newop
.outdata
.cbegin();
5152 dout(30) << "the update command is: \n";
5153 osd_op
.indata
.hexdump(*_dout
);
// Decode the existing header; a leading TMAP_HDR command replaces it.
5159 if (newop
.outdata
.length()) {
5163 dout(10) << "tmapup header " << header
.length() << dendl
;
5165 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
5168 dout(10) << "tmapup new header " << header
.length() << dendl
;
5171 encode(header
, obl
);
5173 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
// Merge loop state: nextkey/nextval is the current key from the object,
// last_in_key tracks update-stream ordering, newkeydata accumulates output.
5176 bufferlist newkeydata
;
5177 string nextkey
, last_in_key
;
5179 bool have_next
= false;
5182 decode(nextkey
, ip
);
5183 decode(nextval
, ip
);
5185 while (!bp
.end() && !result
) {
5192 catch (ceph::buffer::error
& e
) {
// Update keys must arrive sorted; otherwise restart via the slow path.
5195 if (key
< last_in_key
) {
5196 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
5197 << "', falling back to an inefficient (unsorted) update" << dendl
;
5199 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
5203 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
5205 // skip existing intervening keys
5206 bool key_exists
= false;
5207 while (have_next
&& !key_exists
) {
5208 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
// Copy through existing keys that sort before the update key; an
// equal key is dropped (it is being replaced/removed).
5211 if (nextkey
< key
) {
5213 encode(nextkey
, newkeydata
);
5214 encode(nextval
, newkeydata
);
5215 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5217 // don't copy; discard old value. and stop.
5218 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
5223 decode(nextkey
, ip
);
5224 decode(nextval
, ip
);
// Apply the update op itself: SET overwrites, CREATE fails on an
// existing key, RM requires the key, RMSLOPPY tolerates absence.
5230 if (op
== CEPH_OSD_TMAP_SET
) {
5235 catch (ceph::buffer::error
& e
) {
5238 encode(key
, newkeydata
);
5239 encode(val
, newkeydata
);
5240 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
5242 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
5250 catch (ceph::buffer::error
& e
) {
5253 encode(key
, newkeydata
);
5254 encode(val
, newkeydata
);
5255 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
5257 } else if (op
== CEPH_OSD_TMAP_RM
) {
5262 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
5265 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
// Copy any remaining original keys and trailing raw data through.
5272 encode(nextkey
, newkeydata
);
5273 encode(nextval
, newkeydata
);
5274 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5278 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
5279 dout(20) << " keep trailing " << rest
.length()
5280 << " at " << newkeydata
.length() << dendl
;
5281 newkeydata
.claim_append(rest
);
5284 // encode final key count + key data
5285 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
5287 obl
.claim_append(newkeydata
);
5290 dout(30) << " final is \n";
5291 obl
.hexdump(*_dout
);
// Debug-only sanity decode of the merged tmap before writing it back.
5295 auto tp
= obl
.cbegin();
5298 map
<string
,bufferlist
> d
;
5300 ceph_assert(tp
.end());
5301 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
5306 dout(20) << "tmapput write " << obl
.length() << dendl
;
5307 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5308 newop
.op
.extent
.offset
= 0;
5309 newop
.op
.extent
.length
= obl
.length();
5311 do_osd_ops(ctx
, nops
);
// Validate an (offset, length) extent against the configured maximum
// object size 'max', logging through the given DoutPrefixProvider when the
// extent is out of range.  The overflow-safe form of the check (offset
// alone, then offset+length) rejects extents that would wrap past 'max'.
5317 static int check_offset_and_length(uint64_t offset
, uint64_t length
,
5318 uint64_t max
, DoutPrefixProvider
*dpp
)
5320 if (offset
>= max
||
5322 offset
+ length
> max
) {
5323 ldpp_dout(dpp
, 10) << __func__
<< " "
5324 << "osd_max_object_size: " << max
5325 << "; Hard limit of object size is 4GB." << dendl
;
// Completion context for async extent reads: records the returned length,
// and — when the read covered the whole object and a stored crc is known
// (maybe_crc) — verifies the data crc32c, raising a cluster-log error on
// mismatch (tolerated only with CEPH_OSD_OP_FLAG_FAILOK).
5332 struct FillInVerifyExtent
: public Context
{
5335 bufferlist
*outdatap
;
5336 std::optional
<uint32_t> maybe_crc
;
5341 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
5342 std::optional
<uint32_t> mc
, uint64_t size
,
5343 OSDService
*osd
, hobject_t soid
, uint32_t flags
) :
5344 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
5345 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
5346 void finish(int len
) override
{
5354 // whole object? can we verify the checksum?
5355 if (maybe_crc
&& *r
== size
) {
5356 uint32_t crc
= outdatap
->crc32c(-1);
5357 if (maybe_crc
!= crc
) {
5358 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
5359 << " != expected 0x" << *maybe_crc
5360 << std::dec
<< " on " << soid
;
5361 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
// Completion context that converts a plain read result into sparse-read
// wire format: a single {offset -> length} extent map followed by the data,
// swapped back into the caller's bufferlist.
5370 struct ToSparseReadResult
: public Context
{
5372 bufferlist
* data_bl
;
5373 uint64_t data_offset
;
5375 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
5377 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
5378 void finish(int r
) override
{
// Encode one extent covering the bytes actually read (r), then the data.
5386 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
5387 encode(extents
, outdata
);
5388 encode_destructively(*data_bl
, outdata
);
5389 data_bl
->swap(outdata
);
// Debug helper: concatenate the keys of a map<string, V> into one string
// (separator handling is in the elided portion of the loop body).
5393 template<typename V
>
5394 static string
list_keys(const map
<string
, V
>& m
) {
5396 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5400 s
.append(itr
->first
);
// Debug helper: render the entries of an arbitrary container into one
// string by iterating it (per-entry formatting is elided in this extract).
5405 template<typename T
>
5406 static string
list_entries(const T
& m
) {
5408 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
// Ensure ctx->new_obs describes an existing, non-whiteout object before a
// write: create it (bumping num_objects and, unless ignore_transaction,
// recording the create in the transaction), or clear an existing whiteout
// flag and fix up the whiteout count.
5417 void PrimaryLogPG::maybe_create_new_object(
5419 bool ignore_transaction
)
5421 ObjectState
& obs
= ctx
->new_obs
;
5423 ctx
->delta_stats
.num_objects
++;
5425 ceph_assert(!obs
.oi
.is_whiteout());
5426 obs
.oi
.new_object();
5427 if (!ignore_transaction
)
5428 ctx
->op_t
->create(obs
.oi
.soid
);
5429 } else if (obs
.oi
.is_whiteout()) {
// Object exists as a whiteout: un-whiteout it instead of creating.
5430 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
5431 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
5432 --ctx
->delta_stats
.num_whiteouts
;
// OpFinisher recorded against a sub-op while its async read is in flight;
// execute() completes the read-dependent work once the data has arrived.
5436 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
5439 explicit ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
5442 int execute() override
{
// Completion for an async CHECKSUM read: first runs FillInVerifyExtent to
// record/verify the read, then (on success) computes the checksum over the
// read data via PrimaryLogPG::finish_checksum, storing the result in
// osd_op.rval.  Owns fill_extent_ctx and deletes it in the destructor.
5447 struct C_ChecksumRead
: public Context
{
5448 PrimaryLogPG
*primary_log_pg
;
5450 Checksummer::CSumType csum_type
;
5451 bufferlist init_value_bl
;
5452 ceph_le64 read_length
;
5454 Context
*fill_extent_ctx
;
5456 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5457 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
5458 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5459 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5460 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5461 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
5462 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5463 &read_bl
, maybe_crc
, size
,
5464 osd
, soid
, flags
)) {
5466 ~C_ChecksumRead() override
{
5467 delete fill_extent_ctx
;
5470 void finish(int r
) override
{
// Run the extent-verify step first; it sets osd_op.rval.
5471 fill_extent_ctx
->complete(r
);
5472 fill_extent_ctx
= nullptr;
5474 if (osd_op
.rval
>= 0) {
5475 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5476 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
5477 &init_value_bl_it
, read_bl
);
// Handle CEPH_OSD_OP_CHECKSUM: validate the request (chunking, bounds),
// read the requested extent — asynchronously for EC pools, synchronously
// otherwise — and compute the requested checksum via finish_checksum().
// Returns -EINPROGRESS when the EC async-read path is taken.
// NOTE(review): several early-return statements are elided in this extract.
5482 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
5483 bufferlist::const_iterator
*bl_it
)
5485 dout(20) << __func__
<< dendl
;
// Chunked checksums require a non-zero, chunk-aligned length.
5487 auto& op
= osd_op
.op
;
5488 if (op
.checksum
.chunk_size
> 0) {
5489 if (op
.checksum
.length
== 0) {
5490 dout(10) << __func__
<< ": length required when chunk size provided"
5494 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5495 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
// Clamp the requested range to the object size (zeroed offset+length
// means "whole object").
5500 auto& oi
= ctx
->new_obs
.oi
;
5501 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
5502 // zeroed offset+length implies checksum whole object
5503 op
.checksum
.length
= oi
.size
;
5504 } else if (op
.checksum
.offset
>= oi
.size
) {
5505 // read size was trimmed to zero, do nothing
5506 // see PrimaryLogPG::do_read
5508 } else if (op
.extent
.offset
+ op
.extent
.length
> oi
.size
) {
5509 op
.extent
.length
= oi
.size
- op
.extent
.offset
;
5510 if (op
.checksum
.chunk_size
> 0 &&
5511 op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5512 dout(10) << __func__
<< ": length (trimmed to 0x"
5513 << std::hex
<< op
.checksum
.length
5514 << ") not aligned to chunk size 0x"
5515 << op
.checksum
.chunk_size
<< std::dec
// Map the wire checksum type onto the internal Checksummer type.
5521 Checksummer::CSumType csum_type
;
5522 switch (op
.checksum
.type
) {
5523 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
5524 csum_type
= Checksummer::CSUM_XXHASH32
;
5526 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
5527 csum_type
= Checksummer::CSUM_XXHASH64
;
5529 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
5530 csum_type
= Checksummer::CSUM_CRC32C
;
5533 dout(10) << __func__
<< ": unknown crc type ("
5534 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
// The client must supply an init value of the type-specific size.
5538 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
5539 if (bl_it
->get_remaining() < csum_init_value_size
) {
5540 dout(10) << __func__
<< ": init value not provided" << dendl
;
5544 bufferlist init_value_bl
;
5545 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
5546 csum_init_value_size
);
5547 *bl_it
+= csum_init_value_size
;
// EC pools: queue an async read, attaching the stored data digest when
// the read may cover the whole object so the crc can be cross-checked.
5549 if (pool
.info
.is_erasure() && op
.checksum
.length
> 0) {
5550 // If there is a data digest and it is possible we are reading
5551 // entire object, pass the digest.
5552 std::optional
<uint32_t> maybe_crc
;
5553 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5554 op
.checksum
.length
>= oi
.size
) {
5555 maybe_crc
= oi
.data_digest
;
5559 auto& soid
= oi
.soid
;
5560 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
5561 std::move(init_value_bl
), maybe_crc
,
5562 oi
.size
, osd
, soid
, op
.flags
);
5564 ctx
->pending_async_reads
.push_back({
5565 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
5566 {&checksum_ctx
->read_bl
, checksum_ctx
}});
5568 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5569 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5570 new ReadFinisher(osd_op
));
5571 return -EINPROGRESS
;
// Replicated pools: do a synchronous read and checksum it inline.
5575 std::vector
<OSDOp
> read_ops(1);
5576 auto& read_op
= read_ops
[0];
5577 if (op
.checksum
.length
> 0) {
5578 read_op
.op
.op
= CEPH_OSD_OP_READ
;
5579 read_op
.op
.flags
= op
.flags
;
5580 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
5581 read_op
.op
.extent
.length
= op
.checksum
.length
;
5582 read_op
.op
.extent
.truncate_size
= 0;
5583 read_op
.op
.extent
.truncate_seq
= 0;
5585 int r
= do_osd_ops(ctx
, read_ops
);
5587 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
5592 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5593 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
// Compute the requested checksum(s) over read_bl and append the result
// (count + per-chunk values) to osd_op.outdata.  The init value is decoded
// from *init_value_bl_it per checksum type.
5597 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
5598 Checksummer::CSumType csum_type
,
5599 bufferlist::const_iterator
*init_value_bl_it
,
5600 const bufferlist
&read_bl
) {
5601 dout(20) << __func__
<< dendl
;
// Sanity: a short read of an explicit-length request is an error.
5603 auto& op
= osd_op
.op
;
5605 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
5606 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
5607 << op
.checksum
.length
<< dendl
;
// chunk_size == 0 means one checksum over the whole read.
5611 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
5612 op
.checksum
.chunk_size
: read_bl
.length());
5613 uint32_t csum_count
= (csum_chunk_size
> 0 ?
5614 read_bl
.length() / csum_chunk_size
: 0);
5617 bufferptr csum_data
;
5618 if (csum_count
> 0) {
5619 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
5620 csum_data
= ceph::buffer::create(csum_value_size
* csum_count
);
5622 csum
.append(csum_data
);
// Dispatch on checksum type; each branch decodes its own init value and
// runs the corresponding Checksummer::calculate instantiation.
5624 switch (csum_type
) {
5625 case Checksummer::CSUM_XXHASH32
:
5627 Checksummer::xxhash32::init_value_t init_value
;
5628 decode(init_value
, *init_value_bl_it
);
5629 Checksummer::calculate
<Checksummer::xxhash32
>(
5630 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5634 case Checksummer::CSUM_XXHASH64
:
5636 Checksummer::xxhash64::init_value_t init_value
;
5637 decode(init_value
, *init_value_bl_it
);
5638 Checksummer::calculate
<Checksummer::xxhash64
>(
5639 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5643 case Checksummer::CSUM_CRC32C
:
5645 Checksummer::crc32c::init_value_t init_value
;
5646 decode(init_value
, *init_value_bl_it
);
5647 Checksummer::calculate
<Checksummer::crc32c
>(
5648 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
// Reply payload: checksum count followed by the packed checksum values.
5657 encode(csum_count
, osd_op
.outdata
);
5658 osd_op
.outdata
.claim_append(csum
);
// Completion for an async CMPEXT read: runs FillInVerifyExtent on the read
// result, then compares the data against the client payload via
// PrimaryLogPG::finish_extent_cmp, storing the verdict in osd_op.rval.
// Owns fill_extent_ctx (deleted in the destructor or in finish()).
5662 struct C_ExtentCmpRead
: public Context
{
5663 PrimaryLogPG
*primary_log_pg
;
5665 ceph_le64 read_length
{};
5667 Context
*fill_extent_ctx
;
5669 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5670 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5671 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5672 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5673 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5674 &read_bl
, maybe_crc
, size
,
5675 osd
, soid
, flags
)) {
5677 ~C_ExtentCmpRead() override
{
5678 delete fill_extent_ctx
;
5681 void finish(int r
) override
{
5685 delete fill_extent_ctx
;
5687 fill_extent_ctx
->complete(r
);
5689 fill_extent_ctx
= nullptr;
5691 if (osd_op
.rval
>= 0) {
5692 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
// Handle CEPH_OSD_OP_CMPEXT: clamp the extent to the (possibly truncated)
// object size, read it — asynchronously for EC pools (-EINPROGRESS),
// synchronously otherwise — and compare against the client data via
// finish_extent_cmp().  A missing/whiteout object compares as all zeros.
5697 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
5699 dout(20) << __func__
<< dendl
;
5700 ceph_osd_op
& op
= osd_op
.op
;
// Honor pending truncates: the effective size may be truncate_size.
5702 auto& oi
= ctx
->new_obs
.oi
;
5703 uint64_t size
= oi
.size
;
5704 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5705 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
5706 size
= op
.extent
.truncate_size
;
5709 if (op
.extent
.offset
>= size
) {
5710 op
.extent
.length
= 0;
5711 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5712 op
.extent
.length
= size
- op
.extent
.offset
;
// Degenerate cases short-circuit straight to the comparison.
5715 if (op
.extent
.length
== 0) {
5716 dout(20) << __func__
<< " zero length extent" << dendl
;
5717 return finish_extent_cmp(osd_op
, bufferlist
{});
5718 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
5719 dout(20) << __func__
<< " object DNE" << dendl
;
5720 return finish_extent_cmp(osd_op
, {});
5721 } else if (pool
.info
.is_erasure()) {
5722 // If there is a data digest and it is possible we are reading
5723 // entire object, pass the digest.
5724 std::optional
<uint32_t> maybe_crc
;
5725 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5726 op
.checksum
.length
>= oi
.size
) {
5727 maybe_crc
= oi
.data_digest
;
// EC pool: queue the read and defer comparison to C_ExtentCmpRead.
5731 auto& soid
= oi
.soid
;
5732 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
5733 osd
, soid
, op
.flags
);
5734 ctx
->pending_async_reads
.push_back({
5735 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
5736 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
5738 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5740 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5741 new ReadFinisher(osd_op
));
5742 return -EINPROGRESS
;
// Replicated pool: synchronous read of the same extent, then compare.
5746 vector
<OSDOp
> read_ops(1);
5747 OSDOp
& read_op
= read_ops
[0];
5749 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
5750 read_op
.op
.extent
.offset
= op
.extent
.offset
;
5751 read_op
.op
.extent
.length
= op
.extent
.length
;
5752 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
5753 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
5755 int result
= do_osd_ops(ctx
, read_ops
);
5757 derr
<< __func__
<< " failed " << result
<< dendl
;
5760 return finish_extent_cmp(osd_op
, read_op
.outdata
);
// Byte-compare the client payload (osd_op.indata) against the data read
// from the object; bytes beyond the read are treated as zero.  On the
// first mismatch return -MAX_ERRNO - idx so the client can recover the
// offending offset from the error code.
5763 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
5765 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
5766 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
5767 if (osd_op
.indata
[idx
] != read_byte
) {
5768 return (-MAX_ERRNO
- idx
);
// Handle CEPH_OSD_OP_READ: clamp the extent for pending truncates and
// object size, then either queue an async read (EC pools, with optional
// full-object crc verification via FillInVerifyExtent) or read
// synchronously through the backend, verifying the stored data digest on
// full-object reads and scheduling repair on mismatch.
5775 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5776 dout(20) << __func__
<< dendl
;
5777 auto& op
= osd_op
.op
;
5778 auto& oi
= ctx
->new_obs
.oi
;
5779 auto& soid
= oi
.soid
;
5780 __u32 seq
= oi
.truncate_seq
;
5781 uint64_t size
= oi
.size
;
5782 bool trimmed_read
= false;
5784 dout(30) << __func__
<< " oi.size: " << oi
.size
<< dendl
;
5785 dout(30) << __func__
<< " oi.truncate_seq: " << oi
.truncate_seq
<< dendl
;
5786 dout(30) << __func__
<< " op.extent.truncate_seq: " << op
.extent
.truncate_seq
<< dendl
;
5787 dout(30) << __func__
<< " op.extent.truncate_size: " << op
.extent
.truncate_size
<< dendl
;
5789 // are we beyond truncate_size?
5790 if ( (seq
< op
.extent
.truncate_seq
) &&
5791 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5792 (size
> op
.extent
.truncate_size
) )
5793 size
= op
.extent
.truncate_size
;
5795 if (op
.extent
.length
== 0) //length is zero mean read the whole object
5796 op
.extent
.length
= size
;
// Trim the request to the object's effective size; remember that we
// trimmed so a zero-length result is not re-issued as a real read.
5798 if (op
.extent
.offset
>= size
) {
5799 op
.extent
.length
= 0;
5800 trimmed_read
= true;
5801 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5802 op
.extent
.length
= size
- op
.extent
.offset
;
5803 trimmed_read
= true;
5806 dout(30) << __func__
<< "op.extent.length is now " << op
.extent
.length
<< dendl
;
5808 // read into a buffer
5810 if (trimmed_read
&& op
.extent
.length
== 0) {
5811 // read size was trimmed to zero and it is expected to do nothing
5812 // a read operation of 0 bytes does *not* do nothing, this is why
5813 // the trimmed_read boolean is needed
5814 } else if (pool
.info
.is_erasure()) {
5815 // The initialisation below is required to silence a false positive
5816 // -Wmaybe-uninitialized warning
5817 std::optional
<uint32_t> maybe_crc
;
5818 // If there is a data digest and it is possible we are reading
5819 // entire object, pass the digest. FillInVerifyExtent will
5820 // will check the oi.size again.
5821 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
5822 op
.extent
.length
>= oi
.size
)
5823 maybe_crc
= oi
.data_digest
;
// EC pool: queue the async read; FillInVerifyExtent records the length
// and verifies the crc when the whole object was read.
5824 ctx
->pending_async_reads
.push_back(
5826 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
5827 make_pair(&osd_op
.outdata
,
5828 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
5829 &osd_op
.outdata
, maybe_crc
, oi
.size
,
5830 osd
, soid
, op
.flags
))));
5831 dout(10) << " async_read noted for " << soid
<< dendl
;
5833 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5834 new ReadFinisher(osd_op
));
// Replicated pool: synchronous backend read into osd_op.outdata.
5836 int r
= pgbackend
->objects_read_sync(
5837 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
5838 // whole object? can we verify the checksum?
5839 if (r
>= 0 && op
.extent
.offset
== 0 &&
5840 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5841 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
5842 if (oi
.data_digest
!= crc
) {
5843 osd
->clog
->error() << info
.pgid
<< std::hex
5844 << " full-object read crc 0x" << crc
5845 << " != expected 0x" << oi
.data_digest
5846 << std::dec
<< " on " << soid
;
5847 r
= -EIO
; // try repair later
5851 r
= rep_repair_primary_object(soid
, ctx
);
5854 op
.extent
.length
= r
;
5855 else if (r
== -EAGAIN
) {
5859 op
.extent
.length
= 0;
5861 dout(10) << " read got " << r
<< " / " << op
.extent
.length
5862 << " bytes from obj " << soid
<< dendl
;
// Account read stats (num_rd counts ops, num_rd_kb rounds up to KiB).
5865 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5866 ctx
->delta_stats
.num_rd
++;
// Handle CEPH_OSD_OP_SPARSE_READ: clamp the extent, then either translate
// to a normal async read for EC pools (re-encoded as a sparse result via
// ToSparseReadResult) or fiemap + readv synchronously, verifying the
// stored data digest on full-object reads and repairing on mismatch.
5871 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5872 dout(20) << __func__
<< dendl
;
5873 auto& op
= osd_op
.op
;
5874 auto& oi
= ctx
->new_obs
.oi
;
5875 auto& soid
= oi
.soid
;
5876 uint64_t size
= oi
.size
;
5877 uint64_t offset
= op
.extent
.offset
;
5878 uint64_t length
= op
.extent
.length
;
5880 // are we beyond truncate_size?
5881 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5882 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5883 (size
> op
.extent
.truncate_size
)) {
5884 size
= op
.extent
.truncate_size
;
// Clamp the requested range to the effective object size.
5887 if (offset
> size
) {
5889 } else if (offset
+ length
> size
) {
5890 length
= size
- offset
;
5894 if (pool
.info
.is_erasure()) {
5895 // translate sparse read to a normal one if not supported
// EC pool: queue a plain async read; ToSparseReadResult rewrites the
// result into sparse-read wire format (extent map + data).
5898 ctx
->pending_async_reads
.push_back(
5900 boost::make_tuple(offset
, length
, op
.flags
),
5903 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
5904 &op
.extent
.length
))));
5905 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
5907 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5908 new ReadFinisher(osd_op
));
// Empty clamped range: reply with an empty extent map.
5910 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
5911 map
<uint64_t, uint64_t> extents
;
5912 encode(extents
, osd_op
.outdata
);
5915 // read into a buffer
// Replicated pool: fiemap to find allocated extents, then readv them.
5916 map
<uint64_t, uint64_t> m
;
5917 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5925 r
= pgbackend
->objects_readv_sync(soid
, std::move(m
), op
.flags
, &data_bl
);
5927 r
= rep_repair_primary_object(soid
, ctx
);
5933 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5934 // Maybe at first, there is no much whole objects. With continued use, more
5935 // and more whole object exist. So from this point, for spare-read add
5936 // checksum make sense.
5937 if ((uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5938 uint32_t crc
= data_bl
.crc32c(-1);
5939 if (oi
.data_digest
!= crc
) {
5940 osd
->clog
->error() << info
.pgid
<< std::hex
5941 << " full-object read crc 0x" << crc
5942 << " != expected 0x" << oi
.data_digest
5943 << std::dec
<< " on " << soid
;
5944 r
= rep_repair_primary_object(soid
, ctx
);
5951 op
.extent
.length
= r
;
// Reply payload: (possibly trimmed) extent map followed by the data.
5953 encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5954 ::encode_destructively(data_bl
, osd_op
.outdata
);
5956 dout(10) << " sparse_read got " << r
<< " bytes from object "
// Account read stats (num_rd counts ops, num_rd_kb rounds up to KiB).
5960 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5961 ctx
->delta_stats
.num_rd
++;
5965 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5968 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5969 ObjectState
& obs
= ctx
->new_obs
;
5970 object_info_t
& oi
= obs
.oi
;
5971 const hobject_t
& soid
= oi
.soid
;
5972 const bool skip_data_digest
= osd
->store
->has_builtin_csum() &&
5973 osd
->osd_skip_data_digest
;
5975 PGTransaction
* t
= ctx
->op_t
.get();
5977 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5979 ctx
->current_osd_subop_num
= 0;
5980 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5982 ceph_osd_op
& op
= osd_op
.op
;
5984 OpFinisher
* op_finisher
= nullptr;
5986 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5987 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5988 op_finisher
= op_finisher_it
->second
.get();
5992 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5993 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5994 // but the code in this function seems to treat them as native-endian. What should the
5996 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5998 dout(10) << "do_osd_op " << osd_op
<< dendl
;
6000 auto bp
= osd_op
.indata
.cbegin();
6002 // user-visible modification?
6004 // non user-visible modifications
6005 case CEPH_OSD_OP_WATCH
:
6006 case CEPH_OSD_OP_CACHE_EVICT
:
6007 case CEPH_OSD_OP_CACHE_FLUSH
:
6008 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6009 case CEPH_OSD_OP_UNDIRTY
:
6010 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
6011 case CEPH_OSD_OP_COPY_FROM2
:
6012 case CEPH_OSD_OP_CACHE_PIN
:
6013 case CEPH_OSD_OP_CACHE_UNPIN
:
6014 case CEPH_OSD_OP_SET_REDIRECT
:
6015 case CEPH_OSD_OP_SET_CHUNK
:
6016 case CEPH_OSD_OP_TIER_PROMOTE
:
6017 case CEPH_OSD_OP_TIER_FLUSH
:
6018 case CEPH_OSD_OP_TIER_EVICT
:
6021 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
6022 ctx
->user_modify
= true;
6025 // munge -1 truncate to 0 truncate
6026 if (ceph_osd_op_uses_extent(op
.op
) &&
6027 op
.extent
.truncate_seq
== 1 &&
6028 op
.extent
.truncate_size
== (-1ULL)) {
6029 op
.extent
.truncate_size
= 0;
6030 op
.extent
.truncate_seq
= 0;
6033 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
6034 if (op
.op
== CEPH_OSD_OP_ZERO
&&
6036 op
.extent
.offset
< static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
6037 op
.extent
.length
>= 1 &&
6038 op
.extent
.length
<= static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
6039 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
6040 if (op
.extent
.offset
>= oi
.size
) {
6044 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
6045 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
6046 op
.op
= CEPH_OSD_OP_TRUNCATE
;
6053 case CEPH_OSD_OP_CMPEXT
:
6055 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
6056 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6057 op
.extent
.length
, op
.extent
.truncate_size
,
6058 op
.extent
.truncate_seq
);
6060 if (op_finisher
== nullptr) {
6061 result
= do_extent_cmp(ctx
, osd_op
);
6063 result
= op_finisher
->execute();
6067 case CEPH_OSD_OP_SYNC_READ
:
6068 if (pool
.info
.is_erasure()) {
6069 result
= -EOPNOTSUPP
;
6073 case CEPH_OSD_OP_READ
:
6075 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
6076 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6077 op
.extent
.length
, op
.extent
.truncate_size
,
6078 op
.extent
.truncate_seq
);
6079 if (op_finisher
== nullptr) {
6080 if (!ctx
->data_off
) {
6081 ctx
->data_off
= op
.extent
.offset
;
6083 result
= do_read(ctx
, osd_op
);
6085 result
= op_finisher
->execute();
6089 case CEPH_OSD_OP_CHECKSUM
:
6092 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
6093 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
6094 op
.checksum
.offset
, op
.checksum
.length
,
6095 op
.checksum
.chunk_size
);
6097 if (op_finisher
== nullptr) {
6098 result
= do_checksum(ctx
, osd_op
, &bp
);
6100 result
= op_finisher
->execute();
6106 case CEPH_OSD_OP_MAPEXT
:
6107 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6108 if (pool
.info
.is_erasure()) {
6109 result
= -EOPNOTSUPP
;
6114 // read into a buffer
6116 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
6118 op
.extent
.offset
, op
.extent
.length
, bl
);
6119 osd_op
.outdata
= std::move(bl
);
6123 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6124 ctx
->delta_stats
.num_rd
++;
6125 dout(10) << " map_extents done on object " << soid
<< dendl
;
6130 case CEPH_OSD_OP_SPARSE_READ
:
6131 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
6132 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6133 op
.extent
.length
, op
.extent
.truncate_size
,
6134 op
.extent
.truncate_seq
);
6135 if (op_finisher
== nullptr) {
6136 result
= do_sparse_read(ctx
, osd_op
);
6138 result
= op_finisher
->execute();
6142 case CEPH_OSD_OP_CALL
:
6144 string cname
, mname
;
6147 bp
.copy(op
.cls
.class_len
, cname
);
6148 bp
.copy(op
.cls
.method_len
, mname
);
6149 bp
.copy(op
.cls
.indata_len
, indata
);
6150 } catch (ceph::buffer::error
& e
) {
6151 dout(10) << "call unable to decode class + method + indata" << dendl
;
6152 dout(30) << "in dump: ";
6153 osd_op
.indata
.hexdump(*_dout
);
6156 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
6159 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
6161 ClassHandler::ClassData
*cls
;
6162 result
= ClassHandler::get_instance().open_class(cname
, &cls
);
6163 ceph_assert(result
== 0); // init_op_flags() already verified this works.
6165 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
);
6167 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
6168 result
= -EOPNOTSUPP
;
6172 int flags
= method
->get_flags();
6173 if (flags
& CLS_METHOD_WR
)
6174 ctx
->user_modify
= true;
6177 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
6178 int prev_rd
= ctx
->num_read
;
6179 int prev_wr
= ctx
->num_write
;
6180 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
6182 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
6183 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
6187 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
6188 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
6193 dout(10) << "method called response length=" << outdata
.length() << dendl
;
6194 op
.extent
.length
= outdata
.length();
6195 osd_op
.outdata
.claim_append(outdata
);
6196 dout(30) << "out dump: ";
6197 osd_op
.outdata
.hexdump(*_dout
);
6202 case CEPH_OSD_OP_STAT
:
6203 // note: stat does not require RD
6205 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6207 if (obs
.exists
&& !oi
.is_whiteout()) {
6208 encode(oi
.size
, osd_op
.outdata
);
6209 encode(oi
.mtime
, osd_op
.outdata
);
6210 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
6213 dout(10) << "stat oi object does not exist" << dendl
;
6216 ctx
->delta_stats
.num_rd
++;
6220 case CEPH_OSD_OP_ISDIRTY
:
6223 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6224 bool is_dirty
= obs
.oi
.is_dirty();
6225 encode(is_dirty
, osd_op
.outdata
);
6226 ctx
->delta_stats
.num_rd
++;
6231 case CEPH_OSD_OP_UNDIRTY
:
6235 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6236 if (oi
.is_dirty()) {
6237 ctx
->undirty
= true; // see make_writeable()
6239 ctx
->delta_stats
.num_wr
++;
6244 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6248 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6249 if (ctx
->lock_type
!= RWState::RWNONE
) {
6250 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
6254 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6262 if (oi
.is_cache_pinned()) {
6263 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
6267 if (oi
.is_dirty()) {
6268 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, std::nullopt
);
6269 if (result
== -EINPROGRESS
)
6277 case CEPH_OSD_OP_CACHE_FLUSH
:
6281 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6282 if (ctx
->lock_type
== RWState::RWNONE
) {
6283 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
6287 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6295 if (oi
.is_cache_pinned()) {
6296 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
6301 if (oi
.is_dirty()) {
6302 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, std::nullopt
);
6303 if (result
== -EINPROGRESS
)
6308 // Check special return value which has set missing_return
6309 if (result
== -ENOENT
) {
6310 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
6311 ceph_assert(!missing
.is_min());
6312 wait_for_unreadable_object(missing
, ctx
->op
);
6313 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6319 case CEPH_OSD_OP_CACHE_EVICT
:
6323 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6324 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6332 if (oi
.is_cache_pinned()) {
6333 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
6337 if (oi
.is_dirty()) {
6341 if (!oi
.watchers
.empty()) {
6345 if (soid
.snap
== CEPH_NOSNAP
) {
6346 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
6350 result
= _delete_oid(ctx
, true, false);
6352 // mark that this is a cache eviction to avoid triggering normal
6353 // make_writeable() clone creation in finish_ctx()
6354 ctx
->cache_operation
= true;
6356 osd
->logger
->inc(l_osd_tier_evict
);
6360 case CEPH_OSD_OP_GETXATTR
:
6364 bp
.copy(op
.xattr
.name_len
, aname
);
6365 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6366 string name
= "_" + aname
;
6367 int r
= getattr_maybe_cache(
6372 op
.xattr
.value_len
= osd_op
.outdata
.length();
6374 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
6378 ctx
->delta_stats
.num_rd
++;
6382 case CEPH_OSD_OP_GETXATTRS
:
6385 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6386 map
<string
, bufferlist
,less
<>> out
;
6387 result
= getattrs_maybe_cache(
6393 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6394 ctx
->delta_stats
.num_rd
++;
6395 osd_op
.outdata
.claim_append(bl
);
6399 case CEPH_OSD_OP_CMPXATTR
:
6403 bp
.copy(op
.xattr
.name_len
, aname
);
6404 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6405 string name
= "_" + aname
;
6406 name
[op
.xattr
.name_len
+ 1] = 0;
6409 result
= getattr_maybe_cache(
6413 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
6416 ctx
->delta_stats
.num_rd
++;
6417 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(xattr
.length(), 10);
6419 switch (op
.xattr
.cmp_mode
) {
6420 case CEPH_OSD_CMPXATTR_MODE_STRING
:
6423 bp
.copy(op
.xattr
.value_len
, val
);
6424 val
[op
.xattr
.value_len
] = 0;
6425 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
6426 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6427 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
6431 case CEPH_OSD_CMPXATTR_MODE_U64
:
6437 catch (ceph::buffer::error
& e
) {
6441 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
6442 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6443 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
6448 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
6453 dout(10) << "comparison returned false" << dendl
;
6454 result
= -ECANCELED
;
6458 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
6462 dout(10) << "comparison returned true" << dendl
;
6466 case CEPH_OSD_OP_ASSERT_VER
:
6469 uint64_t ver
= op
.assert_ver
.ver
;
6470 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
6473 } else if (ver
< oi
.user_version
) {
6475 } else if (ver
> oi
.user_version
) {
6476 result
= -EOVERFLOW
;
6481 case CEPH_OSD_OP_LIST_WATCHERS
:
6484 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6485 obj_list_watch_response_t resp
;
6487 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
6488 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
6490 dout(20) << "key cookie=" << oi_iter
->first
.first
6491 << " entity=" << oi_iter
->first
.second
<< " "
6492 << oi_iter
->second
<< dendl
;
6493 ceph_assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
6494 ceph_assert(oi_iter
->first
.second
.is_client());
6496 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
6497 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
6498 resp
.entries
.push_back(wi
);
6501 resp
.encode(osd_op
.outdata
, ctx
->get_features());
6504 ctx
->delta_stats
.num_rd
++;
6508 case CEPH_OSD_OP_LIST_SNAPS
:
6511 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6512 obj_list_snap_response_t resp
;
6515 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
6518 dout(20) << " snapset " << ssc
->snapset
<< dendl
;
6520 int clonecount
= ssc
->snapset
.clones
.size();
6521 clonecount
++; // for head
6522 resp
.clones
.reserve(clonecount
);
6523 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
6524 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
6526 ci
.cloneid
= *clone_iter
;
6528 hobject_t clone_oid
= soid
;
6529 clone_oid
.snap
= *clone_iter
;
6531 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
6532 if (p
== ssc
->snapset
.clone_snaps
.end()) {
6533 osd
->clog
->error() << "osd." << osd
->whoami
6534 << ": inconsistent clone_snaps found for oid "
6535 << soid
<< " clone " << *clone_iter
6536 << " snapset " << ssc
->snapset
;
6540 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
6541 ci
.snaps
.push_back(*q
);
6544 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
6546 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
6547 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
6548 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
6549 osd
->clog
->error() << "osd." << osd
->whoami
6550 << ": inconsistent clone_overlap found for oid "
6551 << soid
<< " clone " << *clone_iter
;
6555 const interval_set
<uint64_t> &o
= coi
->second
;
6556 ci
.overlap
.reserve(o
.num_intervals());
6557 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
6558 r
!= o
.end(); ++r
) {
6559 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
6563 map
<snapid_t
, uint64_t>::const_iterator si
;
6564 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
6565 if (si
== ssc
->snapset
.clone_size
.end()) {
6566 osd
->clog
->error() << "osd." << osd
->whoami
6567 << ": inconsistent clone_size found for oid "
6568 << soid
<< " clone " << *clone_iter
;
6572 ci
.size
= si
->second
;
6574 resp
.clones
.push_back(ci
);
6579 if (!ctx
->obc
->obs
.oi
.is_whiteout()) {
6580 ceph_assert(obs
.exists
);
6582 ci
.cloneid
= CEPH_NOSNAP
;
6584 //Size for HEAD is oi.size
6587 resp
.clones
.push_back(ci
);
6589 resp
.seq
= ssc
->snapset
.seq
;
6591 resp
.encode(osd_op
.outdata
);
6594 ctx
->delta_stats
.num_rd
++;
6598 case CEPH_OSD_OP_NOTIFY
:
6605 uint32_t ver
; // obsolete
6607 decode(timeout
, bp
);
6609 } catch (const ceph::buffer::error
&e
) {
6612 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
6614 timeout
= cct
->_conf
->osd_default_notify_timeout
;
6617 n
.timeout
= timeout
;
6618 n
.notify_id
= osd
->get_next_id(get_osdmap_epoch());
6619 n
.cookie
= op
.notify
.cookie
;
6621 ctx
->notifies
.push_back(n
);
6623 // return our unique notify id to the client
6624 encode(n
.notify_id
, osd_op
.outdata
);
6628 case CEPH_OSD_OP_NOTIFY_ACK
:
6632 uint64_t notify_id
= 0;
6633 uint64_t watch_cookie
= 0;
6634 decode(notify_id
, bp
);
6635 decode(watch_cookie
, bp
);
6636 bufferlist reply_bl
;
6638 decode(reply_bl
, bp
);
6640 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
6641 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
6642 ctx
->notify_acks
.push_back(ack
);
6643 } catch (const ceph::buffer::error
&e
) {
6644 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
6645 OpContext::NotifyAck
ack(
6646 // op.watch.cookie is actually the notify_id for historical reasons
6649 ctx
->notify_acks
.push_back(ack
);
6654 case CEPH_OSD_OP_SETALLOCHINT
:
6658 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
6659 maybe_create_new_object(ctx
);
6660 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
6661 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
6662 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
6663 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
6664 op
.alloc_hint
.expected_write_size
,
6665 op
.alloc_hint
.flags
);
6672 // -- object data --
6674 case CEPH_OSD_OP_WRITE
:
6678 __u32 seq
= oi
.truncate_seq
;
6679 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6680 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6685 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6686 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6688 if (pool
.info
.requires_aligned_append() &&
6689 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
6690 result
= -EOPNOTSUPP
;
6695 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
6696 result
= -EOPNOTSUPP
;
6699 } else if (op
.extent
.offset
!= oi
.size
&&
6700 pool
.info
.requires_aligned_append()) {
6701 result
= -EOPNOTSUPP
;
6705 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
6706 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
6707 // old write, arrived after trimtrunc
6708 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
6709 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
6710 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
6712 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
6713 osd_op
.indata
.swap(t
);
6715 if (op
.extent
.truncate_seq
> seq
) {
6716 // write arrives before trimtrunc
6717 if (obs
.exists
&& !oi
.is_whiteout()) {
6718 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6719 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
6720 t
->truncate(soid
, op
.extent
.truncate_size
);
6721 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6722 oi
.truncate_size
= op
.extent
.truncate_size
;
6723 if (oi
.size
> op
.extent
.truncate_size
) {
6724 interval_set
<uint64_t> trim
;
6725 trim
.insert(op
.extent
.truncate_size
,
6726 oi
.size
- op
.extent
.truncate_size
);
6727 ctx
->modified_ranges
.union_of(trim
);
6728 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.truncate_size
, oi
.size
- op
.extent
.truncate_size
);
6729 oi
.clear_data_digest();
6731 if (op
.extent
.truncate_size
!= oi
.size
) {
6732 truncate_update_size_and_usage(ctx
->delta_stats
,
6734 op
.extent
.truncate_size
);
6737 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6738 << ", but object is new" << dendl
;
6739 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6740 oi
.truncate_size
= op
.extent
.truncate_size
;
6743 result
= check_offset_and_length(
6744 op
.extent
.offset
, op
.extent
.length
,
6745 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6749 maybe_create_new_object(ctx
);
6751 if (op
.extent
.length
== 0) {
6752 if (op
.extent
.offset
> oi
.size
) {
6754 soid
, op
.extent
.offset
);
6755 truncate_update_size_and_usage(ctx
->delta_stats
, oi
,
6762 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6765 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
6766 && !skip_data_digest
) {
6767 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6768 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
6769 if (skip_data_digest
) {
6770 obs
.oi
.clear_data_digest();
6772 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
6775 obs
.oi
.clear_data_digest();
6777 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6778 op
.extent
.offset
, op
.extent
.length
);
6779 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6780 dout(10) << "clean_regions modified" << ctx
->clean_regions
<< dendl
;
6784 case CEPH_OSD_OP_WRITEFULL
:
6787 { // write full object
6788 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
6790 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6794 result
= check_offset_and_length(
6795 0, op
.extent
.length
,
6796 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6800 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6801 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6803 maybe_create_new_object(ctx
);
6804 if (pool
.info
.is_erasure()) {
6805 t
->truncate(soid
, 0);
6806 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
6807 t
->truncate(soid
, op
.extent
.length
);
6809 if (op
.extent
.length
) {
6810 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6812 if (!skip_data_digest
) {
6813 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6815 obs
.oi
.clear_data_digest();
6817 ctx
->clean_regions
.mark_data_region_dirty(0,
6818 std::max((uint64_t)op
.extent
.length
, oi
.size
));
6819 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6820 0, op
.extent
.length
, true);
6824 case CEPH_OSD_OP_WRITESAME
:
6826 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
6827 result
= do_writesame(ctx
, osd_op
);
6830 case CEPH_OSD_OP_ROLLBACK
:
6832 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6833 result
= _rollback_to(ctx
, osd_op
);
6836 case CEPH_OSD_OP_ZERO
:
6837 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6838 if (pool
.info
.requires_aligned_append()) {
6839 result
= -EOPNOTSUPP
;
6844 result
= check_offset_and_length(
6845 op
.extent
.offset
, op
.extent
.length
,
6846 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6850 if (op
.extent
.length
&& obs
.exists
&& !oi
.is_whiteout()) {
6851 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
6852 interval_set
<uint64_t> ch
;
6853 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
6854 ctx
->modified_ranges
.union_of(ch
);
6855 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6856 ctx
->delta_stats
.num_wr
++;
6857 oi
.clear_data_digest();
6863 case CEPH_OSD_OP_CREATE
:
6867 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6868 if (obs
.exists
&& !oi
.is_whiteout() &&
6869 (op
.flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
6870 result
= -EEXIST
; /* this is an exclusive create */
6872 if (osd_op
.indata
.length()) {
6873 auto p
= osd_op
.indata
.cbegin();
6876 decode(category
, p
);
6878 catch (ceph::buffer::error
& e
) {
6882 // category is no longer implemented.
6884 maybe_create_new_object(ctx
);
6890 case CEPH_OSD_OP_TRIMTRUNC
:
6891 op
.extent
.offset
= op
.extent
.truncate_size
;
6894 case CEPH_OSD_OP_TRUNCATE
:
6895 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6896 if (pool
.info
.requires_aligned_append()) {
6897 result
= -EOPNOTSUPP
;
6904 if (!obs
.exists
|| oi
.is_whiteout()) {
6905 dout(10) << " object dne, truncate is a no-op" << dendl
;
6909 result
= check_offset_and_length(
6910 op
.extent
.offset
, op
.extent
.length
,
6911 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6915 if (op
.extent
.truncate_seq
) {
6916 ceph_assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6917 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6918 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6919 << ", no-op" << dendl
;
6922 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6923 << ", truncating" << dendl
;
6924 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6925 oi
.truncate_size
= op
.extent
.truncate_size
;
6928 maybe_create_new_object(ctx
);
6929 t
->truncate(soid
, op
.extent
.offset
);
6930 if (oi
.size
> op
.extent
.offset
) {
6931 interval_set
<uint64_t> trim
;
6932 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6933 ctx
->modified_ranges
.union_of(trim
);
6934 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, oi
.size
- op
.extent
.offset
);
6935 } else if (oi
.size
< op
.extent
.offset
) {
6936 ctx
->clean_regions
.mark_data_region_dirty(oi
.size
, op
.extent
.offset
- oi
.size
);
6938 if (op
.extent
.offset
!= oi
.size
) {
6939 truncate_update_size_and_usage(ctx
->delta_stats
,
6943 ctx
->delta_stats
.num_wr
++;
6944 // do not set exists, or we will break above DELETE -> TRUNCATE munging.
6946 oi
.clear_data_digest();
6950 case CEPH_OSD_OP_DELETE
:
6953 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6955 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6959 case CEPH_OSD_OP_WATCH
:
6963 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6964 op
.watch
.cookie
, op
.watch
.op
);
6970 uint64_t cookie
= op
.watch
.cookie
;
6971 entity_name_t entity
= ctx
->reqid
.name
;
6972 ObjectContextRef obc
= ctx
->obc
;
6974 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6975 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6976 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6977 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6978 dout(10) << "watch: peer_addr="
6979 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6981 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6982 if (op
.watch
.timeout
!= 0) {
6983 timeout
= op
.watch
.timeout
;
6986 watch_info_t
w(cookie
, timeout
,
6987 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6988 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6989 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6990 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6991 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6993 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6994 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6995 t
->nop(soid
); // make sure update the object_info on disk!
6997 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6998 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6999 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
7000 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
7004 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
7005 ctx
->watch_connects
.push_back(make_pair(w
, true));
7006 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
7007 /* Note: WATCH with PING doesn't cause may_write() to return true,
7008 * so if there is nothing else in the transaction, this is going
7009 * to run do_osd_op_effects, but not write out a log entry */
7010 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
7014 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
7015 obc
->watchers
.find(make_pair(cookie
, entity
));
7016 if (p
== obc
->watchers
.end() ||
7017 !p
->second
->is_connected()) {
7018 // client needs to reconnect
7019 result
= -ETIMEDOUT
;
7022 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
7023 p
->second
->got_ping(ceph_clock_now());
7025 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
7026 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
7027 oi
.watchers
.find(make_pair(cookie
, entity
));
7028 if (oi_iter
!= oi
.watchers
.end()) {
7029 dout(10) << " removed watch " << oi_iter
->second
<< " by "
7031 oi
.watchers
.erase(oi_iter
);
7032 t
->nop(soid
); // update oi on disk
7033 ctx
->watch_disconnects
.push_back(
7034 watch_disconnect_t(cookie
, entity
, false));
7036 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
7042 case CEPH_OSD_OP_CACHE_PIN
:
7043 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7044 if ((!pool
.info
.is_tier() ||
7045 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
7047 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
7053 if (!obs
.exists
|| oi
.is_whiteout()) {
7058 if (!oi
.is_cache_pinned()) {
7059 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
7061 ctx
->delta_stats
.num_objects_pinned
++;
7062 ctx
->delta_stats
.num_wr
++;
7067 case CEPH_OSD_OP_CACHE_UNPIN
:
7068 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7069 if ((!pool
.info
.is_tier() ||
7070 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
7072 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
7078 if (!obs
.exists
|| oi
.is_whiteout()) {
7083 if (oi
.is_cache_pinned()) {
7084 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
7086 ctx
->delta_stats
.num_objects_pinned
--;
7087 ctx
->delta_stats
.num_wr
++;
7092 case CEPH_OSD_OP_SET_REDIRECT
:
7096 if (pool
.info
.is_tier()) {
7104 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7105 result
= -EOPNOTSUPP
;
7109 object_t target_name
;
7110 object_locator_t target_oloc
;
7111 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
7112 version_t target_version
= op
.copy_from
.src_version
;
7114 decode(target_name
, bp
);
7115 decode(target_oloc
, bp
);
7117 catch (ceph::buffer::error
& e
) {
7122 result
= get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
7124 dout(5) << " pool information is invalid: " << result
<< dendl
;
7127 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
7128 raw_pg
.ps(), raw_pg
.pool(),
7129 target_oloc
.nspace
);
7130 if (target
== soid
) {
7131 dout(20) << " set-redirect self is invalid" << dendl
;
7136 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
7137 bool has_reference
= (oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7138 if (has_reference
) {
7140 dout(5) << " the object is already a manifest " << dendl
;
7143 if (op_finisher
== nullptr && need_reference
) {
7145 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7146 new SetManifestFinisher(osd_op
));
7147 ManifestOpRef mop
= std::make_shared
<ManifestOp
>(ctx
->obc
, new RefCountCallback(ctx
, osd_op
));
7148 auto* fin
= new C_SetManifestRefCountDone(this, soid
, 0);
7149 ceph_tid_t tid
= refcount_manifest(soid
, target
,
7150 refcount_t::INCREMENT_REF
, fin
, std::nullopt
);
7154 manifest_ops
[soid
] = mop
;
7155 ctx
->obc
->start_block();
7156 result
= -EINPROGRESS
;
7160 result
= op_finisher
->execute();
7161 ceph_assert(result
== 0);
7164 if (!oi
.has_manifest() && !oi
.manifest
.is_redirect())
7165 ctx
->delta_stats
.num_objects_manifest
++;
7167 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7168 oi
.manifest
.redirect_target
= target
;
7169 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
7170 t
->truncate(soid
, 0);
7171 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
7172 if (oi
.is_omap() && pool
.info
.supports_omap()) {
7173 t
->omap_clear(soid
);
7174 obs
.oi
.clear_omap_digest();
7175 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7176 ctx
->clean_regions
.mark_omap_dirty();
7178 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
7180 ctx
->delta_stats
.num_bytes
-= oi
.size
;
7183 oi
.user_version
= target_version
;
7184 ctx
->user_at_version
= target_version
;
7186 map
<string
,bufferlist
,less
<>> rmattrs
;
7187 result
= getattrs_maybe_cache(ctx
->obc
, &rmattrs
);
7189 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
7192 map
<string
, bufferlist
>::iterator iter
;
7193 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
7194 const string
& name
= iter
->first
;
7195 t
->rmattr(soid
, name
);
7197 if (!has_reference
&& need_reference
) {
7198 oi
.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7200 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
7202 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7209 case CEPH_OSD_OP_SET_CHUNK
:
7213 if (pool
.info
.is_tier()) {
7221 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7222 result
= -EOPNOTSUPP
;
7225 if (oi
.manifest
.is_redirect()) {
7230 object_locator_t tgt_oloc
;
7231 uint64_t src_offset
, src_length
, tgt_offset
;
7234 decode(src_offset
, bp
);
7235 decode(src_length
, bp
);
7236 decode(tgt_oloc
, bp
);
7237 decode(tgt_name
, bp
);
7238 decode(tgt_offset
, bp
);
7240 catch (ceph::buffer::error
& e
) {
7249 if (src_offset
+ src_length
> oi
.size
) {
7253 if (!(osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
)) {
7254 result
= -EOPNOTSUPP
;
7257 if (pool
.info
.is_erasure()) {
7258 result
= -EOPNOTSUPP
;
7262 for (auto &p
: oi
.manifest
.chunk_map
) {
7263 interval_set
<uint64_t> chunk
;
7264 chunk
.insert(p
.first
, p
.second
.length
);
7265 if (chunk
.intersects(src_offset
, src_length
)) {
7266 dout(20) << __func__
<< " overlapped !! offset: " << src_offset
<< " length: " << src_length
7267 << " chunk_info: " << p
<< dendl
;
7268 result
= -EOPNOTSUPP
;
7274 chunk_info_t chunk_info
;
7275 result
= get_osdmap()->object_locator_to_pg(tgt_name
, tgt_oloc
, raw_pg
);
7277 dout(5) << " pool information is invalid: " << result
<< dendl
;
7280 hobject_t
target(tgt_name
, tgt_oloc
.key
, snapid_t(),
7281 raw_pg
.ps(), raw_pg
.pool(),
7283 bool has_reference
= (oi
.manifest
.chunk_map
.find(src_offset
) != oi
.manifest
.chunk_map
.end()) &&
7284 (oi
.manifest
.chunk_map
[src_offset
].test_flag(chunk_info_t::FLAG_HAS_REFERENCE
));
7285 if (has_reference
) {
7287 dout(5) << " the object is already a manifest " << dendl
;
7290 chunk_info
.oid
= target
;
7291 chunk_info
.offset
= tgt_offset
;
7292 chunk_info
.length
= src_length
;
7293 if (op_finisher
== nullptr) {
7295 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7296 new SetManifestFinisher(osd_op
));
7297 object_manifest_t set_chunk
;
7298 bool need_inc_ref
= false;
7299 set_chunk
.chunk_map
[src_offset
] = chunk_info
;
7300 need_inc_ref
= inc_refcount_by_set(ctx
, set_chunk
, osd_op
);
7302 result
= -EINPROGRESS
;
7307 result
= op_finisher
->execute();
7308 ceph_assert(result
== 0);
7311 oi
.manifest
.chunk_map
[src_offset
] = chunk_info
;
7312 if (!oi
.has_manifest() && !oi
.manifest
.is_chunked())
7313 ctx
->delta_stats
.num_objects_manifest
++;
7314 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7315 oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
7316 if (!has_reference
) {
7317 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_REFERENCE
);
7320 ctx
->cache_operation
= true;
7322 dout(10) << "set-chunked oid:" << oi
.soid
<< " user_version: " << oi
.user_version
7323 << " chunk_info: " << chunk_info
<< dendl
;
7325 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7331 case CEPH_OSD_OP_TIER_PROMOTE
:
7335 if (pool
.info
.is_tier()) {
7343 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7344 result
= -EOPNOTSUPP
;
7347 if (!obs
.oi
.has_manifest()) {
7352 if (op_finisher
== nullptr) {
7353 PromoteManifestCallback
*cb
;
7354 object_locator_t my_oloc
;
7357 if (obs
.oi
.manifest
.is_chunked()) {
7358 src_hoid
= obs
.oi
.soid
;
7359 } else if (obs
.oi
.manifest
.is_redirect()) {
7360 object_locator_t
src_oloc(obs
.oi
.manifest
.redirect_target
);
7362 src_hoid
= obs
.oi
.manifest
.redirect_target
;
7364 ceph_abort_msg("unrecognized manifest type");
7366 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7367 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7368 new PromoteFinisher(cb
));
7369 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
7370 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
7371 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
7372 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
7373 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
7374 start_copy(cb
, ctx
->obc
, src_hoid
, my_oloc
, 0, flags
,
7375 obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
7376 src_fadvise_flags
, 0);
7378 dout(10) << "tier-promote oid:" << oi
.soid
<< " manifest: " << obs
.oi
.manifest
<< dendl
;
7379 result
= -EINPROGRESS
;
7381 result
= op_finisher
->execute();
7382 ceph_assert(result
== 0);
7383 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7389 case CEPH_OSD_OP_TIER_FLUSH
:
7393 if (pool
.info
.is_tier()) {
7401 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7402 result
= -EOPNOTSUPP
;
7406 if (oi
.is_dirty() || !obs
.oi
.has_manifest()) {
7407 result
= start_flush(ctx
->op
, ctx
->obc
, true, NULL
, std::nullopt
, true);
7408 if (result
== -EINPROGRESS
)
7417 case CEPH_OSD_OP_TIER_EVICT
:
7421 if (pool
.info
.is_tier()) {
7429 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7430 result
= -EOPNOTSUPP
;
7433 if (!obs
.oi
.has_manifest()) {
7438 // The chunks already has a reference, so it is just enough to invoke truncate if necessary
7439 for (auto &p
: obs
.oi
.manifest
.chunk_map
) {
7440 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
7442 t
->zero(soid
, p
.first
, p
.second
.length
);
7443 interval_set
<uint64_t> ch
;
7444 ch
.insert(p
.first
, p
.second
.length
);
7445 ctx
->modified_ranges
.union_of(ch
);
7446 ctx
->clean_regions
.mark_data_region_dirty(p
.first
, p
.second
.length
);
7448 oi
.clear_data_digest();
7449 ctx
->delta_stats
.num_wr
++;
7450 ctx
->cache_operation
= true;
7451 ctx
->undirty
= true;
7452 osd
->logger
->inc(l_osd_tier_evict
);
7457 case CEPH_OSD_OP_UNSET_MANIFEST
:
7461 if (pool
.info
.is_tier()) {
7469 if (!oi
.has_manifest()) {
7470 result
= -EOPNOTSUPP
;
7473 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7474 result
= -EOPNOTSUPP
;
7478 dec_all_refcount_manifest(oi
, ctx
);
7480 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
7481 oi
.manifest
= object_manifest_t();
7482 ctx
->delta_stats
.num_objects_manifest
--;
7483 ctx
->delta_stats
.num_wr
++;
7489 // -- object attrs --
7491 case CEPH_OSD_OP_SETXATTR
:
7495 if (cct
->_conf
->osd_max_attr_size
> 0 &&
7496 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
7497 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7501 unsigned max_name_len
=
7502 std::min
<uint64_t>(osd
->store
->get_max_attr_name_length(),
7503 cct
->_conf
->osd_max_attr_name_len
);
7504 if (op
.xattr
.name_len
> max_name_len
) {
7505 result
= -ENAMETOOLONG
;
7508 maybe_create_new_object(ctx
);
7510 bp
.copy(op
.xattr
.name_len
, aname
);
7511 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7512 string name
= "_" + aname
;
7514 bp
.copy(op
.xattr
.value_len
, bl
);
7515 t
->setattr(soid
, name
, bl
);
7516 ctx
->delta_stats
.num_wr
++;
7520 case CEPH_OSD_OP_RMXATTR
:
7525 bp
.copy(op
.xattr
.name_len
, aname
);
7526 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7527 if (!obs
.exists
|| oi
.is_whiteout()) {
7531 string name
= "_" + aname
;
7532 t
->rmattr(soid
, name
);
7533 ctx
->delta_stats
.num_wr
++;
7538 // -- fancy writers --
7539 case CEPH_OSD_OP_APPEND
:
7541 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
7542 // just do it inline; this works because we are happy to execute
7543 // fancy op on replicas as well.
7544 vector
<OSDOp
> nops(1);
7545 OSDOp
& newop
= nops
[0];
7546 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
7547 newop
.op
.extent
.offset
= oi
.size
;
7548 newop
.op
.extent
.length
= op
.extent
.length
;
7549 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
7550 newop
.indata
= osd_op
.indata
;
7551 result
= do_osd_ops(ctx
, nops
);
7552 osd_op
.outdata
= std::move(newop
.outdata
);
7556 case CEPH_OSD_OP_STARTSYNC
:
7561 // -- trivial map --
7562 case CEPH_OSD_OP_TMAPGET
:
7563 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7564 if (pool
.info
.is_erasure()) {
7565 result
= -EOPNOTSUPP
;
7569 vector
<OSDOp
> nops(1);
7570 OSDOp
& newop
= nops
[0];
7571 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
7572 newop
.op
.extent
.offset
= 0;
7573 newop
.op
.extent
.length
= 0;
7574 result
= do_osd_ops(ctx
, nops
);
7575 osd_op
.outdata
= std::move(newop
.outdata
);
7579 case CEPH_OSD_OP_TMAPPUT
:
7580 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7581 if (pool
.info
.is_erasure()) {
7582 result
= -EOPNOTSUPP
;
7586 //_dout_lock.Lock();
7587 //osd_op.data.hexdump(*_dout);
7588 //_dout_lock.Unlock();
7590 // verify sort order
7591 bool unsorted
= false;
7601 dout(10) << "tmapput key " << key
<< dendl
;
7604 if (key
< last_key
) {
7605 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
7614 vector
<OSDOp
> nops(1);
7615 OSDOp
& newop
= nops
[0];
7616 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
7617 newop
.op
.extent
.offset
= 0;
7618 newop
.op
.extent
.length
= osd_op
.indata
.length();
7619 newop
.indata
= osd_op
.indata
;
7622 bp
= osd_op
.indata
.begin();
7624 map
<string
, bufferlist
> m
;
7627 ceph_assert(bp
.end());
7629 encode(header
, newbl
);
7631 newop
.indata
= newbl
;
7633 result
= do_osd_ops(ctx
, nops
);
7634 ceph_assert(result
== 0);
7638 case CEPH_OSD_OP_TMAPUP
:
7639 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7640 if (pool
.info
.is_erasure()) {
7641 result
= -EOPNOTSUPP
;
7645 result
= do_tmapup(ctx
, bp
, osd_op
);
7648 case CEPH_OSD_OP_TMAP2OMAP
:
7650 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7651 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
7655 case CEPH_OSD_OP_OMAPGETKEYS
:
7659 uint64_t max_return
;
7661 decode(start_after
, bp
);
7662 decode(max_return
, bp
);
7664 catch (ceph::buffer::error
& e
) {
7666 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
7669 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7670 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7672 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
7676 bool truncated
= false;
7678 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7679 ch
, ghobject_t(soid
)
7682 iter
->upper_bound(start_after
);
7683 for (num
= 0; iter
->valid(); ++num
, iter
->next()) {
7684 if (num
>= max_return
||
7685 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7689 encode(iter
->key(), bl
);
7691 } // else return empty out_set
7692 encode(num
, osd_op
.outdata
);
7693 osd_op
.outdata
.claim_append(bl
);
7694 encode(truncated
, osd_op
.outdata
);
7695 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7696 ctx
->delta_stats
.num_rd
++;
7700 case CEPH_OSD_OP_OMAPGETVALS
:
7704 uint64_t max_return
;
7705 string filter_prefix
;
7707 decode(start_after
, bp
);
7708 decode(max_return
, bp
);
7709 decode(filter_prefix
, bp
);
7711 catch (ceph::buffer::error
& e
) {
7713 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
7716 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7717 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7719 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
7722 bool truncated
= false;
7725 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7726 ch
, ghobject_t(soid
)
7732 iter
->upper_bound(start_after
);
7733 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
7736 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
7737 ++num
, iter
->next()) {
7738 dout(20) << "Found key " << iter
->key() << dendl
;
7739 if (num
>= max_return
||
7740 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7744 encode(iter
->key(), bl
);
7745 encode(iter
->value(), bl
);
7747 } // else return empty out_set
7748 encode(num
, osd_op
.outdata
);
7749 osd_op
.outdata
.claim_append(bl
);
7750 encode(truncated
, osd_op
.outdata
);
7751 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7752 ctx
->delta_stats
.num_rd
++;
7756 case CEPH_OSD_OP_OMAPGETHEADER
:
7757 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7758 if (!oi
.is_omap()) {
7759 // return empty header
7764 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
7765 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7766 ctx
->delta_stats
.num_rd
++;
7770 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
7773 set
<string
> keys_to_get
;
7775 decode(keys_to_get
, bp
);
7777 catch (ceph::buffer::error
& e
) {
7779 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7782 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
7783 map
<string
, bufferlist
> out
;
7785 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
7786 } // else return empty omap entries
7787 encode(out
, osd_op
.outdata
);
7788 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7789 ctx
->delta_stats
.num_rd
++;
7793 case CEPH_OSD_OP_OMAP_CMP
:
7796 if (!obs
.exists
|| oi
.is_whiteout()) {
7798 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7801 map
<string
, pair
<bufferlist
, int> > assertions
;
7803 decode(assertions
, bp
);
7805 catch (ceph::buffer::error
& e
) {
7807 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7810 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
7812 map
<string
, bufferlist
> out
;
7816 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7817 i
!= assertions
.end();
7819 to_get
.insert(i
->first
);
7820 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
7826 } // else leave out empty
7828 //Should set num_rd_kb based on encode length of map
7829 ctx
->delta_stats
.num_rd
++;
7833 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7834 i
!= assertions
.end();
7836 auto out_entry
= out
.find(i
->first
);
7837 bufferlist
&bl
= (out_entry
!= out
.end()) ?
7838 out_entry
->second
: empty
;
7839 switch (i
->second
.second
) {
7840 case CEPH_OSD_CMPXATTR_OP_EQ
:
7841 if (!(bl
== i
->second
.first
)) {
7845 case CEPH_OSD_CMPXATTR_OP_LT
:
7846 if (!(bl
< i
->second
.first
)) {
7850 case CEPH_OSD_CMPXATTR_OP_GT
:
7851 if (!(bl
> i
->second
.first
)) {
7869 case CEPH_OSD_OP_OMAPSETVALS
:
7870 if (!pool
.info
.supports_omap()) {
7871 result
= -EOPNOTSUPP
;
7872 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7878 maybe_create_new_object(ctx
);
7879 bufferlist to_set_bl
;
7881 decode_str_str_map_to_bl(bp
, &to_set_bl
);
7883 catch (ceph::buffer::error
& e
) {
7885 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7888 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7889 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 20>()) {
7890 dout(20) << "setting vals: " << dendl
;
7891 map
<string
,bufferlist
> to_set
;
7892 bufferlist::const_iterator pt
= to_set_bl
.begin();
7894 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
7897 dout(20) << "\t" << i
->first
<< dendl
;
7900 t
->omap_setkeys(soid
, to_set_bl
);
7901 ctx
->clean_regions
.mark_omap_dirty();
7902 ctx
->delta_stats
.num_wr
++;
7903 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(to_set_bl
.length(), 10);
7905 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7906 obs
.oi
.clear_omap_digest();
7909 case CEPH_OSD_OP_OMAPSETHEADER
:
7910 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7911 if (!pool
.info
.supports_omap()) {
7912 result
= -EOPNOTSUPP
;
7918 maybe_create_new_object(ctx
);
7919 t
->omap_setheader(soid
, osd_op
.indata
);
7920 ctx
->clean_regions
.mark_omap_dirty();
7921 ctx
->delta_stats
.num_wr
++;
7923 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7924 obs
.oi
.clear_omap_digest();
7927 case CEPH_OSD_OP_OMAPCLEAR
:
7928 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7929 if (!pool
.info
.supports_omap()) {
7930 result
= -EOPNOTSUPP
;
7936 if (!obs
.exists
|| oi
.is_whiteout()) {
7941 t
->omap_clear(soid
);
7942 ctx
->clean_regions
.mark_omap_dirty();
7943 ctx
->delta_stats
.num_wr
++;
7944 obs
.oi
.clear_omap_digest();
7945 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7950 case CEPH_OSD_OP_OMAPRMKEYS
:
7951 if (!pool
.info
.supports_omap()) {
7952 result
= -EOPNOTSUPP
;
7953 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7959 if (!obs
.exists
|| oi
.is_whiteout()) {
7961 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7964 bufferlist to_rm_bl
;
7966 decode_str_set_to_bl(bp
, &to_rm_bl
);
7968 catch (ceph::buffer::error
& e
) {
7970 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7973 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7974 t
->omap_rmkeys(soid
, to_rm_bl
);
7975 ctx
->clean_regions
.mark_omap_dirty();
7976 ctx
->delta_stats
.num_wr
++;
7978 obs
.oi
.clear_omap_digest();
7981 case CEPH_OSD_OP_OMAPRMKEYRANGE
:
7982 tracepoint(osd
, do_osd_op_pre_omaprmkeyrange
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7983 if (!pool
.info
.supports_omap()) {
7984 result
= -EOPNOTSUPP
;
7990 if (!obs
.exists
|| oi
.is_whiteout()) {
7994 std::string key_begin
, key_end
;
7996 decode(key_begin
, bp
);
7997 decode(key_end
, bp
);
7998 } catch (ceph::buffer::error
& e
) {
8002 t
->omap_rmkeyrange(soid
, key_begin
, key_end
);
8003 ctx
->clean_regions
.mark_omap_dirty();
8004 ctx
->delta_stats
.num_wr
++;
8006 obs
.oi
.clear_omap_digest();
8009 case CEPH_OSD_OP_COPY_GET
:
8011 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
8013 if (op_finisher
== nullptr) {
8014 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
8016 result
= op_finisher
->execute();
8020 case CEPH_OSD_OP_COPY_FROM
:
8021 case CEPH_OSD_OP_COPY_FROM2
:
8026 object_locator_t src_oloc
;
8027 uint32_t truncate_seq
= 0;
8028 uint64_t truncate_size
= 0;
8029 bool have_truncate
= false;
8030 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
8031 version_t src_version
= op
.copy_from
.src_version
;
8033 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
8034 (op
.copy_from
.flags
& ~CEPH_OSD_COPY_FROM_FLAGS
)) {
8035 dout(20) << "invalid copy-from2 flags 0x"
8036 << std::hex
<< (int)op
.copy_from
.flags
<< std::dec
<< dendl
;
8041 decode(src_name
, bp
);
8042 decode(src_oloc
, bp
);
8043 // check if client sent us truncate_seq and truncate_size
8044 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
8045 (op
.copy_from
.flags
& CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ
)) {
8046 decode(truncate_seq
, bp
);
8047 decode(truncate_size
, bp
);
8048 have_truncate
= true;
8051 catch (ceph::buffer::error
& e
) {
8054 do_osd_op_pre_copy_from
,
8055 soid
.oid
.name
.c_str(),
8067 do_osd_op_pre_copy_from
,
8068 soid
.oid
.name
.c_str(),
8070 src_name
.name
.c_str(),
8072 src_oloc
.key
.c_str(),
8073 src_oloc
.nspace
.c_str(),
8077 if (op_finisher
== nullptr) {
8080 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
8081 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
8082 raw_pg
.ps(), raw_pg
.pool(),
8085 dout(20) << " copy from self is invalid" << dendl
;
8089 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
8091 cb
->set_truncate(truncate_seq
, truncate_size
);
8092 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8093 new CopyFromFinisher(cb
));
8094 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
8097 op
.copy_from
.src_fadvise_flags
,
8099 result
= -EINPROGRESS
;
8102 result
= op_finisher
->execute();
8103 ceph_assert(result
== 0);
8105 // COPY_FROM cannot be executed multiple times -- it must restart
8106 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
8112 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
8113 dout(1) << "unrecognized osd op " << op
.op
8114 << " " << ceph_osd_op_name(op
.op
)
8116 result
= -EOPNOTSUPP
;
8120 osd_op
.rval
= result
;
8121 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
8122 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
) &&
8123 result
!= -EAGAIN
&& result
!= -EINPROGRESS
)
8130 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
// _get_tmap: read the object's legacy TMAP blob by issuing an internal
// CEPH_OSD_OP_TMAPGET sub-op via do_osd_ops(), then split the sub-op's
// outdata into *header and *vals.
// NOTE(review): this region of the file is a garbled extraction; several
// interior source lines (returns, the header decode) are missing, so the
// comments below describe only what the visible fragments show.
8135 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
// A zero-sized object cannot carry a tmap; log and take the early-out
// path (the return statement itself is not visible in this extraction).
8137 if (ctx
->new_obs
.oi
.size
== 0) {
8138 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
// Build a one-element op vector holding a TMAPGET and execute it locally.
8141 vector
<OSDOp
> nops(1);
8142 OSDOp
&newop
= nops
[0];
8143 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
8144 do_osd_ops(ctx
, nops
);
// Walk the sub-op's output. Presumably *header is decoded from outdata
// here (that line is missing from this extraction — TODO confirm against
// the original file); *vals then receives the remaining undecoded bytes
// verbatim via substr_of().
8146 bufferlist::const_iterator i
= newop
.outdata
.begin();
8148 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
// Both the failure and success outcomes of the decode are logged.
8150 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
8154 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
// _verify_no_head_clones: before a head object may be evicted, confirm
// that every clone recorded in the SnapSet is already absent locally and
// has no pending copy/promote; otherwise eviction must be refused.
// NOTE(review): garbled extraction — some interior lines (return
// statements, the loop increment) are missing; comments reflect only the
// visible fragments.
8159 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
8162 // verify that all clones have been evicted
8163 dout(20) << __func__
<< " verifying clones are absent "
// Walk each clone snapid listed in the snapset and form the clone's oid.
8165 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
8166 p
!= ss
.clones
.end();
8168 hobject_t clone_oid
= soid
;
8169 clone_oid
.snap
= *p
;
// A clone the PG knows to be missing is handled by the (not visible)
// branch body — TODO confirm against the original file.
8170 if (is_missing_object(clone_oid
))
// If the clone object still exists locally, the head cannot be evicted
// before it.
8172 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
8173 if (clone_obc
&& clone_obc
->obs
.exists
) {
8174 dout(10) << __func__
<< " cannot evict head before clone "
8175 << clone_oid
<< dendl
;
// A pending copy/promote targeting the clone also blocks head eviction.
8178 if (copy_ops
.count(clone_oid
)) {
8179 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
8180 << clone_oid
<< dendl
;
// _delete_oid: delete — or, on a cache tier, whiteout — the object
// described by ctx->new_obs inside the in-flight transaction, updating
// ctx->delta_stats and disconnecting any watchers.
//   no_whiteout:     never whiteout, no matter what.
//   try_no_whiteout: prefer a real delete when clones permit it.
// NOTE(review): garbled extraction — many interior lines (e.g. the
// t->remove()/return statements and several branch bodies) are missing;
// the comments below describe only the visible fragments.
8187 inline int PrimaryLogPG::_delete_oid(
8189 bool no_whiteout
, // no whiteouts, no matter what.
8190 bool try_no_whiteout
) // try not to whiteout
// Aliases into the op context: the new snapset/object state being built,
// and the transaction the delete is recorded into.
8192 SnapSet
& snapset
= ctx
->new_snapset
;
8193 ObjectState
& obs
= ctx
->new_obs
;
8194 object_info_t
& oi
= obs
.oi
;
8195 const hobject_t
& soid
= oi
.soid
;
8196 PGTransaction
* t
= ctx
->op_t
.get();
8198 // cache: set whiteout on delete?
8199 bool whiteout
= false;
// Whiteouts are only considered when this pool runs a cache mode and the
// caller did not ask to avoid them.
8200 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
8202 && !try_no_whiteout
) {
8206 // in luminous or later, we can't delete the head if there are
8207 // clones. we trust the caller passing no_whiteout has already
8208 // verified they don't exist.
8209 if (!snapset
.clones
.empty() ||
8210 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
8212 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
8215 dout(20) << __func__
<< " has or will have clones; will whiteout"
// Trace the final whiteout decision alongside the caller's flags.
8220 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
8221 << " no_whiteout=" << (int)no_whiteout
8222 << " try_no_whiteout=" << (int)try_no_whiteout
// Nothing to do if the object is already gone, or is already a whiteout
// and we would only whiteout it again.
8224 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
// Account the object's whole extent as modified and dirty so replicas /
// clients see the delete's effect on the data range.
8230 interval_set
<uint64_t> ch
;
8231 ch
.insert(0, oi
.size
);
8232 ctx
->modified_ranges
.union_of(ch
);
8233 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
8236 ctx
->clean_regions
.mark_omap_dirty();
8237 ctx
->delta_stats
.num_wr
++;
// Byte accounting differs for snap clones (clone_overlap-based bytes)
// versus heads (plain oi.size).
8238 if (soid
.is_snap()) {
8239 ceph_assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
8240 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
8242 ctx
->delta_stats
.num_bytes
-= oi
.size
;
8247 // disconnect all watchers
8248 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
8249 oi
.watchers
.begin();
8250 p
!= oi
.watchers
.end();
8252 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
8253 ctx
->watch_disconnects
.push_back(
8254 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
8256 oi
.watchers
.clear();
// Whiteout path: flag the object instead of removing it and bump the
// whiteout counters (the branch-selection lines are not visible here).
8259 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
8260 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
8261 ctx
->delta_stats
.num_whiteouts
++;
8263 osd
->logger
->inc(l_osd_tier_whiteout
);
// Dropping a manifest object must also release its chunk/redirect refs.
8267 if (oi
.has_manifest()) {
8268 ctx
->delta_stats
.num_objects_manifest
--;
8269 dec_all_refcount_manifest(oi
, ctx
);
// Real-delete path: object (and, where applicable, clone / whiteout /
// cache-pin) counters all go down.
8273 ctx
->delta_stats
.num_objects
--;
8275 ctx
->delta_stats
.num_object_clones
--;
8276 if (oi
.is_whiteout()) {
8277 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
8278 ctx
->delta_stats
.num_whiteouts
--;
8279 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
8281 if (oi
.is_cache_pinned()) {
8282 ctx
->delta_stats
.num_objects_pinned
--;
// _rollback_to: handle a snapshot-rollback op. Look up the clone that
// matches the requested snapid, then either block the write (clone
// degraded / cache busy / promote pending), delete the head (no such
// snapshot), or clone the snapshot back over the head via
// _do_rollback_to().
// NOTE(review): garbled extraction — interior lines (returns, the
// argument lists of maybe_handle_manifest_detail()/maybe_handle_cache_
// detail(), several branch bodies) are missing; the comments below
// describe only the visible fragments.
8288 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, OSDOp
& op
)
// Aliases into the op context plus the snapid the client asked for.
8290 ObjectState
& obs
= ctx
->new_obs
;
8291 object_info_t
& oi
= obs
.oi
;
8292 const hobject_t
& soid
= oi
.soid
;
8293 snapid_t snapid
= (uint64_t)op
.op
.snap
.snapid
;
8294 hobject_t missing_oid
;
8296 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
8298 ObjectContextRef rollback_to
;
// Locate the object context for (oid, requested snapid).
8300 int ret
= find_object_context(
8301 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
8302 soid
.get_namespace()),
8303 &rollback_to
, false, false, &missing_oid
);
// -EAGAIN: the target clone exists but is missing/backfilling locally;
// block the write until it has been recovered.
8304 if (ret
== -EAGAIN
) {
8305 /* clone must be missing */
8306 ceph_assert(is_degraded_or_backfilling_object(missing_oid
) || is_degraded_on_async_recovery_target(missing_oid
));
8307 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8308 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
8309 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
// Tiering/manifest handling before the rollback proper.
8313 ObjectContextRef promote_obc
;
8314 cache_result_t tier_mode_result
;
8315 if (obs
.exists
&& obs
.oi
.has_manifest()) {
8317 * In the case of manifest object, the object_info exists on the base tier at all time,
8318 * so promote_obc should be equal to rollback_to
8320 promote_obc
= rollback_to
;
// NOTE(review): the argument lists of these two calls were dropped by
// the extraction; only the call sites remain visible.
8322 maybe_handle_manifest_detail(
8328 maybe_handle_cache_detail(
// React to the tiering decision: proceed, block behind a promote or a
// full cache, or abort on states that cannot occur on the primary here.
8338 switch (tier_mode_result
) {
8339 case cache_result_t::NOOP
:
8341 case cache_result_t::BLOCKED_PROMOTE
:
8342 ceph_assert(promote_obc
);
8343 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
8345 case cache_result_t::BLOCKED_FULL
:
8346 block_write_on_full_cache(soid
, ctx
->op
);
8348 case cache_result_t::REPLIED_WITH_EAGAIN
:
8349 ceph_abort_msg("this can't happen, no rollback on replica");
8351 ceph_abort_msg("must promote was set, other values are not valid");
8356 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
8357 // there's no snapshot here, or there's no object.
8358 // if there's no snapshot, we delete the object; otherwise, do nothing.
8359 dout(20) << "_rollback_to deleting head on " << soid
.oid
8360 << " because got ENOENT|whiteout on find_object_context" << dendl
;
8361 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
8362 // Cannot delete an object with watchers
8365 _delete_oid(ctx
, false, false);
8369 // ummm....huh? It *can't* return anything else at time of writing.
8370 ceph_abort_msg("unexpected error code in _rollback_to");
8371 } else { //we got our context, let's use it to do the rollback!
8372 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
// Can't roll back onto a degraded clone; block until it is recovered.
8373 if (is_degraded_or_backfilling_object(rollback_to_sobject
) ||
8374 is_degraded_on_async_recovery_target(rollback_to_sobject
)) {
8375 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8376 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
8377 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
8379 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
8380 // rolling back to the head; we just need to clone it.
// Chunked-manifest clone: the head must first take references on the
// clone's chunks (may complete asynchronously via a SetManifestFinisher).
8383 if (rollback_to
->obs
.oi
.has_manifest() && rollback_to
->obs
.oi
.manifest
.is_chunked()) {
8385 * looking at the following case, the foo head needs the reference of chunk4 and chunk5
8386 * in case snap[1] is removed.
8388 * Before rollback to snap[1]:
8390 * foo snap[1]: [chunk4] [chunk5]
8391 * foo snap[0]: [ chunk2 ]
8392 * foo head : [chunk1] [chunk3]
8396 * foo snap[1]: [chunk4] [chunk5]
8397 * foo snap[0]: [ chunk2 ]
8398 * foo head : [chunk4] [chunk5]
// Re-entrancy: a finisher stored under this subop number means we are
// resuming after the async refcount step completed.
8401 OpFinisher
* op_finisher
= nullptr;
8402 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
8403 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
8404 op_finisher
= op_finisher_it
->second
.get();
8407 bool need_inc_ref
= inc_refcount_by_set(ctx
, rollback_to
->obs
.oi
.manifest
, op
);
// First pass and refs still pending: park a finisher and report
// -EINPROGRESS so the op is retried when the refcount completes.
8409 ceph_assert(op_finisher_it
== ctx
->op_finishers
.end());
8410 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8411 new SetManifestFinisher(op
));
8412 return -EINPROGRESS
;
8415 op_finisher
->execute();
8416 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
// Finally clone the snapshot's content back over the head.
8419 _do_rollback_to(ctx
, rollback_to
, op
);
// Perform the actual rollback of a head object to the clone 'rollback_to':
// deletes/overwrites the current head via t->clone(), recomputes clone
// overlaps, and copies size/digests/manifest/omap state from the clone.
// NOTE(review): this excerpt is elided (original line numbers jump); the
// third parameter (the OSDOp 'op' used below) and several braces are not
// visible here — confirm against the full source before editing.
8425 void PrimaryLogPG::_do_rollback_to(OpContext
*ctx
, ObjectContextRef rollback_to
,
// Aliases into the op context: the snapset/object state being built.
8428 SnapSet
& snapset
= ctx
->new_snapset
;
8429 ObjectState
& obs
= ctx
->new_obs
;
8430 object_info_t
& oi
= obs
.oi
;
8431 const hobject_t
& soid
= oi
.soid
;
8432 PGTransaction
* t
= ctx
->op_t
.get();
// Requested snapid comes from the client op (presumably the ROLLBACK op).
8433 snapid_t snapid
= (uint64_t)op
.op
.snap
.snapid
;
8434 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
8436 /* 1) Delete current head
8437 * 2) Clone correct snapshot into head
8438 * 3) Calculate clone_overlaps by following overlaps
8439 * forward from rollback snapshot */
8440 dout(10) << "_do_rollback_to deleting " << soid
.oid
8441 << " and rolling back to old snap" << dendl
;
// If the head currently carries a manifest, drop all chunk references
// before it is replaced by the clone's contents.
8445 if (obs
.oi
.has_manifest()) {
8446 dec_all_refcount_manifest(obs
.oi
, ctx
);
8447 oi
.manifest
.clear();
8448 oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
8449 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
8450 ctx
->delta_stats
.num_objects_manifest
--;
8451 ctx
->cache_operation
= true; // do not trigger to call ref function to calculate refcount
// Clone the rollback target over the head in the transaction.
8454 t
->clone(soid
, rollback_to_sobject
);
8455 t
->add_obc(rollback_to
);
// Walk clone_overlap forward from the rollback snap, intersecting overlaps
// to find the ranges unchanged between the clone and the head.
8457 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
8458 snapset
.clone_overlap
.lower_bound(snapid
);
8459 ceph_assert(iter
!= snapset
.clone_overlap
.end());
8460 interval_set
<uint64_t> overlaps
= iter
->second
;
8462 iter
!= snapset
.clone_overlap
.end();
8464 overlaps
.intersection_of(iter
->second
);
// Everything in [0, head size) not covered by the overlap is modified.
8466 if (obs
.oi
.size
> 0) {
8467 interval_set
<uint64_t> modified
;
8468 modified
.insert(0, obs
.oi
.size
);
8469 overlaps
.intersection_of(modified
);
8470 modified
.subtract(overlaps
);
8471 ctx
->modified_ranges
.union_of(modified
);
8474 // Adjust the cached objectcontext
8475 maybe_create_new_object(ctx
, true);
// Account the size change and mark the whole larger extent dirty.
8476 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8477 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
8478 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, rollback_to
->obs
.oi
.size
));
8479 ctx
->clean_regions
.mark_omap_dirty();
8480 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
// Digests are only valid if the clone carried them; otherwise clear.
8481 if (rollback_to
->obs
.oi
.is_data_digest())
8482 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
8484 obs
.oi
.clear_data_digest();
8485 if (rollback_to
->obs
.oi
.is_omap_digest())
8486 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
8488 obs
.oi
.clear_omap_digest();
// Chunked-manifest clones: the head inherits the clone's chunk map.
8490 if (rollback_to
->obs
.oi
.has_manifest() && rollback_to
->obs
.oi
.manifest
.is_chunked()) {
8491 obs
.oi
.set_flag(object_info_t::FLAG_MANIFEST
);
8492 obs
.oi
.manifest
.type
= rollback_to
->obs
.oi
.manifest
.type
;
8493 obs
.oi
.manifest
.chunk_map
= rollback_to
->obs
.oi
.manifest
.chunk_map
;
8494 ctx
->cache_operation
= true;
8495 ctx
->delta_stats
.num_objects_manifest
++;
// Propagate (or clear) the OMAP flag to match the rollback target.
8498 if (rollback_to
->obs
.oi
.is_omap()) {
8499 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8500 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8502 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8503 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
// Record a clone of 'head' into 'coid' in the transaction and persist the
// clone's object_info_t (encoded with OSD feature bits) as its OI attr,
// removing any stale snapset attr on the clone.
// NOTE(review): several parameters (ctx, transaction t, poi) are elided
// from this excerpt — confirm the full signature in the original source.
8507 void PrimaryLogPG::_make_clone(
8510 ObjectContextRef clone_obc
,
8511 const hobject_t
& head
, const hobject_t
& coid
,
// Encode the clone's object info for storage as OI_ATTR.
8515 encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8517 t
->clone(coid
, head
);
8518 setattr_maybe_cache(clone_obc
, t
, OI_ATTR
, bv
);
// Clones never carry their own snapset attribute.
8519 rmattr_maybe_cache(clone_obc
, t
, SS_ATTR
);
// Prepare the head object for a write under the op's SnapContext: maintain
// DIRTY/OMAP accounting, create a clone of the head if there are newer snaps
// than the snapset has seen, update clone_overlap/clone_size/clone_snaps and
// the pg log, and advance the snapset seq.
// NOTE(review): excerpt is elided (original line numbers jump); some loop
// bodies, braces and else-branches are not visible here.
8522 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
8524 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8525 SnapContext
& snapc
= ctx
->snapc
;
// Only head objects are made writeable; clones are never written directly.
8528 ceph_assert(soid
.snap
== CEPH_NOSNAP
);
8529 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
8530 << " snapc=" << snapc
<< dendl
;
// --- cache-tier DIRTY flag accounting ---
8532 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
8533 if (ctx
->new_obs
.exists
) {
8534 // we will mark the object dirty
8535 if (ctx
->undirty
&& was_dirty
) {
8536 dout(20) << " clearing DIRTY flag" << dendl
;
8537 ceph_assert(ctx
->new_obs
.oi
.is_dirty());
8538 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8539 --ctx
->delta_stats
.num_objects_dirty
;
8540 osd
->logger
->inc(l_osd_tier_clean
);
8541 } else if (!was_dirty
&& !ctx
->undirty
) {
8542 dout(20) << " setting DIRTY flag" << dendl
;
8543 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
8544 ++ctx
->delta_stats
.num_objects_dirty
;
8545 osd
->logger
->inc(l_osd_tier_dirty
);
8549 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
8550 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8551 --ctx
->delta_stats
.num_objects_dirty
;
// --- num_objects_omap accounting: object gained or lost omap data ---
8555 if ((ctx
->new_obs
.exists
&&
8556 ctx
->new_obs
.oi
.is_omap()) &&
8557 (!ctx
->obc
->obs
.exists
||
8558 !ctx
->obc
->obs
.oi
.is_omap())) {
8559 ++ctx
->delta_stats
.num_objects_omap
;
8561 if ((!ctx
->new_obs
.exists
||
8562 !ctx
->new_obs
.oi
.is_omap()) &&
8563 (ctx
->obc
->obs
.exists
&&
8564 ctx
->obc
->obs
.oi
.is_omap())) {
8565 --ctx
->delta_stats
.num_objects_omap
;
8568 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
8569 dout(10) << " op snapset is old" << dendl
;
// Clone is needed when the head exists (and isn't a whiteout), the snap
// context carries snaps, this isn't a cache-internal op, and the newest
// snap is newer than anything the snapset has recorded.
8572 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
8573 snapc
.snaps
.size() && // there are snaps
8574 !ctx
->cache_operation
&&
8575 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
8577 hobject_t coid
= soid
;
8578 coid
.snap
= snapc
.seq
;
// Collect the prefix of snapc.snaps newer than the snapset seq; those are
// the snaps this new clone will represent.
8580 const auto snaps
= [&] {
8581 auto last
= find_if_not(
8582 begin(snapc
.snaps
), end(snapc
.snaps
),
8583 [&](snapid_t snap_id
) { return snap_id
> ctx
->new_snapset
.seq
; });
8584 return vector
<snapid_t
>{begin(snapc
.snaps
), last
};
8588 object_info_t
static_snap_oi(coid
);
8589 object_info_t
*snap_oi
;
// Build a clone obc mirroring the head's state and snapset context.
8591 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
8592 ctx
->clone_obc
->destructor_callback
=
8593 new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
8594 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
8595 ctx
->clone_obc
->obs
.exists
= true;
8596 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
8597 ctx
->clone_obc
->ssc
->ref
++;
8598 if (pool
.info
.is_erasure())
8599 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
8600 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
// Manifest heads: the clone inherits the redirect target or chunk map.
8601 if (ctx
->obc
->obs
.oi
.has_manifest()) {
8602 if ((ctx
->obc
->obs
.oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
) &&
8603 ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
8604 snap_oi
->set_flag(object_info_t::FLAG_MANIFEST
);
8605 snap_oi
->manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
8606 snap_oi
->manifest
.redirect_target
= ctx
->obc
->obs
.oi
.manifest
.redirect_target
;
8607 } else if (ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
8608 snap_oi
->set_flag(object_info_t::FLAG_MANIFEST
);
8609 snap_oi
->manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
8610 snap_oi
->manifest
.chunk_map
= ctx
->obc
->obs
.oi
.manifest
.chunk_map
;
8612 ceph_abort_msg("unrecognized manifest type");
// Take a greedy write lock on the clone obc for this op.
8615 bool got
= ctx
->lock_manager
.get_write_greedy(
8620 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
8622 snap_oi
= &static_snap_oi
;
// Version/user-bits for the clone come from the pre-op head state.
8624 snap_oi
->version
= ctx
->at_version
;
8625 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
8626 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
8628 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
// Stat deltas for the newly created clone object.
8630 ctx
->delta_stats
.num_objects
++;
8631 if (snap_oi
->is_dirty()) {
8632 ctx
->delta_stats
.num_objects_dirty
++;
8633 osd
->logger
->inc(l_osd_tier_dirty
);
8635 if (snap_oi
->is_omap())
8636 ctx
->delta_stats
.num_objects_omap
++;
8637 if (snap_oi
->is_cache_pinned())
8638 ctx
->delta_stats
.num_objects_pinned
++;
8639 if (snap_oi
->has_manifest())
8640 ctx
->delta_stats
.num_objects_manifest
++;
8641 ctx
->delta_stats
.num_object_clones
++;
// Register the clone in the snapset.
8642 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
8643 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
8644 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
8646 // clone_overlap should contain an entry for each clone
8647 // (an empty interval_set if there is no overlap)
8648 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
8649 if (ctx
->obs
->oi
.size
) {
8650 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
8654 dout(10) << " cloning v " << ctx
->obs
->oi
.version
8655 << " to " << coid
<< " v " << ctx
->at_version
8656 << " snaps=" << snaps
8657 << " snapset=" << ctx
->new_snapset
<< dendl
;
// Log the CLONE and consume a version number for it.
8658 ctx
->log
.push_back(pg_log_entry_t(
8659 pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
8660 ctx
->obs
->oi
.version
,
8661 ctx
->obs
->oi
.user_version
,
8662 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
8663 encode(snaps
, ctx
->log
.back().snaps
);
8665 ctx
->at_version
.version
++;
8668 // update most recent clone_overlap and usage stats
8669 if (ctx
->new_snapset
.clones
.size() > 0) {
8670 // the clone_overlap is difference of range between head and clones.
8671 // we need to check whether the most recent clone exists, if it's
8672 // been evicted, it's not included in the stats, but the clone_overlap
8673 // is still exist in the snapset, so we should update the
8674 // clone_overlap to make it sense.
8675 hobject_t last_clone_oid
= soid
;
8676 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
8677 interval_set
<uint64_t> &newest_overlap
=
8678 ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
8679 ctx
->modified_ranges
.intersection_of(newest_overlap
);
8680 if (is_present_clone(last_clone_oid
)) {
8681 // modified_ranges is still in use by the clone
8682 ctx
->delta_stats
.num_bytes
+= ctx
->modified_ranges
.size();
8684 newest_overlap
.subtract(ctx
->modified_ranges
);
8687 if (snapc
.seq
> ctx
->new_snapset
.seq
) {
8688 // update snapset with latest snap context
8689 ctx
->new_snapset
.seq
= snapc
.seq
;
// Pre-octopus OSDs still persist the snaps list in the snapset.
8690 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
8691 ctx
->new_snapset
.snaps
= snapc
.snaps
;
8693 ctx
->new_snapset
.snaps
.clear();
8696 dout(20) << "make_writeable " << soid
8697 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
// Fold a write of [offset, offset+length) into the modified-range set and
// update byte/write-op stats; grows the accounted size when the write
// extends past the current object size.
// NOTE(review): the conditions guarding the two ch.insert() calls and the
// size-change branch are partially elided in this excerpt (write_full vs
// extending write) — confirm against the full source.
8701 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
8702 interval_set
<uint64_t>& modified
, uint64_t offset
,
8703 uint64_t length
, bool write_full
)
8705 interval_set
<uint64_t> ch
;
8708 ch
.insert(0, oi
.size
);
8710 ch
.insert(offset
, length
);
8711 modified
.union_of(ch
);
// Extending write: swap old size for the new end offset in num_bytes.
8713 (offset
+ length
> oi
.size
&& length
)) {
8714 uint64_t new_size
= offset
+ length
;
8715 delta_stats
.num_bytes
-= oi
.size
;
8716 delta_stats
.num_bytes
+= new_size
;
// Every call counts one write op and its KiB (rounded up).
8720 delta_stats
.num_wr
++;
8721 delta_stats
.num_wr_kb
+= shift_round_up(length
, 10);
// Update byte accounting and the object's recorded size for a truncate.
// No-op when the size is unchanged.
// NOTE(review): the object_info_t& oi parameter is elided from this
// excerpt (it is referenced in the body) — confirm the full signature.
8724 void PrimaryLogPG::truncate_update_size_and_usage(
8725 object_stat_sum_t
& delta_stats
,
8727 uint64_t truncate_size
)
8729 if (oi
.size
!= truncate_size
) {
8730 delta_stats
.num_bytes
-= oi
.size
;
8731 delta_stats
.num_bytes
+= truncate_size
;
8732 oi
.size
= truncate_size
;
// Remove each watcher listed in 'to_disconnect' from the object context
// and tear down the Watch (optionally sending a disconnect notification).
// Watchers no longer present are only logged.
8736 void PrimaryLogPG::complete_disconnect_watches(
8737 ObjectContextRef obc
,
8738 const list
<watch_disconnect_t
> &to_disconnect
)
8740 for (list
<watch_disconnect_t
>::const_iterator i
=
8741 to_disconnect
.begin();
8742 i
!= to_disconnect
.end();
// Watchers are keyed by (cookie, entity name).
8744 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
8745 auto watchers_entry
= obc
->watchers
.find(watcher
);
8746 if (watchers_entry
!= obc
->watchers
.end()) {
8747 WatchRef watch
= watchers_entry
->second
;
8748 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
8749 obc
->watchers
.erase(watcher
);
8750 watch
->remove(i
->send_disconnect
);
8752 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8753 << watcher
<< dendl
;
// Apply the watch/notify side effects accumulated on the op context after
// the op committed: process disconnects, new watch connects, notifies to
// all watchers, and notify-acks addressed to this entity's watches.
// NOTE(review): excerpt is elided; loop increments, the notif construction
// arguments and some braces are not visible here.
8758 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
8760 entity_name_t entity
= ctx
->reqid
.name
;
8761 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
8763 // disconnects first
8764 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
8768 auto session
= conn
->get_priv();
// --- watch connects: reuse an existing Watch or create a new one ---
8772 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
8773 i
!= ctx
->watch_connects
.end();
8775 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
8776 dout(15) << "do_osd_op_effects applying watch connect on session "
8777 << session
.get() << " watcher " << watcher
<< dendl
;
8779 if (ctx
->obc
->watchers
.count(watcher
)) {
8780 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8782 watch
= ctx
->obc
->watchers
[watcher
];
8784 dout(15) << "do_osd_op_effects new watcher " << watcher
8786 watch
= Watch::makeWatchRef(
8787 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
8788 i
->first
.cookie
, entity
, conn
->get_peer_addr());
8789 ctx
->obc
->watchers
.insert(
8794 watch
->connect(conn
, i
->second
);
// --- notifies: start a Notify on every current watcher of the object ---
8797 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
8798 p
!= ctx
->notifies
.end();
8800 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
8801 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
8803 Notify::makeNotifyRef(
8805 ctx
->reqid
.name
.num(),
8810 ctx
->obc
->obs
.oi
.user_version
,
8812 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8813 ctx
->obc
->watchers
.begin();
8814 i
!= ctx
->obc
->watchers
.end();
8816 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
8817 i
->second
->start_notify(notif
);
// --- notify acks: deliver to this entity's matching watches ---
8822 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
8823 p
!= ctx
->notify_acks
.end();
8825 if (p
->watch_cookie
)
8826 dout(10) << "notify_ack " << make_pair(*(p
->watch_cookie
), p
->notify_id
) << dendl
;
8828 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
8829 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8830 ctx
->obc
->watchers
.begin();
8831 i
!= ctx
->obc
->watchers
.end();
// Ack only watches owned by this entity, and only the cookie the ack
// names (an absent cookie acks all of the entity's watches).
8833 if (i
->first
.second
!= entity
) continue;
8834 if (p
->watch_cookie
&&
8835 *(p
->watch_cookie
) != i
->first
.first
) continue;
8836 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
8837 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
// Build a unique temp object name for 'target' from the pgid, role, the
// monitor-assigned global id, and a per-PG sequence counter.
// NOTE(review): the declaration of the stream 'ss' is elided here.
8842 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
8845 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
8846 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
8847 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8848 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Build a deterministic temp object name used while recovering 'target';
// uniqueness comes from pgid + version + interval + snapid (see comment
// below). Additional parameters (e.g. the version) are elided from view.
8852 hobject_t
PrimaryLogPG::get_temp_recovery_object(
8853 const hobject_t
& target
,
8857 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
8859 << "_" << info
.history
.same_interval_since
8860 << "_" << target
.snap
;
8861 // pgid + version + interval + snapid is unique, and short
8862 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8863 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Validate and execute the ops on the context, enforce pool-full policy,
// clone the head if required (make_writeable) and queue the resulting
// MODIFY/DELETE log entry. Returns an error code or the do_osd_ops result.
// NOTE(review): excerpt is elided; several early-return paths and the
// finish_ctx call that consumes the MODIFY/DELETE choice are not visible.
8867 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
8869 ceph_assert(!ctx
->ops
->empty());
8871 // valid snap context?
8872 if (!ctx
->snapc
.is_valid()) {
8873 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
8877 // prepare the actual mutation
8878 int result
= do_osd_ops(ctx
, *ctx
->ops
);
// Since kraken, failed writes are still logged (update_log_only) so
// duplicate ops can be detected later.
8880 if (ctx
->op
->may_write() &&
8881 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8882 // need to save the error code in the pg log, to detect dup ops,
8883 // but do nothing else
8884 ctx
->update_log_only
= true;
8889 // read-op? write-op noop? done?
8890 if (ctx
->op_t
->empty() && !ctx
->modify
) {
8891 if (ctx
->pending_async_reads
.empty())
8892 unstable_stats
.add(ctx
->delta_stats
);
8893 if (ctx
->op
->may_write() &&
8894 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8895 ctx
->update_log_only
= true;
// Pool-full policy: MDS and FULL_FORCE ops proceed, FULL_TRY gets an
// explicit error, anything else is dropped as a misbehaving client.
8901 if ((ctx
->delta_stats
.num_bytes
> 0 ||
8902 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
8903 pool
.info
.has_flag(pg_pool_t::FLAG_FULL
)) {
8904 auto m
= ctx
->op
->get_req
<MOSDOp
>();
8905 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
8906 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
8907 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
8909 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
8910 // they tried, they failed.
8911 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
8912 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
) ? -EDQUOT
: -ENOSPC
;
8915 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
8920 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8921 // clone, if necessary
8922 if (soid
.snap
== CEPH_NOSNAP
)
8923 make_writeable(ctx
);
// Log entry type depends on whether the object survives the op.
8926 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
8927 pg_log_entry_t::DELETE
,
// Finalize a mutation: manifest reference bookkeeping for dirtied chunks,
// user_version advancement, object_info/snapset attribute encoding, pg log
// entry construction (including returnvec and clean_regions), and applying
// the new object state to the cached obc/ssc.
// NOTE(review): excerpt is elided; some branches (e.g. the !new_obs.exists
// path, log push_back wrapper) are not fully visible here.
8933 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, int result
)
8935 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8936 dout(20) << __func__
<< " " << soid
<< " " << ctx
8937 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
8939 utime_t now
= ceph_clock_now();
8942 // Drop the reference if deduped chunk is modified
8943 if (ctx
->new_obs
.oi
.is_dirty() &&
8944 (ctx
->obs
->oi
.has_manifest() && ctx
->obs
->oi
.manifest
.is_chunked()) &&
8945 !ctx
->cache_operation
&&
8946 log_op_type
!= pg_log_entry_t::PROMOTE
) {
8947 update_chunk_map_by_dirty(ctx
);
8948 // If a clone is creating, ignore dropping the reference for manifest object
8949 if (!ctx
->delta_stats
.num_object_clones
) {
8950 dec_refcount_by_dirty(ctx
);
8954 // finish and log the op.
8955 if (ctx
->user_modify
) {
8956 // update the user_version for any modify ops, except for the watch op
8957 ctx
->user_at_version
= std::max(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
8958 /* In order for new clients and old clients to interoperate properly
8959 * when exchanging versions, we need to lower bound the user_version
8960 * (which our new clients pay proper attention to)
8961 * by the at_version (which is all the old clients can ever see). */
8962 if (ctx
->at_version
.version
> ctx
->user_at_version
)
8963 ctx
->user_at_version
= ctx
->at_version
.version
;
8964 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
8966 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
// If the object still exists, stamp versions/reqid/mtime on its oi and
// persist the encoded oi (and, for heads, the snapset) as attrs.
8968 if (ctx
->new_obs
.exists
) {
8969 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
8970 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
8971 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
8972 if (ctx
->mtime
!= utime_t()) {
8973 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
8974 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8975 ctx
->new_obs
.oi
.local_mtime
= now
;
8977 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8981 map
<string
, bufferlist
, less
<>> attrs
;
8982 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
8983 encode(ctx
->new_obs
.oi
, bv
,
8984 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8985 attrs
[OI_ATTR
] = std::move(bv
);
8988 if (soid
.snap
== CEPH_NOSNAP
) {
8989 dout(10) << " final snapset " << ctx
->new_snapset
8990 << " in " << soid
<< dendl
;
8992 encode(ctx
->new_snapset
, bss
);
8993 attrs
[SS_ATTR
] = std::move(bss
);
8995 dout(10) << " no snapset (this is a clone)" << dendl
;
8997 ctx
->op_t
->setattrs(soid
, attrs
);
// Object deleted: reset the cached oi to a blank one for this soid.
9000 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
// Append the log entry; the per-op result is recorded only for clients
// that understand return vectors.
9005 pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
9006 ctx
->obs
->oi
.version
,
9007 ctx
->user_at_version
, ctx
->reqid
,
9009 (ctx
->op
&& ctx
->op
->allows_returnvec()) ? result
: 0));
9010 if (ctx
->op
&& ctx
->op
->allows_returnvec()) {
9011 // also the per-op values
9012 ctx
->log
.back().set_op_returns(*ctx
->ops
);
9013 dout(20) << __func__
<< " op_returns " << ctx
->log
.back().op_returns
9017 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
9018 dout(20) << __func__
<< " object " << soid
<< " marks clean_regions " << ctx
->log
.back().clean_regions
<< dendl
;
// Clone-targeted modify-type ops also carry the clone's snaps in the log.
9020 if (soid
.snap
< CEPH_NOSNAP
) {
9021 switch (log_op_type
) {
9022 case pg_log_entry_t::MODIFY
:
9023 case pg_log_entry_t::PROMOTE
:
9024 case pg_log_entry_t::CLEAN
:
9025 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
9027 encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
9034 if (!ctx
->extra_reqids
.empty()) {
9035 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< " "
9036 << ctx
->extra_reqid_return_codes
<< dendl
;
9037 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
9038 ctx
->log
.back().extra_reqid_return_codes
.swap(ctx
->extra_reqid_return_codes
);
9041 // apply new object state.
9042 ctx
->obc
->obs
= ctx
->new_obs
;
9044 if (soid
.is_head() && !ctx
->obc
->obs
.exists
) {
9045 ctx
->obc
->ssc
->exists
= false;
9046 ctx
->obc
->ssc
->snapset
= SnapSet();
9048 ctx
->obc
->ssc
->exists
= true;
9049 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
// Apply an op's stat delta to recovery state, queue it for backfill
// targets whose last_backfill hasn't yet passed this object, and inform
// the scrubber.
9053 void PrimaryLogPG::apply_stats(
9054 const hobject_t
&soid
,
9055 const object_stat_sum_t
&delta_stats
) {
9057 recovery_state
.apply_op_stats(soid
, delta_stats
);
9058 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
9059 i
!= get_backfill_targets().end();
9062 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
// Only objects past the peer's backfill horizon but already started
// locally need their stats replayed to that peer.
9063 if (soid
> pinfo
.last_backfill
&& soid
<= last_backfill_started
) {
9064 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
9068 m_scrubber
->stats_of_handled_objects(delta_stats
, soid
);
// Finish a read-only op: aggregate per-op results/bytes, fill in reply
// versions (normal, by-user-version, or ENOENT floor), and send the
// MOSDOpReply back to the client.
// NOTE(review): excerpt is elided; some branch conditions around the
// version-reply choice are not visible here.
9071 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
9073 auto m
= ctx
->op
->get_req
<MOSDOp
>();
9074 ceph_assert(ctx
->async_reads_complete());
// Tally bytes read; stop at the first per-op failure not marked FAILOK.
9076 for (auto p
= ctx
->ops
->begin();
9077 p
!= ctx
->ops
->end() && result
>= 0; ++p
) {
9078 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
9082 ctx
->bytes_read
+= p
->outdata
.length();
9084 ctx
->reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
// Take ownership of the reply out of the context before sending.
9086 MOSDOpReply
*reply
= ctx
->reply
;
9087 ctx
->reply
= nullptr;
9090 if (!ctx
->ignore_log_op_stats
) {
9091 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
9093 publish_stats_to_osd();
9096 // on read, return the current object version
9098 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
9100 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
9102 } else if (result
== -ENOENT
) {
9103 // on ENOENT, set a floor for what the next user version will be.
9104 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
9107 reply
->set_result(result
);
9108 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
9109 osd
->send_message_osd_client(reply
, m
->get_connection());
9113 // ========================================================================
// Objecter completion for one copy-from chunk: under the PG lock, forwards
// to process_copy_chunk() unless the op was canceled or the PG has gone
// through a peering reset since the read was issued.
9116 struct C_Copyfrom
: public Context
{
9119 epoch_t last_peering_reset
;
9121 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
9122 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
9123 const PrimaryLogPG::CopyOpRef
& c
)
9124 : pg(p
), oid(o
), last_peering_reset(lpr
),
9127 void finish(int r
) override
{
9128 if (r
== -ECANCELED
)
9130 std::scoped_lock l
{*pg
};
// Stale completions (from before a peering reset) are dropped.
9131 if (last_peering_reset
== pg
->get_last_peering_reset()) {
9132 pg
->process_copy_chunk(oid
, tid
, r
);
// Async-read completion used on EC pools by do_copy_get(): trims the read
// buffer to the actual length and encodes the accumulated copy reply into
// the OSDOp's outdata with the client's feature bits.
9138 struct C_CopyFrom_AsyncReadCb
: public Context
{
9140 object_copy_data_t reply_obj
;
9143 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
9144 osd_op(osd_op
), features(features
), len(0) {}
9145 void finish(int r
) override
{
9151 ceph_assert(len
> 0);
9152 ceph_assert(len
<= reply_obj
.data
.length());
// Keep only the first 'len' bytes of the read data.
9154 bl
.substr_of(reply_obj
.data
, 0, len
);
9155 reply_obj
.data
.swap(bl
);
9156 encode(reply_obj
, osd_op
->outdata
, features
);
// Objecter completion for one manifest (chunked) copy read: like
// C_Copyfrom but carries the chunk offset and dispatches to
// process_copy_chunk_manifest() under the PG lock.
9160 struct C_CopyChunk
: public Context
{
9163 epoch_t last_peering_reset
;
9165 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
9166 uint64_t offset
= 0;
9167 C_CopyChunk(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
9168 const PrimaryLogPG::CopyOpRef
& c
)
9169 : pg(p
), oid(o
), last_peering_reset(lpr
),
9172 void finish(int r
) override
{
9173 if (r
== -ECANCELED
)
9175 std::scoped_lock l
{*pg
};
// Drop completions that predate the most recent peering reset.
9176 if (last_peering_reset
== pg
->get_last_peering_reset()) {
9177 pg
->process_copy_chunk_manifest(oid
, tid
, r
, offset
);
// Serve one COPY_GET step: fill an object_copy_data_t with metadata,
// attrs, up to 'out_max' bytes of data, then omap header/keys, advancing
// the client-supplied cursor; reqids are included only once complete.
// On EC pools the data read is asynchronous and finished by
// C_CopyFrom_AsyncReadCb. Returns -EINPROGRESS for pending async reads.
// NOTE(review): excerpt is elided; cursor decode, some error paths and
// loop/brace structure are not fully visible here.
9183 int PrimaryLogPG::do_copy_get(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
9184 OSDOp
& osd_op
, ObjectContextRef
&obc
)
9186 object_info_t
& oi
= obc
->obs
.oi
;
9187 hobject_t
& soid
= oi
.soid
;
9189 object_copy_cursor_t cursor
;
9193 decode(out_max
, bp
);
9195 catch (ceph::buffer::error
& e
) {
9200 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
9201 uint64_t features
= op
->get_features();
9203 bool async_read_started
= false;
9204 object_copy_data_t _reply_obj
;
9205 C_CopyFrom_AsyncReadCb
*cb
= nullptr;
// EC pools read asynchronously; the callback owns the reply object then.
9206 if (pool
.info
.is_erasure()) {
9207 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
9209 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
// --- base metadata: size, mtime, snaps, digests, truncate info ---
9211 reply_obj
.size
= oi
.size
;
9212 reply_obj
.mtime
= oi
.mtime
;
9213 ceph_assert(obc
->ssc
);
9214 if (soid
.snap
< CEPH_NOSNAP
) {
9215 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
9216 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
9217 reply_obj
.snaps
= p
->second
;
9219 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
9221 if (oi
.is_data_digest()) {
9222 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
9223 reply_obj
.data_digest
= oi
.data_digest
;
9225 if (oi
.is_omap_digest()) {
9226 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
9227 reply_obj
.omap_digest
= oi
.omap_digest
;
9229 reply_obj
.truncate_seq
= oi
.truncate_seq
;
9230 reply_obj
.truncate_size
= oi
.truncate_size
;
// --- attrs (first cursor stage) ---
9233 map
<string
,bufferlist
,less
<>>& out_attrs
= reply_obj
.attrs
;
9234 if (!cursor
.attr_complete
) {
9235 result
= getattrs_maybe_cache(
9244 cursor
.attr_complete
= true;
9245 dout(20) << " got attrs" << dendl
;
9248 int64_t left
= out_max
- osd_op
.outdata
.length();
// --- data (second cursor stage), bounded by remaining budget ---
9251 bufferlist
& bl
= reply_obj
.data
;
9252 if (left
> 0 && !cursor
.data_complete
) {
9253 if (cursor
.data_offset
< oi
.size
) {
9254 uint64_t max_read
= std::min(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
9256 async_read_started
= true;
9257 ctx
->pending_async_reads
.push_back(
9259 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
9260 make_pair(&bl
, cb
)));
9263 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
9264 new ReadFinisher(osd_op
));
9265 result
= -EINPROGRESS
;
9267 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
// Replicated pools read synchronously through the backend.
9269 result
= pgbackend
->objects_read_sync(
9270 oi
.soid
, cursor
.data_offset
, max_read
, osd_op
.op
.flags
, &bl
);
9275 cursor
.data_offset
+= max_read
;
9277 if (cursor
.data_offset
== oi
.size
) {
9278 cursor
.data_complete
= true;
9279 dout(20) << " got data" << dendl
;
9281 ceph_assert(cursor
.data_offset
<= oi
.size
);
// --- omap (third cursor stage) ---
9285 uint32_t omap_keys
= 0;
9286 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
9287 cursor
.omap_complete
= true;
9289 if (left
> 0 && !cursor
.omap_complete
) {
9290 ceph_assert(cursor
.data_complete
);
9291 if (cursor
.omap_offset
.empty()) {
9292 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
9293 &reply_obj
.omap_header
);
9295 bufferlist omap_data
;
9296 ObjectMap::ObjectMapIterator iter
=
9297 osd
->store
->get_omap_iterator(ch
, ghobject_t(oi
.soid
));
9299 iter
->upper_bound(cursor
.omap_offset
);
9300 for (; iter
->valid(); iter
->next()) {
9302 encode(iter
->key(), omap_data
);
9303 encode(iter
->value(), omap_data
);
// Budget accounting approximates encoded key/value framing overhead.
9304 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
9309 encode(omap_keys
, reply_obj
.omap_data
);
9310 reply_obj
.omap_data
.claim_append(omap_data
);
// Resume point for the next COPY_GET round, or mark omap done.
9312 if (iter
->valid()) {
9313 cursor
.omap_offset
= iter
->key();
9315 cursor
.omap_complete
= true;
9316 dout(20) << " got omap" << dendl
;
9321 if (cursor
.is_complete()) {
9322 // include reqids only in the final step. this is a bit fragile
9324 recovery_state
.get_pg_log().get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10,
9326 &reply_obj
.reqid_return_codes
);
9327 dout(20) << " got reqids" << dendl
;
9330 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
9331 << " " << out_attrs
.size() << " attrs"
9332 << " " << bl
.length() << " bytes"
9333 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
9334 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
9335 << omap_keys
<< " keys"
9336 << " " << reply_obj
.reqids
.size() << " reqids"
9338 reply_obj
.cursor
= cursor
;
// Synchronous path encodes the reply now; async encodes in the callback.
9339 if (!async_read_started
) {
9340 encode(reply_obj
, osd_op
.outdata
, features
);
9342 if (cb
&& !async_read_started
) {
// Reply to a COPY_GET on a nonexistent object: encode an (empty) copy
// payload carrying only the object's recent reqids from the pg log so the
// copier can still detect dup ops, then send an -ENOENT reply.
// NOTE(review): the OSDOp& osd_op parameter is elided from this excerpt.
9352 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
9355 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
9356 uint64_t features
= m
->get_features();
9357 object_copy_data_t reply_obj
;
9359 recovery_state
.get_pg_log().get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
,
9360 &reply_obj
.reqid_return_codes
);
9361 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
9362 encode(reply_obj
, osd_op
.outdata
, features
);
9363 osd_op
.rval
= -ENOENT
;
9364 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
9365 reply
->set_result(-ENOENT
);
9366 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
9367 osd
->send_message_osd_client(reply
, m
->get_connection());
// Begin copying 'src' into the object described by 'obc': cancel any
// in-flight copy to the same destination, register a new CopyOp, and kick
// off the first chunk (plain, redirect-manifest, or chunked-manifest).
9370 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
9371 hobject_t src
, object_locator_t oloc
,
9372 version_t version
, unsigned flags
,
9373 bool mirror_snapset
,
9374 unsigned src_obj_fadvise_flags
,
9375 unsigned dest_obj_fadvise_flags
)
9377 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
9378 dout(10) << __func__
<< " " << dest
9379 << " from " << src
<< " " << oloc
<< " v" << version
9380 << " flags " << flags
9381 << (mirror_snapset
? " mirror_snapset" : "")
// Mirroring the snapset only makes sense when copying a head object.
9384 ceph_assert(!mirror_snapset
|| src
.snap
== CEPH_NOSNAP
);
9386 // cancel a previous in-progress copy?
9387 if (copy_ops
.count(dest
)) {
9388 // FIXME: if the src etc match, we could avoid restarting from the
9390 CopyOpRef cop
= copy_ops
[dest
];
9391 vector
<ceph_tid_t
> tids
;
9392 cancel_copy(cop
, false, &tids
);
9393 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9396 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
9397 mirror_snapset
, src_obj_fadvise_flags
,
9398 dest_obj_fadvise_flags
));
9399 copy_ops
[dest
] = cop
;
9400 dout(20) << fmt::format("{}: blocking {}", __func__
, dest
) << dendl
;
// Dispatch on the source's manifest type.
9403 if (!obc
->obs
.oi
.has_manifest()) {
9404 _copy_some(obc
, cop
);
9406 if (obc
->obs
.oi
.manifest
.is_redirect()) {
9407 _copy_some(obc
, cop
);
9408 } else if (obc
->obs
.oi
.manifest
.is_chunked()) {
9409 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
9410 _copy_some_manifest(obc
, cop
, p
->first
);
9412 ceph_abort_msg("unrecognized manifest type");
// Issue the next objecter read for a (non-chunked) copy: translate the
// COPY_FROM flags into OSD op flags, optionally list_snaps on the first
// round when mirroring the snapset, then send a copy_get for the next
// chunk, finishing into C_Copyfrom.
// NOTE(review): excerpt is elided; the ObjectOperation declaration and
// some copy_get arguments/read arguments are not visible here.
9417 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
9419 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
// Map CEPH_OSD_COPY_FROM_FLAG_* onto the objecter read flags.
9422 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9423 flags
|= CEPH_OSD_FLAG_FLUSH
;
9424 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9425 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9426 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9427 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9428 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9429 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9430 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9431 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9433 C_GatherBuilder
gather(cct
);
// First round with mirror_snapset: fetch the source's snapset too.
9435 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
9437 ceph_assert(cop
->src
.snap
== CEPH_NOSNAP
);
9439 op
.list_snaps(&cop
->results
.snapset
, NULL
);
9440 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9442 flags
, gather
.new_sub(), NULL
);
9443 cop
->objecter_tid2
= tid
;
// After the first chunk we know the source version; pin it so the copy
// aborts if the source changes mid-copy.
9447 if (cop
->results
.user_version
) {
9448 op
.assert_version(cop
->results
.user_version
);
9450 // we should learn the version after the first chunk, if we didn't know
9452 ceph_assert(cop
->cursor
.is_initial());
9454 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
9455 &cop
->results
.object_size
, &cop
->results
.mtime
,
9456 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
9457 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
9458 &cop
->results
.flags
,
9459 &cop
->results
.source_data_digest
,
9460 &cop
->results
.source_omap_digest
,
9461 &cop
->results
.reqids
,
9462 &cop
->results
.reqid_return_codes
,
9463 &cop
->results
.truncate_seq
,
9464 &cop
->results
.truncate_size
,
9466 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
// Completion runs C_Copyfrom on the objecter finisher for this shard.
9468 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
9469 get_last_peering_reset(), cop
);
9470 gather
.set_finisher(new C_OnFinisher(fin
,
9471 osd
->get_objecter_finisher(get_pg_shard())));
9473 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9474 cop
->src
.snap
, NULL
,
9477 // discover the object version if we don't know it yet
9478 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
9480 cop
->objecter_tid
= tid
;
9484 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc
, CopyOpRef cop
, uint64_t start_offset
)
9486 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9489 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9490 flags
|= CEPH_OSD_FLAG_FLUSH
;
9491 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9492 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9493 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9494 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9495 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9496 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9497 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9498 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9501 uint64_t last_offset
= 0, chunks_size
= 0;
9502 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
9503 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
->chunk_map
.find(start_offset
);
9504 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9506 chunks_size
+= iter
->second
.length
;
9507 last_offset
= iter
->first
;
9508 if (get_copy_chunk_size() < chunks_size
) {
9513 cop
->num_chunk
= num_chunks
;
9514 cop
->start_offset
= start_offset
;
9515 cop
->last_offset
= last_offset
;
9516 dout(20) << __func__
<< " oid " << obc
->obs
.oi
.soid
<< " num_chunks: " << num_chunks
9517 << " start_offset: " << start_offset
<< " chunks_size: " << chunks_size
9518 << " last_offset: " << last_offset
<< dendl
;
9520 iter
= manifest
->chunk_map
.find(start_offset
);
9521 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9522 uint64_t obj_offset
= iter
->first
;
9523 uint64_t length
= manifest
->chunk_map
[iter
->first
].length
;
9524 hobject_t soid
= manifest
->chunk_map
[iter
->first
].oid
;
9525 object_locator_t
oloc(soid
);
9526 CopyCallback
* cb
= NULL
;
9527 CopyOpRef
sub_cop(std::make_shared
<CopyOp
>(cb
, ObjectContextRef(), cop
->src
, oloc
,
9528 cop
->results
.user_version
, cop
->flags
, cop
->mirror_snapset
,
9529 cop
->src_obj_fadvise_flags
, cop
->dest_obj_fadvise_flags
));
9530 sub_cop
->cursor
.data_offset
= obj_offset
;
9531 cop
->chunk_cops
[obj_offset
] = sub_cop
;
9533 int s
= sub_cop
->chunk_ops
.size();
9534 sub_cop
->chunk_ops
.resize(s
+1);
9535 sub_cop
->chunk_ops
[s
].op
.op
= CEPH_OSD_OP_READ
;
9536 sub_cop
->chunk_ops
[s
].op
.extent
.offset
= manifest
->chunk_map
[iter
->first
].offset
;
9537 sub_cop
->chunk_ops
[s
].op
.extent
.length
= length
;
9540 op
.dup(sub_cop
->chunk_ops
);
9542 if (cop
->results
.user_version
) {
9543 op
.assert_version(cop
->results
.user_version
);
9545 // we should learn the version after the first chunk, if we didn't know
9547 ceph_assert(cop
->cursor
.is_initial());
9549 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9551 C_CopyChunk
*fin
= new C_CopyChunk(this, obc
->obs
.oi
.soid
,
9552 get_last_peering_reset(), cop
);
9553 fin
->offset
= obj_offset
;
9555 ceph_tid_t tid
= osd
->objecter
->read(
9557 sub_cop
->src
.snap
, NULL
,
9559 new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
9560 // discover the object version if we don't know it yet
9561 sub_cop
->results
.user_version
? NULL
: &sub_cop
->results
.user_version
);
9563 sub_cop
->objecter_tid
= tid
;
9565 dout(20) << __func__
<< " tgt_oid: " << soid
.oid
<< " tgt_offset: "
9566 << manifest
->chunk_map
[iter
->first
].offset
9567 << " length: " << length
<< " pool id: " << oloc
.pool
9568 << " tid: " << tid
<< dendl
;
9570 if (last_offset
<= iter
->first
) {
9576 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
9578 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9579 << " " << cpp_strerror(r
) << dendl
;
9580 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9581 if (p
== copy_ops
.end()) {
9582 dout(10) << __func__
<< " no copy_op found" << dendl
;
9585 CopyOpRef cop
= p
->second
;
9586 if (tid
!= cop
->objecter_tid
) {
9587 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
9588 << " tid " << cop
->objecter_tid
<< dendl
;
9592 if (cop
->omap_data
.length() || cop
->omap_header
.length())
9593 cop
->results
.has_omap
= true;
9595 if (r
>= 0 && !pool
.info
.supports_omap() &&
9596 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
9599 cop
->objecter_tid
= 0;
9600 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9601 ObjectContextRef
& cobc
= cop
->obc
;
9606 ceph_assert(cop
->rval
>= 0);
9608 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
9609 // verify snap hasn't been deleted
9610 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
9611 while (p
!= cop
->results
.snaps
.end()) {
9612 // make best effort to sanitize snaps/clones.
9613 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
9614 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
9616 for (vector
<snapid_t
>::iterator q
= p
+ 1;
9617 q
!= cop
->results
.snaps
.end();
9620 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
9625 if (cop
->results
.snaps
.empty()) {
9626 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
9632 ceph_assert(cop
->rval
>= 0);
9634 if (!cop
->temp_cursor
.data_complete
) {
9635 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
9637 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
9638 if (cop
->omap_header
.length()) {
9639 cop
->results
.omap_digest
=
9640 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
9642 if (cop
->omap_data
.length()) {
9644 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
9645 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
9649 if (!cop
->temp_cursor
.attr_complete
) {
9650 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
9651 p
!= cop
->attrs
.end();
9653 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
9658 if (!cop
->cursor
.is_complete()) {
9659 // write out what we have so far
9660 if (cop
->temp_cursor
.is_initial()) {
9661 ceph_assert(!cop
->results
.started_temp_obj
);
9662 cop
->results
.started_temp_obj
= true;
9663 cop
->results
.temp_oid
= generate_temp_object(oid
);
9664 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
9666 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9667 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9668 if (cop
->temp_cursor
.is_initial()) {
9669 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
9671 _write_copy_chunk(cop
, ctx
->op_t
.get());
9672 simple_opc_submit(std::move(ctx
));
9673 dout(10) << __func__
<< " fetching more" << dendl
;
9674 _copy_some(cobc
, cop
);
9679 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
9680 dout(20) << __func__
<< std::hex
9681 << " got digest: rx data 0x" << cop
->results
.data_digest
9682 << " omap 0x" << cop
->results
.omap_digest
9683 << ", source: data 0x" << cop
->results
.source_data_digest
9684 << " omap 0x" << cop
->results
.source_omap_digest
9686 << " flags " << cop
->results
.flags
9689 if (cop
->results
.is_data_digest() &&
9690 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
9691 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
9692 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
9694 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9695 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9696 << " data digest 0x" << cop
->results
.data_digest
9697 << " != source 0x" << cop
->results
.source_data_digest
9702 if (cop
->results
.is_omap_digest() &&
9703 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
9704 derr
<< __func__
<< std::hex
9705 << " omap digest 0x" << cop
->results
.omap_digest
9706 << " != source 0x" << cop
->results
.source_omap_digest
9707 << std::dec
<< dendl
;
9708 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9709 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9710 << " omap digest 0x" << cop
->results
.omap_digest
9711 << " != source 0x" << cop
->results
.source_omap_digest
9716 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
9717 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
9722 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
9723 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
9724 ObjectState
& obs
= cop
->obc
->obs
;
9725 if (cop
->temp_cursor
.is_initial()) {
9726 dout(20) << "fill_in_final_tx: writing "
9727 << "directly to final object" << dendl
;
9728 // write directly to final object
9729 cop
->results
.temp_oid
= obs
.oi
.soid
;
9730 _write_copy_chunk(cop
, t
);
9732 // finish writing to temp object, then move into place
9733 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
9734 if (obs
.oi
.has_manifest() && obs
.oi
.manifest
.is_redirect() && obs
.exists
) {
9735 /* In redirect manifest case, the object exists in the upper tier.
9736 * So, to avoid a conflict when rename() is called, remove existing
9739 t
->remove(obs
.oi
.soid
);
9741 _write_copy_chunk(cop
, t
);
9742 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
9744 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
9747 dout(20) << __func__
<< " success; committing" << dendl
;
9750 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9751 CopyCallbackResults
results(r
, &cop
->results
);
9752 cop
->cb
->complete(results
);
9754 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9757 if (r
< 0 && cop
->results
.started_temp_obj
) {
9758 dout(10) << __func__
<< " deleting partial temp object "
9759 << cop
->results
.temp_oid
<< dendl
;
9760 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9761 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9762 ctx
->op_t
->remove(cop
->results
.temp_oid
);
9763 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
9764 simple_opc_submit(std::move(ctx
));
9767 // cancel and requeue proxy ops on this object
9769 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9772 kick_object_context_blocked(cobc
);
9775 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid
, ceph_tid_t tid
, int r
, uint64_t offset
)
9777 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9778 << " " << cpp_strerror(r
) << dendl
;
9779 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9780 if (p
== copy_ops
.end()) {
9781 dout(10) << __func__
<< " no copy_op found" << dendl
;
9784 CopyOpRef obj_cop
= p
->second
;
9785 CopyOpRef chunk_cop
= obj_cop
->chunk_cops
[offset
];
9787 if (tid
!= chunk_cop
->objecter_tid
) {
9788 dout(10) << __func__
<< " tid " << tid
<< " != cop " << chunk_cop
9789 << " tid " << chunk_cop
->objecter_tid
<< dendl
;
9793 if (chunk_cop
->omap_data
.length() || chunk_cop
->omap_header
.length()) {
9797 chunk_cop
->objecter_tid
= 0;
9798 chunk_cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9799 ObjectContextRef
& cobc
= obj_cop
->obc
;
9800 OSDOp
&chunk_data
= chunk_cop
->chunk_ops
[0];
9803 obj_cop
->failed
= true;
9807 if (obj_cop
->failed
) {
9810 if (!chunk_data
.outdata
.length()) {
9812 obj_cop
->failed
= true;
9816 obj_cop
->num_chunk
--;
9818 /* check all of the copyop are completed */
9819 if (obj_cop
->num_chunk
) {
9820 dout(20) << __func__
<< " num_chunk: " << obj_cop
->num_chunk
<< dendl
;
9825 OpContextUPtr ctx
= simple_opc_create(obj_cop
->obc
);
9826 if (!ctx
->lock_manager
.take_write_lock(
9827 obj_cop
->obc
->obs
.oi
.soid
,
9829 // recovery op can take read lock.
9830 // so need to wait for recovery completion
9832 obj_cop
->failed
= true;
9833 close_op_ctx(ctx
.release());
9836 dout(20) << __func__
<< " took lock on obc, " << obj_cop
->obc
->rwstate
<< dendl
;
9838 PGTransaction
*t
= ctx
->op_t
.get();
9839 ObjectState
& obs
= ctx
->new_obs
;
9840 for (auto p
: obj_cop
->chunk_cops
) {
9841 OSDOp
&sub_chunk
= p
.second
->chunk_ops
[0];
9842 t
->write(cobc
->obs
.oi
.soid
,
9843 p
.second
->cursor
.data_offset
,
9844 sub_chunk
.outdata
.length(),
9846 p
.second
->dest_obj_fadvise_flags
);
9847 dout(20) << __func__
<< " offset: " << p
.second
->cursor
.data_offset
9848 << " length: " << sub_chunk
.outdata
.length() << dendl
;
9849 write_update_size_and_usage(ctx
->delta_stats
, obs
.oi
, ctx
->modified_ranges
,
9850 p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9851 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_MISSING
);
9852 ctx
->clean_regions
.mark_data_region_dirty(p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9853 sub_chunk
.outdata
.clear();
9855 obs
.oi
.clear_data_digest();
9856 ctx
->at_version
= get_next_version();
9857 finish_ctx(ctx
.get(), pg_log_entry_t::PROMOTE
);
9858 simple_opc_submit(std::move(ctx
));
9859 obj_cop
->chunk_cops
.clear();
9861 auto p
= cobc
->obs
.oi
.manifest
.chunk_map
.rbegin();
9862 /* check remaining work */
9863 if (p
!= cobc
->obs
.oi
.manifest
.chunk_map
.rend()) {
9864 if (obj_cop
->last_offset
< p
->first
) {
9865 for (auto &en
: cobc
->obs
.oi
.manifest
.chunk_map
) {
9866 if (obj_cop
->last_offset
< en
.first
) {
9867 _copy_some_manifest(cobc
, obj_cop
, en
.first
);
9876 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9877 CopyCallbackResults
results(r
, &obj_cop
->results
);
9878 obj_cop
->cb
->complete(results
);
9880 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9883 // cancel and requeue proxy ops on this object
9885 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9888 kick_object_context_blocked(cobc
);
9891 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid
) {
9892 vector
<ceph_tid_t
> tids
;
9893 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
9894 it
!= proxyread_ops
.end();) {
9895 if (it
->second
->soid
== oid
) {
9896 cancel_proxy_read((it
++)->second
, &tids
);
9901 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
9902 it
!= proxywrite_ops
.end();) {
9903 if (it
->second
->soid
== oid
) {
9904 cancel_proxy_write((it
++)->second
, &tids
);
9909 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9910 kick_proxy_ops_blocked(oid
);
9913 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop
, PGTransaction
*t
)
9915 dout(20) << __func__
<< " " << cop
9916 << " " << cop
->attrs
.size() << " attrs"
9917 << " " << cop
->data
.length() << " bytes"
9918 << " " << cop
->omap_header
.length() << " omap header bytes"
9919 << " " << cop
->omap_data
.length() << " omap data bytes"
9921 if (!cop
->temp_cursor
.attr_complete
) {
9922 t
->create(cop
->results
.temp_oid
);
9924 if (!cop
->temp_cursor
.data_complete
) {
9925 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9926 cop
->cursor
.data_offset
);
9927 if (pool
.info
.required_alignment() &&
9928 !cop
->cursor
.data_complete
) {
9930 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9931 * to pick it up on the next pass.
9933 ceph_assert(cop
->temp_cursor
.data_offset
%
9934 pool
.info
.required_alignment() == 0);
9935 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
9937 cop
->data
.length() % pool
.info
.required_alignment();
9939 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
9941 cop
->cursor
.data_offset
-= to_trim
;
9942 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9943 cop
->cursor
.data_offset
);
9946 if (cop
->data
.length()) {
9948 cop
->results
.temp_oid
,
9949 cop
->temp_cursor
.data_offset
,
9952 cop
->dest_obj_fadvise_flags
);
9956 if (pool
.info
.supports_omap()) {
9957 if (!cop
->temp_cursor
.omap_complete
) {
9958 if (cop
->omap_header
.length()) {
9960 cop
->results
.temp_oid
,
9962 cop
->omap_header
.clear();
9964 if (cop
->omap_data
.length()) {
9965 map
<string
,bufferlist
> omap
;
9966 bufferlist::const_iterator p
= cop
->omap_data
.begin();
9968 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
9969 cop
->omap_data
.clear();
9973 ceph_assert(cop
->omap_header
.length() == 0);
9974 ceph_assert(cop
->omap_data
.length() == 0);
9976 cop
->temp_cursor
= cop
->cursor
;
9979 void PrimaryLogPG::finish_copyfrom(CopyFromCallback
*cb
)
9981 OpContext
*ctx
= cb
->ctx
;
9982 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
9984 ObjectState
& obs
= ctx
->new_obs
;
9986 dout(20) << __func__
<< ": exists, removing" << dendl
;
9987 ctx
->op_t
->remove(obs
.oi
.soid
);
9989 ctx
->delta_stats
.num_objects
++;
9992 if (cb
->is_temp_obj_used()) {
9993 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
9995 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
9997 // CopyFromCallback fills this in for us
9998 obs
.oi
.user_version
= ctx
->user_at_version
;
10000 if (cb
->results
->is_data_digest()) {
10001 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
10003 obs
.oi
.clear_data_digest();
10005 if (cb
->results
->is_omap_digest()) {
10006 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
10008 obs
.oi
.clear_omap_digest();
10011 obs
.oi
.truncate_seq
= cb
->truncate_seq
;
10012 obs
.oi
.truncate_size
= cb
->truncate_size
;
10014 obs
.oi
.mtime
= ceph::real_clock::to_timespec(cb
->results
->mtime
);
10015 ctx
->mtime
= utime_t();
10017 ctx
->extra_reqids
= cb
->results
->reqids
;
10018 ctx
->extra_reqid_return_codes
= cb
->results
->reqid_return_codes
;
10020 // cache: clear whiteout?
10021 if (obs
.oi
.is_whiteout()) {
10022 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
10023 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
10024 --ctx
->delta_stats
.num_whiteouts
;
10027 if (cb
->results
->has_omap
) {
10028 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
10029 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
10030 ctx
->clean_regions
.mark_omap_dirty();
10032 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
10033 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
10036 interval_set
<uint64_t> ch
;
10037 if (obs
.oi
.size
> 0)
10038 ch
.insert(0, obs
.oi
.size
);
10039 ctx
->modified_ranges
.union_of(ch
);
10040 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, cb
->get_data_size()));
10042 if (cb
->get_data_size() != obs
.oi
.size
) {
10043 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
10044 obs
.oi
.size
= cb
->get_data_size();
10045 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
10047 ctx
->delta_stats
.num_wr
++;
10048 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(obs
.oi
.size
, 10);
10050 osd
->logger
->inc(l_osd_copyfrom
);
10053 void PrimaryLogPG::finish_promote(int r
, CopyResults
*results
,
10054 ObjectContextRef obc
)
10056 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10057 dout(10) << __func__
<< " " << soid
<< " r=" << r
10058 << " uv" << results
->user_version
<< dendl
;
10060 if (r
== -ECANCELED
) {
10064 if (r
!= -ENOENT
&& soid
.is_snap()) {
10065 if (results
->snaps
.empty()) {
10066 // we must have read "snap" content from the head object in the
10067 // base pool. use snap_seq to construct what snaps should be
10068 // for this clone (what is was before we evicted the clean clone
10069 // from this pool, and what it will be when we flush and the
10070 // clone eventually happens in the base pool). we want to use
10071 // snaps in (results->snap_seq,soid.snap]
10072 SnapSet
& snapset
= obc
->ssc
->snapset
;
10073 for (auto p
= snapset
.clone_snaps
.rbegin();
10074 p
!= snapset
.clone_snaps
.rend();
10076 for (auto snap
: p
->second
) {
10077 if (snap
> soid
.snap
) {
10080 if (snap
<= results
->snap_seq
) {
10083 results
->snaps
.push_back(snap
);
10088 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
10089 filter_snapc(results
->snaps
);
10091 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
10092 if (results
->snaps
.empty()) {
10093 dout(20) << __func__
10094 << " snaps are empty, clone is invalid,"
10095 << " setting r to ENOENT" << dendl
;
10100 if (r
< 0 && results
->started_temp_obj
) {
10101 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
10102 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
10103 ceph_assert(tempobc
);
10104 OpContextUPtr ctx
= simple_opc_create(tempobc
);
10105 ctx
->op_t
->remove(results
->temp_oid
);
10106 simple_opc_submit(std::move(ctx
));
10107 results
->started_temp_obj
= false;
10110 if (r
== -ENOENT
&& soid
.is_snap()) {
10111 dout(10) << __func__
10112 << ": enoent while trying to promote clone, " << soid
10113 << " must have been trimmed, removing from snapset"
10115 hobject_t
head(soid
.get_head());
10116 ObjectContextRef obc
= get_object_context(head
, false);
10119 OpContextUPtr tctx
= simple_opc_create(obc
);
10120 tctx
->at_version
= get_next_version();
10121 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
10122 filter_snapc(tctx
->new_snapset
.snaps
);
10124 tctx
->new_snapset
.snaps
.clear();
10126 vector
<snapid_t
> new_clones
;
10127 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
10128 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
10129 i
!= tctx
->new_snapset
.clones
.end();
10131 if (*i
!= soid
.snap
) {
10132 new_clones
.push_back(*i
);
10133 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
10134 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
10135 new_clone_snaps
[*i
] = p
->second
;
10139 tctx
->new_snapset
.clones
.swap(new_clones
);
10140 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
10141 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
10142 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
10144 // take RWWRITE lock for duration of our local write. ignore starvation.
10145 if (!tctx
->lock_manager
.take_write_lock(
10148 ceph_abort_msg("problem!");
10150 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
10152 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
10154 simple_opc_submit(std::move(tctx
));
10158 bool whiteout
= false;
10159 if (r
== -ENOENT
) {
10160 ceph_assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
10161 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
10165 if (r
< 0 && !whiteout
) {
10166 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
10167 // pass error to everyone blocked on this object
10168 // FIXME: this is pretty sloppy, but at this point we got
10169 // something unexpected and don't have many other options.
10170 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
10171 waiting_for_blocked_object
.find(soid
);
10172 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
10173 while (!blocked_iter
->second
.empty()) {
10174 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
10175 blocked_iter
->second
.pop_front();
10177 waiting_for_blocked_object
.erase(blocked_iter
);
10182 osd
->promote_finish(results
->object_size
);
10184 OpContextUPtr tctx
= simple_opc_create(obc
);
10185 tctx
->at_version
= get_next_version();
10187 if (!obc
->obs
.oi
.has_manifest()) {
10188 ++tctx
->delta_stats
.num_objects
;
10190 if (soid
.snap
< CEPH_NOSNAP
)
10191 ++tctx
->delta_stats
.num_object_clones
;
10192 tctx
->new_obs
.exists
= true;
10194 tctx
->extra_reqids
= results
->reqids
;
10195 tctx
->extra_reqid_return_codes
= results
->reqid_return_codes
;
10197 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_redirect()) {
10198 tctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
10199 tctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
10200 tctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
10201 tctx
->new_obs
.oi
.manifest
.redirect_target
= hobject_t();
10202 tctx
->delta_stats
.num_objects_manifest
--;
10203 if (obc
->obs
.oi
.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
10204 dec_all_refcount_manifest(obc
->obs
.oi
, tctx
.get());
10209 // create a whiteout
10210 tctx
->op_t
->create(soid
);
10211 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
10212 ++tctx
->delta_stats
.num_whiteouts
;
10213 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
10214 osd
->logger
->inc(l_osd_tier_whiteout
);
10216 if (results
->has_omap
) {
10217 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
10218 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
10219 ++tctx
->delta_stats
.num_objects_omap
;
10222 results
->fill_in_final_tx(tctx
->op_t
.get());
10223 if (results
->started_temp_obj
) {
10224 tctx
->discard_temp_oid
= results
->temp_oid
;
10226 tctx
->new_obs
.oi
.size
= results
->object_size
;
10227 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
10228 tctx
->new_obs
.oi
.mtime
= ceph::real_clock::to_timespec(results
->mtime
);
10229 tctx
->mtime
= utime_t();
10230 if (results
->is_data_digest()) {
10231 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
10233 tctx
->new_obs
.oi
.clear_data_digest();
10235 if (results
->object_size
)
10236 tctx
->clean_regions
.mark_data_region_dirty(0, results
->object_size
);
10237 if (results
->is_omap_digest()) {
10238 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
10240 tctx
->new_obs
.oi
.clear_omap_digest();
10242 if (results
->has_omap
)
10243 tctx
->clean_regions
.mark_omap_dirty();
10244 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
10245 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
10247 if (soid
.snap
!= CEPH_NOSNAP
) {
10248 ceph_assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
10249 ceph_assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
10250 ceph_assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
10251 results
->object_size
);
10252 ceph_assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
10254 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
10256 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
10260 if (results
->mirror_snapset
) {
10261 ceph_assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
10262 tctx
->new_snapset
.from_snap_set(
10264 get_osdmap()->require_osd_release
< ceph_release_t::luminous
);
10266 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
10268 // take RWWRITE lock for duration of our local write. ignore starvation.
10269 if (!tctx
->lock_manager
.take_write_lock(
10272 ceph_abort_msg("problem!");
10274 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
10276 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
10278 simple_opc_submit(std::move(tctx
));
10280 osd
->logger
->inc(l_osd_tier_promote
);
10283 agent_state
->is_idle())
10284 agent_choose_mode();
10287 void PrimaryLogPG::finish_promote_manifest(int r
, CopyResults
*results
,
10288 ObjectContextRef obc
)
10290 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10291 dout(10) << __func__
<< " " << soid
<< " r=" << r
10292 << " uv" << results
->user_version
<< dendl
;
10294 if (r
== -ECANCELED
|| r
== -EAGAIN
) {
10299 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
10300 // pass error to everyone blocked on this object
10301 // FIXME: this is pretty sloppy, but at this point we got
10302 // something unexpected and don't have many other options.
10303 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
10304 waiting_for_blocked_object
.find(soid
);
10305 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
10306 while (!blocked_iter
->second
.empty()) {
10307 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
10308 blocked_iter
->second
.pop_front();
10310 waiting_for_blocked_object
.erase(blocked_iter
);
10315 osd
->promote_finish(results
->object_size
);
10316 osd
->logger
->inc(l_osd_tier_promote
);
10319 agent_state
->is_idle())
10320 agent_choose_mode();
10323 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
,
10324 vector
<ceph_tid_t
> *tids
)
10326 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
10327 << " from " << cop
->src
<< " " << cop
->oloc
10328 << " v" << cop
->results
.user_version
<< dendl
;
10330 // cancel objecter op, if we can
10331 if (cop
->objecter_tid
) {
10332 tids
->push_back(cop
->objecter_tid
);
10333 cop
->objecter_tid
= 0;
10334 if (cop
->objecter_tid2
) {
10335 tids
->push_back(cop
->objecter_tid2
);
10336 cop
->objecter_tid2
= 0;
10340 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
10341 cop
->obc
->stop_block();
10343 kick_object_context_blocked(cop
->obc
);
10344 cop
->results
.should_requeue
= requeue
;
10345 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
10346 cop
->cb
->complete(result
);
10348 // There may still be an objecter callback referencing this copy op.
10349 // That callback will not need the obc since it's been canceled, and
10350 // we need the obc reference to go away prior to flush.
10351 cop
->obc
= ObjectContextRef();
10354 void PrimaryLogPG::cancel_copy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
10356 dout(10) << __func__
<< dendl
;
10357 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
10358 while (p
!= copy_ops
.end()) {
10359 // requeue this op? can I queue up all of them?
10360 cancel_copy((p
++)->second
, requeue
, tids
);
10364 struct C_gather
: public Context
{
10365 PrimaryLogPGRef pg
;
10367 epoch_t last_peering_reset
;
10369 C_gather(PrimaryLogPG
*pg_
, hobject_t oid_
, epoch_t lpr_
, OSDOp
*osd_op_
) :
10370 pg(pg_
), oid(oid_
), last_peering_reset(lpr_
), osd_op(osd_op_
) {}
10371 void finish(int r
) override
{
10372 if (r
== -ECANCELED
)
10374 std::scoped_lock locker
{*pg
};
10375 auto p
= pg
->cls_gather_ops
.find(oid
);
10376 if (p
== pg
->cls_gather_ops
.end()) {
10377 // op was cancelled
10380 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
10384 PrimaryLogPG::OpContext
*ctx
= p
->second
.ctx
;
10385 pg
->cls_gather_ops
.erase(p
);
10386 pg
->execute_ctx(ctx
);
10390 int PrimaryLogPG::start_cls_gather(OpContext
*ctx
, std::map
<std::string
, bufferlist
> *src_obj_buffs
, const std::string
& pool
,
10391 const char *cls
, const char *method
, bufferlist
& inbl
)
10393 OpRequestRef op
= ctx
->op
;
10394 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
10396 auto pool_id
= osd
->objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
), pool
);
10397 object_locator_t
oloc(pool_id
);
10399 ObjectState
& obs
= ctx
->new_obs
;
10400 object_info_t
& oi
= obs
.oi
;
10401 const hobject_t
& soid
= oi
.soid
;
10403 ObjectContextRef obc
= get_object_context(soid
, false);
10404 C_GatherBuilder
gather(cct
);
10406 auto [iter
, inserted
] = cls_gather_ops
.emplace(soid
, CLSGatherOp(ctx
, obc
, op
));
10407 ceph_assert(inserted
);
10408 auto &cgop
= iter
->second
;
10409 for (std::map
<std::string
, bufferlist
>::iterator it
= src_obj_buffs
->begin(); it
!= src_obj_buffs
->end(); it
++) {
10410 std::string oid
= it
->first
;
10411 ObjectOperation obj_op
;
10412 obj_op
.call(cls
, method
, inbl
);
10413 uint32_t flags
= 0;
10414 ceph_tid_t tid
= osd
->objecter
->read(
10415 object_t(oid
), oloc
, obj_op
,
10416 m
->get_snapid(), &it
->second
,
10417 flags
, gather
.new_sub());
10418 cgop
.objecter_tids
.push_back(tid
);
10419 dout(10) << __func__
<< " src=" << oid
<< ", tgt=" << soid
<< dendl
;
10422 C_gather
*fin
= new C_gather(this, soid
, get_last_peering_reset(), &(*ctx
->ops
)[ctx
->current_osd_subop_num
]);
10423 gather
.set_finisher(new C_OnFinisher(fin
,
10424 osd
->get_objecter_finisher(get_pg_shard())));
10427 return -EINPROGRESS
;
10430 // ========================================================================
10433 // Flush a dirty object in the cache tier by writing it back to the
10434 // base tier. The sequence looks like:
10436 // * send a copy-from operation to the base tier to copy the current
10437 // version of the object
10438 // * base tier will pull the object via (perhaps multiple) copy-get(s)
10439 // * on completion, we check if the object has been modified. if so,
10440 // just reply with -EAGAIN.
10441 // * try to take a write lock so we can clear the dirty flag. if this
10442 // fails, wait and retry
10443 // * start a repop that clears the bit.
10445 // If we have to wait, we will retry by coming back through the
10446 // start_flush method. We check if a flush is already in progress
10447 // and, if so, try to finish it by rechecking the version and trying
10448 // to clear the dirty bit.
10450 // In order for the cache-flush (a write op) to not block the copy-get
10451 // from reading the object, the client *must* set the SKIPRWLOCKS
10454 // NOTE: normally writes are strictly ordered for the client, but
10455 // flushes are special in that they can be reordered with respect to
10456 // other writes. In particular, we can't have a flush request block
10457 // an update to the cache pool object!
10459 struct C_Flush
: public Context
{
10460 PrimaryLogPGRef pg
;
10462 epoch_t last_peering_reset
;
10465 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
10466 : pg(p
), oid(o
), last_peering_reset(lpr
),
10467 tid(0), start(ceph_clock_now())
10469 void finish(int r
) override
{
10470 if (r
== -ECANCELED
)
10472 std::scoped_lock locker
{*pg
};
10473 if (last_peering_reset
== pg
->get_last_peering_reset()) {
10474 pg
->finish_flush(oid
, tid
, r
);
10475 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
10480 int PrimaryLogPG::start_dedup(OpRequestRef op
, ObjectContextRef obc
)
10482 const object_info_t
& oi
= obc
->obs
.oi
;
10483 const hobject_t
& soid
= oi
.soid
;
10485 ceph_assert(obc
->is_blocked());
10486 if (oi
.size
== 0) {
10490 if (pool
.info
.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE
) {
10491 dout(0) << " fingerprint algorithm is not set " << dendl
;
10494 if (pool
.info
.get_dedup_tier() <= 0) {
10495 dout(10) << " dedup tier is not set " << dendl
;
10500 * The operations to make dedup chunks are tracked by a ManifestOp.
10501 * This op will be finished if all the operations are completed.
10503 ManifestOpRef
mop(std::make_shared
<ManifestOp
>(obc
, nullptr));
10506 std::map
<uint64_t, bufferlist
> chunks
;
10507 int r
= do_cdc(oi
, mop
->new_manifest
.chunk_map
, chunks
);
10511 if (!chunks
.size()) {
10515 // chunks issued here are different with chunk_map newly generated
10516 // because the same chunks in previous snap will not be issued
10517 // So, we need two data structures; the first is the issued chunk list to track
10518 // issued operations, and the second is the new chunk_map to update chunk_map after
10519 // all operations are finished
10520 object_ref_delta_t refs
;
10521 ObjectContextRef obc_l
, obc_g
;
10522 get_adjacent_clones(obc
, obc_l
, obc_g
);
10523 // skip if the same content exits in prev snap at same offset
10524 mop
->new_manifest
.calc_refs_to_inc_on_set(
10525 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
10526 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
10529 for (auto p
: chunks
) {
10530 hobject_t target
= mop
->new_manifest
.chunk_map
[p
.first
].oid
;
10531 if (refs
.find(target
) == refs
.end()) {
10534 C_SetDedupChunks
*fin
= new C_SetDedupChunks(this, soid
, get_last_peering_reset(), p
.first
);
10535 ceph_tid_t tid
= refcount_manifest(soid
, target
, refcount_t::CREATE_OR_GET_REF
,
10536 fin
, std::move(chunks
[p
.first
]));
10537 mop
->chunks
[target
] = make_pair(p
.first
, p
.second
.length());
10539 mop
->tids
[p
.first
] = tid
;
10541 dout(10) << __func__
<< " oid: " << soid
<< " tid: " << tid
10542 << " target: " << target
<< " offset: " << p
.first
10543 << " length: " << p
.second
.length() << dendl
;
10546 if (mop
->tids
.size()) {
10547 manifest_ops
[soid
] = mop
;
10548 manifest_ops
[soid
]->op
= op
;
10554 return -EINPROGRESS
;
10557 int PrimaryLogPG::do_cdc(const object_info_t
& oi
,
10558 std::map
<uint64_t, chunk_info_t
>& chunk_map
,
10559 std::map
<uint64_t, bufferlist
>& chunks
)
10561 string chunk_algo
= pool
.info
.get_dedup_chunk_algorithm_name();
10562 int64_t chunk_size
= pool
.info
.get_dedup_cdc_chunk_size();
10563 uint64_t total_length
= 0;
10565 std::unique_ptr
<CDC
> cdc
= CDC::create(chunk_algo
, cbits(chunk_size
)-1);
10567 dout(0) << __func__
<< " unrecognized chunk-algorithm " << dendl
;
10573 * We disable EC pool as a base tier of distributed dedup.
10574 * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
10575 * Therefore, we should change the current implementation totally to make EC pool compatible.
10576 * As s result, we leave this as a future work.
10578 int r
= pgbackend
->objects_read_sync(
10579 oi
.soid
, 0, oi
.size
, 0, &bl
);
10581 dout(0) << __func__
<< " read fail " << oi
.soid
10582 << " len: " << oi
.size
<< " r: " << r
<< dendl
;
10585 if (bl
.length() != oi
.size
) {
10586 dout(0) << __func__
<< " bl.length: " << bl
.length() << " != oi.size: "
10587 << oi
.size
<< " during chunking " << dendl
;
10591 dout(10) << __func__
<< " oid: " << oi
.soid
<< " len: " << bl
.length()
10592 << " oi.size: " << oi
.size
10593 << " chunk_size: " << chunk_size
<< dendl
;
10595 vector
<pair
<uint64_t, uint64_t>> cdc_chunks
;
10596 cdc
->calc_chunks(bl
, &cdc_chunks
);
10599 for (auto p
: cdc_chunks
) {
10601 chunk
.substr_of(bl
, p
.first
, p
.second
);
10602 auto [ret
, target
] = get_fpoid_from_chunk(oi
.soid
, chunk
);
10606 chunks
[p
.first
] = std::move(chunk
);
10607 chunk_map
[p
.first
] = chunk_info_t(0, p
.second
, target
);
10608 total_length
+= p
.second
;
10610 return total_length
;
10613 std::pair
<int, hobject_t
> PrimaryLogPG::get_fpoid_from_chunk(
10614 const hobject_t soid
, bufferlist
& chunk
)
10616 pg_pool_t::fingerprint_t fp_algo
= pool
.info
.get_fingerprint_type();
10617 if (fp_algo
== pg_pool_t::TYPE_FINGERPRINT_NONE
) {
10618 return make_pair(-EINVAL
, hobject_t());
10620 object_t fp_oid
= [&fp_algo
, &chunk
]() -> string
{
10622 case pg_pool_t::TYPE_FINGERPRINT_SHA1
:
10623 return ceph::crypto::digest
<ceph::crypto::SHA1
>(chunk
).to_str();
10624 case pg_pool_t::TYPE_FINGERPRINT_SHA256
:
10625 return ceph::crypto::digest
<ceph::crypto::SHA256
>(chunk
).to_str();
10626 case pg_pool_t::TYPE_FINGERPRINT_SHA512
:
10627 return ceph::crypto::digest
<ceph::crypto::SHA512
>(chunk
).to_str();
10629 assert(0 == "unrecognized fingerprint type");
10635 object_locator_t
oloc(soid
);
10636 oloc
.pool
= pool
.info
.get_dedup_tier();
10637 // check if dedup_tier isn't set
10638 ceph_assert(oloc
.pool
> 0);
10639 int ret
= get_osdmap()->object_locator_to_pg(fp_oid
, oloc
, raw_pg
);
10641 return make_pair(ret
, hobject_t());
10643 hobject_t
target(fp_oid
, oloc
.key
, snapid_t(),
10644 raw_pg
.ps(), raw_pg
.pool(),
10646 return make_pair(0, target
);
10649 int PrimaryLogPG::finish_set_dedup(hobject_t oid
, int r
, ceph_tid_t tid
, uint64_t offset
)
10651 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10652 << " " << cpp_strerror(r
) << dendl
;
10653 map
<hobject_t
,ManifestOpRef
>::iterator p
= manifest_ops
.find(oid
);
10654 if (p
== manifest_ops
.end()) {
10655 dout(10) << __func__
<< " no manifest_op found" << dendl
;
10658 ManifestOpRef mop
= p
->second
;
10659 mop
->results
[offset
] = r
;
10661 // if any failure occurs, put a mark on the results to recognize the failure
10662 mop
->results
[0] = r
;
10664 if (mop
->num_chunks
!= mop
->results
.size()) {
10665 // there are on-going works
10666 return -EINPROGRESS
;
10668 ObjectContextRef obc
= mop
->obc
;
10670 ceph_assert(obc
->is_blocked());
10672 kick_object_context_blocked(obc
);
10673 if (mop
->results
[0] < 0) {
10674 // check if the previous op returns fail
10675 ceph_assert(mop
->num_chunks
== mop
->results
.size());
10676 manifest_ops
.erase(oid
);
10677 osd
->reply_op_error(mop
->op
, mop
->results
[0]);
10681 if (mop
->chunks
.size()) {
10682 OpContextUPtr ctx
= simple_opc_create(obc
);
10684 if (ctx
->lock_manager
.get_lock_type(
10689 dout(20) << __func__
<< " took write lock" << dendl
;
10690 } else if (mop
->op
) {
10691 dout(10) << __func__
<< " waiting on write lock " << mop
->op
<< dendl
;
10692 close_op_ctx(ctx
.release());
10696 ctx
->at_version
= get_next_version();
10697 ctx
->new_obs
= obc
->obs
;
10698 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
10699 --ctx
->delta_stats
.num_objects_dirty
;
10700 if (!ctx
->obs
->oi
.has_manifest()) {
10701 ctx
->delta_stats
.num_objects_manifest
++;
10702 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_MANIFEST
);
10703 ctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
10707 * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10708 * head: [0, 2) aaa <-- tier_flush()
10709 * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10711 * In this case, if the new chunk_map is as follows,
10712 * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10713 * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10714 * So, the precedure is
10715 * 1. calc_refs_to_drop_on_removal()
10716 * 2. register old references to drop after tier_flush() is committed
10717 * 3. update new chunk_map
10720 ObjectCleanRegions c_regions
= ctx
->clean_regions
;
10721 ObjectContextRef cobc
= get_prev_clone_obc(obc
);
10722 c_regions
.mark_fully_dirty();
10723 // CDC was done on entire range of manifest object,
10724 // so the first thing we should do here is to drop the reference to old chunks
10725 ObjectContextRef obc_l
, obc_g
;
10726 get_adjacent_clones(obc
, obc_l
, obc_g
);
10727 // clear all old references
10728 object_ref_delta_t refs
;
10729 ctx
->obs
->oi
.manifest
.calc_refs_to_drop_on_removal(
10730 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
10731 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
10733 if (!refs
.is_empty()) {
10734 ctx
->register_on_commit(
10735 [oid
, this, refs
](){
10736 dec_refcount(oid
, refs
);
10740 // set new references
10741 ctx
->new_obs
.oi
.manifest
.chunk_map
= mop
->new_manifest
.chunk_map
;
10743 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
10744 simple_opc_submit(std::move(ctx
));
10747 osd
->reply_op_error(mop
->op
, r
);
10749 manifest_ops
.erase(oid
);
10753 int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid
, int r
, ceph_tid_t tid
, uint64_t offset
)
10755 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10756 << " " << cpp_strerror(r
) << dendl
;
10757 map
<hobject_t
,ManifestOpRef
>::iterator p
= manifest_ops
.find(oid
);
10758 if (p
== manifest_ops
.end()) {
10759 dout(10) << __func__
<< " no manifest_op found" << dendl
;
10762 ManifestOpRef mop
= p
->second
;
10763 mop
->results
[offset
] = r
;
10765 // if any failure occurs, put a mark on the results to recognize the failure
10766 mop
->results
[0] = r
;
10768 if (mop
->num_chunks
!= mop
->results
.size()) {
10769 // there are on-going works
10770 return -EINPROGRESS
;
10774 mop
->cb
->complete(r
);
10777 manifest_ops
.erase(p
);
10783 int PrimaryLogPG::start_flush(
10784 OpRequestRef op
, ObjectContextRef obc
,
10785 bool blocking
, hobject_t
*pmissing
,
10786 std::optional
<std::function
<void()>> &&on_flush
,
10789 const object_info_t
& oi
= obc
->obs
.oi
;
10790 const hobject_t
& soid
= oi
.soid
;
10791 dout(10) << __func__
<< " " << soid
10792 << " v" << oi
.version
10793 << " uv" << oi
.user_version
10794 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
10797 bool preoctopus_compat
=
10798 get_osdmap()->require_osd_release
< ceph_release_t::octopus
;
10800 if (preoctopus_compat
) {
10801 // for pre-octopus compatibility, filter SnapSet::snaps. not
10802 // certain we need this, but let's be conservative.
10803 snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
10805 // NOTE: change this to a const ref when we remove this compat code
10806 snapset
= obc
->ssc
->snapset
;
10809 if ((obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked())
10811 // current dedup tier only supports blocking operation
10813 return -EOPNOTSUPP
;
10817 // verify there are no (older) check for dirty clones
10819 dout(20) << " snapset " << snapset
<< dendl
;
10820 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
10821 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
10823 if (p
!= snapset
.clones
.rend()) {
10824 hobject_t next
= soid
;
10826 ceph_assert(next
.snap
< soid
.snap
);
10827 if (recovery_state
.get_pg_log().get_missing().is_missing(next
)) {
10828 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
10833 ObjectContextRef older_obc
= get_object_context(next
, false);
10835 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
10837 if (older_obc
->obs
.oi
.is_dirty()) {
10838 dout(10) << __func__
<< " next oldest clone is dirty: "
10839 << older_obc
->obs
.oi
<< dendl
;
10843 dout(20) << __func__
<< " next oldest clone " << next
10844 << " is not present; implicitly clean" << dendl
;
10847 dout(20) << __func__
<< " no older clones" << dendl
;
10852 dout(20) << fmt::format("{}: blocking {}", __func__
, soid
) << dendl
;
10853 obc
->start_block();
10856 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
10857 if (p
!= flush_ops
.end()) {
10858 FlushOpRef fop
= p
->second
;
10859 if (fop
->op
== op
) {
10860 // we couldn't take the write lock on a cache-try-flush before;
10861 // now we are trying again for the lock.
10862 return try_flush_mark_clean(fop
);
10864 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
10865 (fop
->blocking
|| !blocking
)) {
10866 // nonblocking can join anything
10867 // blocking can only join a blocking flush
10868 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
10870 fop
->dup_ops
.push_back(op
);
10871 return -EAGAIN
; // clean up this ctx; op will retry later
10874 // cancel current flush since it will fail anyway, or because we
10875 // are blocking and the existing flush is nonblocking.
10876 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
10878 osd
->reply_op_error(fop
->op
, -EBUSY
);
10879 while (!fop
->dup_ops
.empty()) {
10880 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
10881 fop
->dup_ops
.pop_front();
10883 vector
<ceph_tid_t
> tids
;
10884 cancel_flush(fop
, false, &tids
);
10885 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10888 if ((obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked())
10890 int r
= start_dedup(op
, obc
);
10891 if (r
!= -EINPROGRESS
) {
10899 * In general, we need to send a delete and a copyfrom.
10900 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10901 * where 4 is marked as clean. To flush 10, we have to:
10902 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10903 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10905 * There is a complicating case. Supposed there had been a clone 7
10906 * for snaps [7, 6] which has been trimmed since they no longer exist.
10907 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10908 * the delete, the snap will be promoted to 5, and the head will become
10909 * a whiteout. When the copy-from goes through, we'll end up with
10910 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10912 * Another complication is the case where there is an interval change
10913 * after doing the delete and the flush but before marking the object
10914 * clean. We'll happily delete head and then recreate it at the same
10915 * sequence number, which works out ok.
10918 SnapContext snapc
, dsnapc
;
10919 if (snapset
.seq
!= 0) {
10920 if (soid
.snap
== CEPH_NOSNAP
) {
10921 snapc
= snapset
.get_ssc_as_of(snapset
.seq
);
10923 snapid_t min_included_snap
;
10924 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
10925 ceph_assert(p
!= snapset
.clone_snaps
.end());
10926 min_included_snap
= p
->second
.back();
10927 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
10930 snapid_t prev_snapc
= 0;
10931 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
10932 citer
!= snapset
.clones
.rend();
10934 if (*citer
< soid
.snap
) {
10935 prev_snapc
= *citer
;
10940 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
10943 object_locator_t
base_oloc(soid
);
10944 base_oloc
.pool
= pool
.info
.tier_of
;
10946 if (dsnapc
.seq
< snapc
.seq
) {
10949 osd
->objecter
->mutate(
10954 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10955 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
10956 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
10957 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
10960 FlushOpRef
fop(std::make_shared
<FlushOp
>());
10962 fop
->flushed_version
= oi
.user_version
;
10963 fop
->blocking
= blocking
;
10964 fop
->on_flush
= std::move(on_flush
);
10968 if (oi
.is_whiteout()) {
10969 fop
->removal
= true;
10972 object_locator_t
oloc(soid
);
10973 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
10974 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
10975 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
10976 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
10977 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
10978 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
10980 //mean the base tier don't cache data after this
10981 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
10982 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
10984 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
10986 ceph_tid_t tid
= osd
->objecter
->mutate(
10987 soid
.oid
, base_oloc
, o
, snapc
,
10988 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10989 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
10990 new C_OnFinisher(fin
,
10991 osd
->get_objecter_finisher(get_pg_shard())));
10992 /* we're under the pg lock and fin->finish() is grabbing that */
10994 fop
->objecter_tid
= tid
;
10996 flush_ops
[soid
] = fop
;
10998 recovery_state
.update_stats(
10999 [&oi
](auto &history
, auto &stats
) {
11000 stats
.stats
.sum
.num_flush
++;
11001 stats
.stats
.sum
.num_flush_kb
+= shift_round_up(oi
.size
, 10);
11004 return -EINPROGRESS
;
11007 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
11009 dout(10) << __func__
<< " " << oid
<< " tid " << tid
11010 << " " << cpp_strerror(r
) << dendl
;
11011 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
11012 if (p
== flush_ops
.end()) {
11013 dout(10) << __func__
<< " no flush_op found" << dendl
;
11016 FlushOpRef fop
= p
->second
;
11017 if (tid
!= fop
->objecter_tid
&& !fop
->obc
->obs
.oi
.has_manifest()) {
11018 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
11019 << " tid " << fop
->objecter_tid
<< dendl
;
11022 ObjectContextRef obc
= fop
->obc
;
11023 fop
->objecter_tid
= 0;
11025 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
11027 osd
->reply_op_error(fop
->op
, -EBUSY
);
11028 if (fop
->blocking
) {
11030 kick_object_context_blocked(obc
);
11033 if (!fop
->dup_ops
.empty()) {
11034 dout(20) << __func__
<< " requeueing dups" << dendl
;
11035 requeue_ops(fop
->dup_ops
);
11037 if (fop
->on_flush
) {
11038 (*(fop
->on_flush
))();
11039 fop
->on_flush
= std::nullopt
;
11041 flush_ops
.erase(oid
);
11045 r
= try_flush_mark_clean(fop
);
11046 if (r
== -EBUSY
&& fop
->op
) {
11047 osd
->reply_op_error(fop
->op
, r
);
11051 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
11053 ObjectContextRef obc
= fop
->obc
;
11054 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
11056 if (fop
->blocking
) {
11058 kick_object_context_blocked(obc
);
11061 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
11062 !obc
->obs
.exists
) {
11063 if (obc
->obs
.exists
)
11064 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
11065 << " != current " << obc
->obs
.oi
.user_version
11068 dout(10) << __func__
<< " object no longer exists" << dendl
;
11070 if (!fop
->dup_ops
.empty()) {
11071 dout(20) << __func__
<< " requeueing dups" << dendl
;
11072 requeue_ops(fop
->dup_ops
);
11074 if (fop
->on_flush
) {
11075 (*(fop
->on_flush
))();
11076 fop
->on_flush
= std::nullopt
;
11078 flush_ops
.erase(oid
);
11080 osd
->logger
->inc(l_osd_tier_flush_fail
);
11082 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11086 if (!fop
->blocking
&&
11087 m_scrubber
->write_blocked_by_scrub(oid
)) {
11089 dout(10) << __func__
<< " blocked by scrub" << dendl
;
11090 requeue_op(fop
->op
);
11091 requeue_ops(fop
->dup_ops
);
11092 return -EAGAIN
; // will retry
11094 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11095 vector
<ceph_tid_t
> tids
;
11096 cancel_flush(fop
, false, &tids
);
11097 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11102 // successfully flushed, can we evict this object?
11103 if (!obc
->obs
.oi
.has_manifest() && !fop
->op
&&
11104 agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
11105 agent_maybe_evict(obc
, true)) {
11106 osd
->logger
->inc(l_osd_tier_clean
);
11107 if (fop
->on_flush
) {
11108 (*(fop
->on_flush
))();
11109 fop
->on_flush
= std::nullopt
;
11111 flush_ops
.erase(oid
);
11115 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
11116 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
11118 // successfully flushed; can we clear the dirty bit?
11119 // try to take the lock manually, since we don't
11121 if (ctx
->lock_manager
.get_lock_type(
11126 dout(20) << __func__
<< " took write lock" << dendl
;
11127 } else if (fop
->op
) {
11128 dout(10) << __func__
<< " waiting on write lock " << fop
->op
<< " "
11129 << fop
->dup_ops
<< dendl
;
11130 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
11131 for (auto op
: fop
->dup_ops
) {
11132 bool locked
= ctx
->lock_manager
.get_lock_type(
11137 ceph_assert(!locked
);
11139 close_op_ctx(ctx
.release());
11140 return -EAGAIN
; // will retry
11142 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
11143 close_op_ctx(ctx
.release());
11144 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11145 vector
<ceph_tid_t
> tids
;
11146 cancel_flush(fop
, false, &tids
);
11147 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11151 if (fop
->on_flush
) {
11152 ctx
->register_on_finish(*(fop
->on_flush
));
11153 fop
->on_flush
= std::nullopt
;
11156 ctx
->at_version
= get_next_version();
11158 ctx
->new_obs
= obc
->obs
;
11159 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
11160 --ctx
->delta_stats
.num_objects_dirty
;
11161 if (fop
->obc
->obs
.oi
.has_manifest()) {
11162 ceph_assert(obc
->obs
.oi
.manifest
.is_chunked());
11163 PGTransaction
* t
= ctx
->op_t
.get();
11164 uint64_t chunks_size
= 0;
11165 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11166 chunks_size
+= p
.second
.length
;
11168 if (ctx
->new_obs
.oi
.is_omap() && pool
.info
.supports_omap()) {
11169 t
->omap_clear(oid
);
11170 ctx
->new_obs
.oi
.clear_omap_digest();
11171 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
11172 ctx
->clean_regions
.mark_omap_dirty();
11174 if (obc
->obs
.oi
.size
== chunks_size
) {
11175 t
->truncate(oid
, 0);
11176 interval_set
<uint64_t> trim
;
11177 trim
.insert(0, ctx
->new_obs
.oi
.size
);
11178 ctx
->modified_ranges
.union_of(trim
);
11179 truncate_update_size_and_usage(ctx
->delta_stats
,
11182 ctx
->clean_regions
.mark_data_region_dirty(0, ctx
->new_obs
.oi
.size
);
11183 ctx
->new_obs
.oi
.new_object();
11184 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11185 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
11188 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11189 dout(20) << __func__
<< " offset: " << p
.second
.offset
11190 << " length: " << p
.second
.length
<< dendl
;
11191 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
); // CLEAN
11196 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
11198 osd
->logger
->inc(l_osd_tier_clean
);
11200 if (!fop
->dup_ops
.empty() || fop
->op
) {
11201 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
11202 list
<OpRequestRef
> ls
;
11204 ls
.push_back(fop
->op
);
11205 ls
.splice(ls
.end(), fop
->dup_ops
);
11209 simple_opc_submit(std::move(ctx
));
11211 flush_ops
.erase(oid
);
11214 osd
->logger
->inc(l_osd_tier_flush
);
11216 osd
->logger
->inc(l_osd_tier_try_flush
);
11218 return -EINPROGRESS
;
11221 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
,
11222 vector
<ceph_tid_t
> *tids
)
11224 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
11225 << fop
->objecter_tid
<< dendl
;
11226 if (fop
->objecter_tid
) {
11227 tids
->push_back(fop
->objecter_tid
);
11228 fop
->objecter_tid
= 0;
11230 if (fop
->io_tids
.size()) {
11231 for (auto &p
: fop
->io_tids
) {
11232 tids
->push_back(p
.second
);
11236 if (fop
->blocking
&& fop
->obc
->is_blocked()) {
11237 fop
->obc
->stop_block();
11238 kick_object_context_blocked(fop
->obc
);
11242 requeue_op(fop
->op
);
11243 requeue_ops(fop
->dup_ops
);
11245 if (fop
->on_flush
) {
11246 (*(fop
->on_flush
))();
11247 fop
->on_flush
= std::nullopt
;
11249 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
11252 void PrimaryLogPG::cancel_flush_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
11254 dout(10) << __func__
<< dendl
;
11255 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
11256 while (p
!= flush_ops
.end()) {
11257 cancel_flush((p
++)->second
, requeue
, tids
);
11261 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
11263 if (!pool
.info
.allow_incomplete_clones())
11265 if (is_missing_object(coid
))
11267 ObjectContextRef obc
= get_object_context(coid
, false);
11268 return obc
&& obc
->obs
.exists
;
11271 // ========================================================================
11275 void PrimaryLogPG::cancel_cls_gather(map
<hobject_t
,CLSGatherOp
>::iterator iter
, bool requeue
,
11276 vector
<ceph_tid_t
> *tids
)
11278 auto &cgop
= iter
->second
;
11279 for (std::vector
<ceph_tid_t
>::iterator p
= cgop
.objecter_tids
.begin(); p
!= cgop
.objecter_tids
.end(); p
++) {
11280 tids
->push_back(*p
);
11281 dout(10) << __func__
<< " " << cgop
.obc
->obs
.oi
.soid
<< " tid " << *p
<< dendl
;
11283 cgop
.objecter_tids
.clear();
11284 close_op_ctx(cgop
.ctx
);
11288 requeue_op(cgop
.op
);
11290 cls_gather_ops
.erase(iter
);
11293 void PrimaryLogPG::cancel_cls_gather_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
11295 dout(10) << __func__
<< dendl
;
11296 map
<hobject_t
,CLSGatherOp
>::iterator p
= cls_gather_ops
.begin();
11297 while (p
!= cls_gather_ops
.end()) {
11298 cancel_cls_gather(p
++, requeue
, tids
);
11302 // ========================================================================
11305 class C_OSD_RepopCommit
: public Context
{
11306 PrimaryLogPGRef pg
;
11307 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
11309 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
11310 : pg(pg
), repop(repop
) {}
11311 void finish(int) override
{
11312 pg
->repop_all_committed(repop
.get());
11316 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
11318 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
11320 repop
->all_committed
= true;
11321 if (!repop
->rep_aborted
) {
11322 if (repop
->v
!= eversion_t()) {
11323 recovery_state
.complete_write(repop
->v
, repop
->pg_local_last_complete
);
11329 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
11331 dout(10) << "op_applied version " << applied_version
<< dendl
;
11332 ceph_assert(applied_version
!= eversion_t());
11333 ceph_assert(applied_version
<= info
.last_update
);
11334 recovery_state
.local_write_applied(applied_version
);
11336 if (is_primary() && m_scrubber
) {
11337 // if there's a scrub operation waiting for the selected chunk to be fully updated -
11338 // allow it to continue
11339 m_scrubber
->on_applied_when_primary(recovery_state
.get_last_update_applied());
11343 void PrimaryLogPG::eval_repop(RepGather
*repop
)
11345 dout(10) << "eval_repop " << *repop
11346 << (repop
->op
&& repop
->op
->get_req
<MOSDOp
>() ? "" : " (no op)") << dendl
;
11349 if (repop
->all_committed
) {
11350 dout(10) << " commit: " << *repop
<< dendl
;
11351 for (auto p
= repop
->on_committed
.begin();
11352 p
!= repop
->on_committed
.end();
11353 repop
->on_committed
.erase(p
++)) {
11356 // send dup commits, in order
11357 auto it
= waiting_for_ondisk
.find(repop
->v
);
11358 if (it
!= waiting_for_ondisk
.end()) {
11359 ceph_assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
11360 for (auto& i
: it
->second
) {
11361 int return_code
= repop
->r
;
11362 if (return_code
>= 0) {
11363 return_code
= std::get
<2>(i
);
11365 osd
->reply_op_error(std::get
<0>(i
), return_code
, repop
->v
,
11366 std::get
<1>(i
), std::get
<3>(i
));
11368 waiting_for_ondisk
.erase(it
);
11371 publish_stats_to_osd();
11373 dout(10) << " removing " << *repop
<< dendl
;
11374 ceph_assert(!repop_queue
.empty());
11375 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
11376 if (repop_queue
.front() == repop
) {
11377 RepGather
*to_remove
= nullptr;
11378 while (!repop_queue
.empty() &&
11379 (to_remove
= repop_queue
.front())->all_committed
) {
11380 repop_queue
.pop_front();
11381 for (auto p
= to_remove
->on_success
.begin();
11382 p
!= to_remove
->on_success
.end();
11383 to_remove
->on_success
.erase(p
++)) {
11386 remove_repop(to_remove
);
11392 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
11395 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
11396 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
11401 repop
->v
= ctx
->at_version
;
11403 ctx
->op_t
->add_obc(ctx
->obc
);
11404 if (ctx
->clone_obc
) {
11405 ctx
->op_t
->add_obc(ctx
->clone_obc
);
11407 if (ctx
->head_obc
) {
11408 ctx
->op_t
->add_obc(ctx
->head_obc
);
11411 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
11412 if (!(ctx
->log
.empty())) {
11413 ceph_assert(ctx
->at_version
>= projected_last_update
);
11414 projected_last_update
= ctx
->at_version
;
11416 for (auto &&entry
: ctx
->log
) {
11417 projected_log
.add(entry
);
11420 recovery_state
.pre_submit_op(
11424 pgbackend
->submit_transaction(
11428 std::move(ctx
->op_t
),
11429 recovery_state
.get_pg_trim_to(),
11430 recovery_state
.get_min_last_complete_ondisk(),
11431 std::move(ctx
->log
),
11432 ctx
->updated_hset_history
,
11439 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
11441 ceph_tid_t rep_tid
)
11444 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
11446 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
11448 RepGather
*repop
= new RepGather(
11449 ctx
, rep_tid
, info
.last_complete
);
11451 repop
->start
= ceph_clock_now();
11453 repop_queue
.push_back(&repop
->queue_item
);
11456 osd
->logger
->inc(l_osd_op_wip
);
11458 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11462 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
11463 eversion_t version
,
11465 ObcLockManager
&&manager
,
11467 std::optional
<std::function
<void(void)> > &&on_complete
)
11469 RepGather
*repop
= new RepGather(
11470 std::move(manager
),
11472 std::move(on_complete
),
11474 info
.last_complete
,
11476 repop
->v
= version
;
11478 repop
->start
= ceph_clock_now();
11480 repop_queue
.push_back(&repop
->queue_item
);
11482 osd
->logger
->inc(l_osd_op_wip
);
11484 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11485 return boost::intrusive_ptr
<RepGather
>(repop
);
11488 void PrimaryLogPG::remove_repop(RepGather
*repop
)
11490 dout(20) << __func__
<< " " << *repop
<< dendl
;
11492 for (auto p
= repop
->on_finish
.begin();
11493 p
!= repop
->on_finish
.end();
11494 repop
->on_finish
.erase(p
++)) {
11498 release_object_locks(
11499 repop
->lock_manager
);
11502 osd
->logger
->dec(l_osd_op_wip
);
11505 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
11507 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
11508 ceph_tid_t rep_tid
= osd
->get_tid();
11509 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
11510 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
11511 ctx
->op_t
.reset(new PGTransaction());
11512 ctx
->mtime
= ceph_clock_now();
11516 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
11518 RepGather
*repop
= new_repop(ctx
.get(), ctx
->reqid
.tid
);
11519 dout(20) << __func__
<< " " << repop
<< dendl
;
11520 issue_repop(repop
, ctx
.get());
11522 recovery_state
.update_trim_to();
11527 void PrimaryLogPG::submit_log_entries(
11528 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
11529 ObcLockManager
&&manager
,
11530 std::optional
<std::function
<void(void)> > &&_on_complete
,
11534 dout(10) << __func__
<< " " << entries
<< dendl
;
11535 ceph_assert(is_primary());
11537 eversion_t version
;
11538 if (!entries
.empty()) {
11539 ceph_assert(entries
.rbegin()->version
>= projected_last_update
);
11540 version
= projected_last_update
= entries
.rbegin()->version
;
11543 boost::intrusive_ptr
<RepGather
> repop
;
11544 std::optional
<std::function
<void(void)> > on_complete
;
11545 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11549 std::move(manager
),
11551 std::move(_on_complete
));
11553 on_complete
= std::move(_on_complete
);
11556 pgbackend
->call_write_ordered(
11557 [this, entries
, repop
, on_complete
]() {
11558 ObjectStore::Transaction t
;
11559 eversion_t old_last_update
= info
.last_update
;
11560 recovery_state
.merge_new_log_entries(
11561 entries
, t
, recovery_state
.get_pg_trim_to(),
11562 recovery_state
.get_min_last_complete_ondisk());
11564 set
<pg_shard_t
> waiting_on
;
11565 for (set
<pg_shard_t
>::const_iterator i
= get_acting_recovery_backfill().begin();
11566 i
!= get_acting_recovery_backfill().end();
11568 pg_shard_t
peer(*i
);
11569 if (peer
== pg_whoami
) continue;
11570 ceph_assert(recovery_state
.get_peer_missing().count(peer
));
11571 ceph_assert(recovery_state
.has_peer_info(peer
));
11572 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11573 ceph_assert(repop
);
11574 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
11576 spg_t(info
.pgid
.pgid
, i
->shard
),
11578 get_osdmap_epoch(),
11579 get_last_peering_reset(),
11581 recovery_state
.get_pg_trim_to(),
11582 recovery_state
.get_min_last_complete_ondisk());
11583 osd
->send_message_osd_cluster(
11584 peer
.osd
, m
, get_osdmap_epoch());
11585 waiting_on
.insert(peer
);
11587 MOSDPGLog
*m
= new MOSDPGLog(
11588 peer
.shard
, pg_whoami
.shard
,
11589 info
.last_update
.epoch
,
11590 info
, get_last_peering_reset());
11591 m
->log
.log
= entries
;
11592 m
->log
.tail
= old_last_update
;
11593 m
->log
.head
= info
.last_update
;
11594 osd
->send_message_osd_cluster(
11595 peer
.osd
, m
, get_osdmap_epoch());
11598 ceph_tid_t rep_tid
= repop
->rep_tid
;
11599 waiting_on
.insert(pg_whoami
);
11600 log_entry_update_waiting_on
.insert(
11603 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
11605 struct OnComplete
: public Context
{
11606 PrimaryLogPGRef pg
;
11607 ceph_tid_t rep_tid
;
11610 PrimaryLogPGRef pg
,
11611 ceph_tid_t rep_tid
,
11613 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
11614 void finish(int) override
{
11615 std::scoped_lock l
{*pg
};
11616 if (!pg
->pg_has_reset_since(epoch
)) {
11617 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
11618 ceph_assert(it
!= pg
->log_entry_update_waiting_on
.end());
11619 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
11620 ceph_assert(it2
!= it
->second
.waiting_on
.end());
11621 it
->second
.waiting_on
.erase(it2
);
11622 if (it
->second
.waiting_on
.empty()) {
11623 pg
->repop_all_committed(it
->second
.repop
.get());
11624 pg
->log_entry_update_waiting_on
.erase(it
);
11629 t
.register_on_commit(
11630 new OnComplete
{this, rep_tid
, get_osdmap_epoch()});
11631 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
11632 ceph_assert(r
== 0);
11633 op_applied(info
.last_update
);
11636 recovery_state
.update_trim_to();
11639 void PrimaryLogPG::cancel_log_updates()
11641 // get rid of all the LogUpdateCtx so their references to repops are
11643 log_entry_update_waiting_on
.clear();
11646 // -------------------------------------------------------
11648 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> *ls
)
11650 std::scoped_lock l
{*this};
11651 pair
<hobject_t
, ObjectContextRef
> i
;
11652 while (object_contexts
.get_next(i
.first
, &i
)) {
11653 ObjectContextRef
obc(i
.second
);
11654 get_obc_watchers(obc
, *ls
);
11658 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
11660 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11661 obc
->watchers
.begin();
11662 j
!= obc
->watchers
.end();
11664 obj_watch_item_t owi
;
11666 owi
.obj
= obc
->obs
.oi
.soid
;
11667 owi
.wi
.addr
= j
->second
->get_peer_addr();
11668 owi
.wi
.name
= j
->second
->get_entity();
11669 owi
.wi
.cookie
= j
->second
->get_cookie();
11670 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
11672 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
11673 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
11675 pg_watchers
.push_back(owi
);
11679 void PrimaryLogPG::check_blocklisted_watchers()
11681 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl
;
11682 pair
<hobject_t
, ObjectContextRef
> i
;
11683 while (object_contexts
.get_next(i
.first
, &i
))
11684 check_blocklisted_obc_watchers(i
.second
);
11687 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc
)
11689 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
11690 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
11691 obc
->watchers
.begin();
11692 k
!= obc
->watchers
.end();
11694 //Advance iterator now so handle_watch_timeout() can erase element
11695 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
11696 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
11697 entity_addr_t ea
= j
->second
->get_peer_addr();
11698 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
11699 if (get_osdmap()->is_blocklisted(ea
)) {
11700 dout(10) << "watch: Found blocklisted watcher for " << ea
<< dendl
;
11701 ceph_assert(j
->second
->get_pg() == this);
11702 j
->second
->unregister_cb();
11703 handle_watch_timeout(j
->second
);
11708 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
11710 ceph_assert(is_primary() && is_active());
11711 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(obc
->obs
.oi
.soid
);
11712 ceph_assert((recovering
.count(obc
->obs
.oi
.soid
) ||
11713 !is_missing_object(obc
->obs
.oi
.soid
)) ||
11714 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() && // or this is a revert... see recover_primary()
11715 it_objects
->second
->op
==
11716 pg_log_entry_t::LOST_REVERT
&&
11717 it_objects
->second
->reverting_to
==
11718 obc
->obs
.oi
.version
));
11720 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
11721 ceph_assert(obc
->watchers
.empty());
11722 // populate unconnected_watchers
11723 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
11724 obc
->obs
.oi
.watchers
.begin();
11725 p
!= obc
->obs
.oi
.watchers
.end();
11727 utime_t expire
= info
.stats
.last_became_active
;
11728 expire
+= p
->second
.timeout_seconds
;
11729 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
11731 Watch::makeWatchRef(
11732 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
11733 p
->first
.second
, p
->second
.addr
));
11734 watch
->disconnect();
11735 obc
->watchers
.insert(
11737 make_pair(p
->first
.first
, p
->first
.second
),
11740 // Look for watchers from blocklisted clients and drop
11741 check_blocklisted_obc_watchers(obc
);
11744 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
11746 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
11747 dout(10) << "handle_watch_timeout obc " << *obc
<< dendl
;
11749 if (!is_active()) {
11750 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
11753 if (!obc
->obs
.exists
) {
11754 dout(10) << __func__
<< " object " << obc
->obs
.oi
.soid
<< " dne" << dendl
;
11757 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
11758 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
11759 watch
->get_delayed_cb()
11761 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11762 << obc
->obs
.oi
.soid
11767 if (m_scrubber
->write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
11768 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11769 << obc
->obs
.oi
.soid
11771 m_scrubber
->add_callback(
11772 watch
->get_delayed_cb() // This callback!
11777 OpContextUPtr ctx
= simple_opc_create(obc
);
11778 ctx
->at_version
= get_next_version();
11780 object_info_t
& oi
= ctx
->new_obs
.oi
;
11781 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
11782 watch
->get_entity()));
11784 list
<watch_disconnect_t
> watch_disconnects
= {
11785 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
11787 ctx
->register_on_success(
11788 [this, obc
, watch_disconnects
]() {
11789 complete_disconnect_watches(obc
, watch_disconnects
);
11793 PGTransaction
*t
= ctx
->op_t
.get();
11794 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
11798 osd_reqid_t(), ctx
->mtime
, 0));
11800 oi
.prior_version
= obc
->obs
.oi
.version
;
11801 oi
.version
= ctx
->at_version
;
11803 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11804 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
11806 // apply new object state.
11807 ctx
->obc
->obs
= ctx
->new_obs
;
11809 // no ctx->delta_stats
11810 simple_opc_submit(std::move(ctx
));
11813 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
11814 SnapSetContext
*ssc
)
11816 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
11817 ceph_assert(obc
->destructor_callback
== NULL
);
11818 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11820 obc
->obs
.exists
= false;
11823 register_snapset_context(ssc
);
11824 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
11826 populate_obc_watchers(obc
);
11830 ObjectContextRef
PrimaryLogPG::get_object_context(
11831 const hobject_t
& soid
,
11833 const map
<string
, bufferlist
, less
<>> *attrs
)
11835 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(soid
);
11837 attrs
|| !recovery_state
.get_pg_log().get_missing().is_missing(soid
) ||
11838 // or this is a revert... see recover_primary()
11839 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() &&
11840 it_objects
->second
->op
==
11841 pg_log_entry_t::LOST_REVERT
));
11842 ObjectContextRef obc
= object_contexts
.lookup(soid
);
11843 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
11845 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
11846 dout(10) << __func__
<< ": found obc in cache: " << *obc
11849 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
11853 auto it_oi
= attrs
->find(OI_ATTR
);
11854 ceph_assert(it_oi
!= attrs
->end());
11855 bv
= it_oi
->second
;
11857 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
11860 dout(10) << __func__
<< ": no obc for soid "
11861 << soid
<< " and !can_create"
11863 return ObjectContextRef(); // -ENOENT!
11866 dout(10) << __func__
<< ": no obc for soid "
11867 << soid
<< " but can_create"
11870 object_info_t
oi(soid
);
11871 SnapSetContext
*ssc
= get_snapset_context(
11872 soid
, true, 0, false);
11874 obc
= create_object_context(oi
, ssc
);
11875 dout(10) << __func__
<< ": " << *obc
11876 << " oi: " << obc
->obs
.oi
11877 << " " << *obc
->ssc
<< dendl
;
11884 bufferlist::const_iterator bliter
= bv
.begin();
11885 decode(oi
, bliter
);
11887 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
11888 return ObjectContextRef(); // -ENOENT!
11891 ceph_assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
11893 obc
= object_contexts
.lookup_or_create(oi
.soid
);
11894 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11896 obc
->obs
.exists
= true;
11898 obc
->ssc
= get_snapset_context(
11900 soid
.has_snapset() ? attrs
: 0);
11902 if (is_primary() && is_active())
11903 populate_obc_watchers(obc
);
11905 if (pool
.info
.is_erasure()) {
11907 obc
->attr_cache
= *attrs
;
11909 int r
= pgbackend
->objects_get_attrs(
11912 ceph_assert(r
== 0);
11916 dout(10) << __func__
<< ": creating obc from disk: " << *obc
11920 // XXX: Caller doesn't expect this
11921 if (obc
->ssc
== NULL
) {
11922 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
11923 return ObjectContextRef(); // -ENOENT!
11926 dout(10) << __func__
<< ": " << *obc
11927 << " oi: " << obc
->obs
.oi
11928 << " exists: " << (int)obc
->obs
.exists
11929 << " " << *obc
->ssc
<< dendl
;
11933 void PrimaryLogPG::context_registry_on_change()
11935 pair
<hobject_t
, ObjectContextRef
> i
;
11936 while (object_contexts
.get_next(i
.first
, &i
)) {
11937 ObjectContextRef
obc(i
.second
);
11939 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11940 obc
->watchers
.begin();
11941 j
!= obc
->watchers
.end();
11942 obc
->watchers
.erase(j
++)) {
11943 j
->second
->discard();
11951 * If we return an error, and set *pmissing, then promoting that
11954 * If we return -EAGAIN, we will always set *pmissing to the missing
11955 * object to wait for.
11957 * If we return an error but do not set *pmissing, then we know the
11958 * object does not exist.
11960 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
11961 ObjectContextRef
*pobc
,
11963 bool map_snapid_to_clone
,
11964 hobject_t
*pmissing
)
11967 ceph_assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
11969 if (oid
.snap
== CEPH_NOSNAP
) {
11970 ObjectContextRef obc
= get_object_context(oid
, can_create
);
11976 dout(10) << __func__
<< " " << oid
11977 << " @" << oid
.snap
11978 << " oi=" << obc
->obs
.oi
11987 hobject_t head
= oid
.get_head();
11988 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
11989 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
11990 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
11992 *pmissing
= head
; // start by getting the head
11994 put_snapset_context(ssc
);
11998 if (map_snapid_to_clone
) {
11999 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12000 << " snapset " << ssc
->snapset
12001 << " map_snapid_to_clone=true" << dendl
;
12002 if (oid
.snap
> ssc
->snapset
.seq
) {
12003 // already must be readable
12004 ObjectContextRef obc
= get_object_context(head
, false);
12005 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12006 << " snapset " << ssc
->snapset
12007 << " maps to head" << dendl
;
12009 put_snapset_context(ssc
);
12010 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
12012 vector
<snapid_t
>::const_iterator citer
= std::find(
12013 ssc
->snapset
.clones
.begin(),
12014 ssc
->snapset
.clones
.end(),
12016 if (citer
== ssc
->snapset
.clones
.end()) {
12017 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12018 << " snapset " << ssc
->snapset
12019 << " maps to nothing" << dendl
;
12020 put_snapset_context(ssc
);
12024 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12025 << " snapset " << ssc
->snapset
12026 << " maps to " << oid
<< dendl
;
12028 if (recovery_state
.get_pg_log().get_missing().is_missing(oid
)) {
12029 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12030 << " snapset " << ssc
->snapset
12031 << " " << oid
<< " is missing" << dendl
;
12034 put_snapset_context(ssc
);
12038 ObjectContextRef obc
= get_object_context(oid
, false);
12039 if (!obc
|| !obc
->obs
.exists
) {
12040 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12041 << " snapset " << ssc
->snapset
12042 << " " << oid
<< " is not present" << dendl
;
12045 put_snapset_context(ssc
);
12048 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12049 << " snapset " << ssc
->snapset
12050 << " " << oid
<< " HIT" << dendl
;
12052 put_snapset_context(ssc
);
12055 ceph_abort(); //unreachable
12058 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12059 << " snapset " << ssc
->snapset
<< dendl
;
12062 if (oid
.snap
> ssc
->snapset
.seq
) {
12063 ObjectContextRef obc
= get_object_context(head
, false);
12064 dout(10) << __func__
<< " " << head
12065 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
12066 << " -- HIT " << obc
->obs
12071 ceph_assert(ssc
== obc
->ssc
);
12072 put_snapset_context(ssc
);
12078 // which clone would it be?
12080 while (k
< ssc
->snapset
.clones
.size() &&
12081 ssc
->snapset
.clones
[k
] < oid
.snap
)
12083 if (k
== ssc
->snapset
.clones
.size()) {
12084 dout(10) << __func__
<< " no clones with last >= oid.snap "
12085 << oid
.snap
<< " -- DNE" << dendl
;
12086 put_snapset_context(ssc
);
12089 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
12090 info
.pgid
.pool(), oid
.get_namespace());
12092 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
12093 dout(20) << __func__
<< " " << soid
<< " missing, try again later"
12097 put_snapset_context(ssc
);
12101 ObjectContextRef obc
= get_object_context(soid
, false);
12102 if (!obc
|| !obc
->obs
.exists
) {
12105 put_snapset_context(ssc
);
12106 if (is_primary()) {
12107 if (is_degraded_or_backfilling_object(soid
)) {
12108 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
12110 } else if (is_degraded_on_async_recovery_target(soid
)) {
12111 dout(20) << __func__
<< " clone is recovering " << soid
<< dendl
;
12114 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
12118 dout(20) << __func__
<< " replica missing clone" << soid
<< dendl
;
12126 ceph_assert(obc
->ssc
== ssc
);
12127 put_snapset_context(ssc
);
12132 dout(20) << __func__
<< " " << soid
12133 << " snapset " << obc
->ssc
->snapset
12135 snapid_t first
, last
;
12136 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
12137 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
12138 if (p
->second
.empty()) {
12139 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
12140 ceph_assert(!cct
->_conf
->osd_debug_verify_snaps
);
12143 if (std::find(p
->second
.begin(), p
->second
.end(), oid
.snap
) ==
12145 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
12146 << " does not contain " << oid
.snap
<< " -- DNE" << dendl
;
12149 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), oid
.snap
)) {
12150 dout(20) << __func__
<< " " << soid
<< " snap " << oid
.snap
12151 << " in removed_snaps_queue" << " -- DNE" << dendl
;
12154 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
12155 << " contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
12160 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
12163 put_snapset_context(obc
->ssc
);
12166 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
12168 object_info_t
& oi
= obc
->obs
.oi
;
12170 dout(10) << __func__
<< " " << oi
.soid
<< dendl
;
12171 ceph_assert(!oi
.soid
.is_snapdir());
12173 object_stat_sum_t stat
;
12174 stat
.num_objects
++;
12176 stat
.num_objects_dirty
++;
12177 if (oi
.is_whiteout())
12178 stat
.num_whiteouts
++;
12180 stat
.num_objects_omap
++;
12181 if (oi
.is_cache_pinned())
12182 stat
.num_objects_pinned
++;
12183 if (oi
.has_manifest())
12184 stat
.num_objects_manifest
++;
12186 if (oi
.soid
.is_snap()) {
12187 stat
.num_object_clones
++;
12190 obc
->ssc
= get_snapset_context(oi
.soid
, false);
12191 ceph_assert(obc
->ssc
);
12192 stat
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(oi
.soid
.snap
);
12194 stat
.num_bytes
+= oi
.size
;
12198 pgstat
->stats
.sum
.add(stat
);
12201 void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t
&soid
) {
12202 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
12203 if (p
!= waiting_for_blocked_object
.end()) {
12204 list
<OpRequestRef
>& ls
= p
->second
;
12205 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
12207 waiting_for_blocked_object
.erase(p
);
12211 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
12213 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
12214 if (obc
->is_blocked()) {
12215 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
12219 requeue_op_blocked_by_object(soid
);
12221 map
<hobject_t
, ObjectContextRef
>::iterator i
=
12222 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
12223 if (i
!= objects_blocked_on_snap_promotion
.end()) {
12224 ceph_assert(i
->second
== obc
);
12225 ObjectContextRef head_obc
= get_object_context(i
->first
, false);
12226 head_obc
->stop_block();
12227 // kick blocked ops (head)
12228 requeue_op_blocked_by_object(i
->first
);
12229 objects_blocked_on_snap_promotion
.erase(i
);
12232 if (obc
->requeue_scrub_on_unblock
) {
12234 obc
->requeue_scrub_on_unblock
= false;
12236 dout(20) << __func__
<< " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl
;
12238 // only requeue if we are still active: we may be unblocking
12239 // because we are resetting for a new peering interval
12241 osd
->queue_scrub_unblocking(this, is_scrub_blocking_ops());
12246 SnapSetContext
*PrimaryLogPG::get_snapset_context(
12247 const hobject_t
& oid
,
12249 const map
<string
, bufferlist
, less
<>> *attrs
,
12252 std::lock_guard
l(snapset_contexts_lock
);
12253 SnapSetContext
*ssc
;
12254 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
12255 oid
.get_snapdir());
12256 if (p
!= snapset_contexts
.end()) {
12257 if (can_create
|| p
->second
->exists
) {
12266 if (!(oid
.is_head() && !oid_existed
)) {
12267 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
12269 if (r
< 0 && !can_create
)
12272 auto it_ss
= attrs
->find(SS_ATTR
);
12273 ceph_assert(it_ss
!= attrs
->end());
12274 bv
= it_ss
->second
;
12276 ssc
= new SnapSetContext(oid
.get_snapdir());
12277 _register_snapset_context(ssc
);
12279 bufferlist::const_iterator bvp
= bv
.begin();
12281 ssc
->snapset
.decode(bvp
);
12282 } catch (const ceph::buffer::error
& e
) {
12283 dout(0) << __func__
<< " Can't decode snapset: " << e
.what() << dendl
;
12286 ssc
->exists
= true;
12288 ssc
->exists
= false;
12296 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
12298 std::lock_guard
l(snapset_contexts_lock
);
12300 if (ssc
->ref
== 0) {
12301 if (ssc
->registered
)
12302 snapset_contexts
.erase(ssc
->oid
);
12309 * NONE - didn't pull anything
12310 * YES - pulled what the caller wanted
12311 * HEAD - needed to pull head first
12313 enum { PULL_NONE
, PULL_HEAD
, PULL_YES
};
12315 int PrimaryLogPG::recover_missing(
12316 const hobject_t
&soid
, eversion_t v
,
12318 PGBackend::RecoveryHandle
*h
)
12320 dout(10) << __func__
<< " sar: " << scrub_after_recovery
<< dendl
;
12322 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
12323 dout(7) << __func__
<< " " << soid
12325 << " but it is unfound" << dendl
;
12329 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
12330 start_recovery_op(soid
);
12331 ceph_assert(!recovering
.count(soid
));
12332 recovering
.insert(make_pair(soid
, ObjectContextRef()));
12333 epoch_t cur_epoch
= get_osdmap_epoch();
12334 remove_missing_object(soid
, v
, new LambdaContext(
12336 std::scoped_lock locker
{*this};
12337 if (!pg_has_reset_since(cur_epoch
)) {
12338 bool object_missing
= false;
12339 for (const auto& shard
: get_acting_recovery_backfill()) {
12340 if (shard
== pg_whoami
)
12342 if (recovery_state
.get_peer_missing(shard
).is_missing(soid
)) {
12343 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
12344 object_missing
= true;
12348 if (!object_missing
) {
12349 object_stat_sum_t stat_diff
;
12350 stat_diff
.num_objects_recovered
= 1;
12351 if (scrub_after_recovery
)
12352 stat_diff
.num_objects_repaired
= 1;
12353 on_global_recover(soid
, stat_diff
, true);
12355 auto recovery_handle
= pgbackend
->open_recovery_op();
12356 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
12357 pgbackend
->run_recovery_op(recovery_handle
, priority
);
12364 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
12365 ObjectContextRef obc
;
12366 ObjectContextRef head_obc
;
12367 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
12368 // do we have the head?
12369 hobject_t head
= soid
.get_head();
12370 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
12371 if (recovering
.count(head
)) {
12372 dout(10) << " missing but already recovering head " << head
<< dendl
;
12375 int r
= recover_missing(
12376 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
, priority
,
12378 if (r
!= PULL_NONE
)
12383 head_obc
= get_object_context(
12387 ceph_assert(head_obc
);
12389 start_recovery_op(soid
);
12390 ceph_assert(!recovering
.count(soid
));
12391 recovering
.insert(make_pair(soid
, obc
));
12392 int r
= pgbackend
->recover_object(
12398 // This is only a pull which shouldn't return an error
12399 ceph_assert(r
>= 0);
12403 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
12404 eversion_t v
, Context
*on_complete
)
12406 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
12407 ceph_assert(on_complete
!= nullptr);
12409 ObjectStore::Transaction t
;
12410 remove_snap_mapped_object(t
, soid
);
12412 ObjectRecoveryInfo recovery_info
;
12413 recovery_info
.soid
= soid
;
12414 recovery_info
.version
= v
;
12416 epoch_t cur_epoch
= get_osdmap_epoch();
12417 t
.register_on_complete(new LambdaContext(
12419 std::unique_lock locker
{*this};
12420 if (!pg_has_reset_since(cur_epoch
)) {
12421 ObjectStore::Transaction t2
;
12422 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
12423 t2
.register_on_complete(on_complete
);
12424 int r
= osd
->store
->queue_transaction(ch
, std::move(t2
), nullptr);
12425 ceph_assert(r
== 0);
12429 on_complete
->complete(-EAGAIN
);
12432 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
12433 ceph_assert(r
== 0);
12436 void PrimaryLogPG::finish_degraded_object(const hobject_t oid
)
12438 dout(10) << __func__
<< " " << oid
<< dendl
;
12439 if (callbacks_for_degraded_object
.count(oid
)) {
12440 list
<Context
*> contexts
;
12441 contexts
.swap(callbacks_for_degraded_object
[oid
]);
12442 callbacks_for_degraded_object
.erase(oid
);
12443 for (list
<Context
*>::iterator i
= contexts
.begin();
12444 i
!= contexts
.end();
12449 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
12451 if (i
!= objects_blocked_on_degraded_snap
.end() &&
12452 i
->second
== oid
.snap
)
12453 objects_blocked_on_degraded_snap
.erase(i
);
12456 void PrimaryLogPG::_committed_pushed_object(
12457 epoch_t epoch
, eversion_t last_complete
)
12459 std::scoped_lock locker
{*this};
12460 if (!pg_has_reset_since(epoch
)) {
12461 recovery_state
.recovery_committed_to(last_complete
);
12463 dout(10) << __func__
12464 << " pg has changed, not touching last_complete_ondisk" << dendl
;
12468 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
12470 dout(20) << __func__
<< dendl
;
12472 dout(20) << "obc = " << *obc
<< dendl
;
12474 ceph_assert(active_pushes
>= 1);
12477 // requeue an active chunky scrub waiting on recovery ops
12478 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12479 is_scrub_active()) {
12481 osd
->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12485 void PrimaryLogPG::_applied_recovered_object_replica()
12487 dout(20) << __func__
<< dendl
;
12488 ceph_assert(active_pushes
>= 1);
12491 // requeue an active scrub waiting on recovery ops
12492 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12493 is_scrub_active()) {
12495 osd
->queue_scrub_replica_pushes(this, m_scrubber
->replica_op_priority());
12499 void PrimaryLogPG::on_failed_pull(
12500 const set
<pg_shard_t
> &from
,
12501 const hobject_t
&soid
,
12502 const eversion_t
&v
)
12504 dout(20) << __func__
<< ": " << soid
<< dendl
;
12505 ceph_assert(recovering
.count(soid
));
12506 auto obc
= recovering
[soid
];
12508 list
<OpRequestRef
> blocked_ops
;
12509 obc
->drop_recovery_read(&blocked_ops
);
12510 requeue_ops(blocked_ops
);
12512 recovering
.erase(soid
);
12513 for (auto&& i
: from
) {
12514 if (i
!= pg_whoami
) { // we'll get it below in primary_error
12515 recovery_state
.force_object_missing(i
, soid
, v
);
12519 dout(0) << __func__
<< " " << soid
<< " from shard " << from
12520 << ", reps on " << recovery_state
.get_missing_loc().get_locations(soid
)
12521 << " unfound? " << recovery_state
.get_missing_loc().is_unfound(soid
)
12523 finish_recovery_op(soid
); // close out this attempt,
12524 finish_degraded_object(soid
);
12526 if (from
.count(pg_whoami
)) {
12527 dout(0) << " primary missing oid " << soid
<< " version " << v
<< dendl
;
12528 primary_error(soid
, v
);
12529 backfills_in_flight
.erase(soid
);
12533 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
12536 pg_missing_item pmi
;
12537 bool is_missing
= recovery_state
.get_pg_log().get_missing().is_missing(oid
, &pmi
);
12538 ceph_assert(is_missing
);
12540 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
12542 ceph_assert(!get_acting_recovery_backfill().empty());
12543 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
12544 i
!= get_acting_recovery_backfill().end();
12546 if (*i
== get_primary()) continue;
12547 pg_shard_t peer
= *i
;
12548 if (!recovery_state
.get_peer_missing(peer
).is_missing(oid
)) {
12551 eversion_t h
= recovery_state
.get_peer_missing(peer
).get_items().at(oid
).have
;
12552 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
12557 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
12561 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
12563 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
12565 ceph_assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
12566 ObjectStore::Transaction t
;
12567 std::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
12568 if (m
->pg_trim_to
!= eversion_t())
12569 op_trim_to
= m
->pg_trim_to
;
12570 if (m
->pg_roll_forward_to
!= eversion_t())
12571 op_roll_forward_to
= m
->pg_roll_forward_to
;
12573 dout(20) << __func__
12574 << " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
12576 recovery_state
.append_log_entries_update_missing(
12577 m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
12578 eversion_t new_lcod
= info
.last_complete
;
12580 Context
*complete
= new LambdaContext(
12582 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
12584 std::scoped_lock locker
{*this};
12585 if (!pg_has_reset_since(msg
->get_epoch())) {
12586 update_last_complete_ondisk(new_lcod
);
12587 MOSDPGUpdateLogMissingReply
*reply
=
12588 new MOSDPGUpdateLogMissingReply(
12589 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
12595 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
12596 msg
->get_connection()->send_message(reply
);
12600 if (get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
12601 t
.register_on_commit(complete
);
12603 /* Hack to work around the fact that ReplicatedBackend sends
12604 * ack+commit if commit happens first
12606 * This behavior is no longer necessary, but we preserve it so old
12607 * primaries can keep their repops in order */
12608 if (pool
.info
.is_erasure()) {
12609 t
.register_on_complete(complete
);
12611 t
.register_on_commit(complete
);
12614 int tr
= osd
->store
->queue_transaction(
12618 ceph_assert(tr
== 0);
12619 op_applied(info
.last_update
);
12622 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
12624 const MOSDPGUpdateLogMissingReply
*m
=
12625 static_cast<const MOSDPGUpdateLogMissingReply
*>(
12627 dout(20) << __func__
<< " got reply from "
12628 << m
->get_from() << dendl
;
12630 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
12631 if (it
!= log_entry_update_waiting_on
.end()) {
12632 if (it
->second
.waiting_on
.count(m
->get_from())) {
12633 it
->second
.waiting_on
.erase(m
->get_from());
12634 if (m
->last_complete_ondisk
!= eversion_t()) {
12635 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
12639 << info
.pgid
<< " got reply "
12640 << *m
<< " from shard we are not waiting for "
12644 if (it
->second
.waiting_on
.empty()) {
12645 repop_all_committed(it
->second
.repop
.get());
12646 log_entry_update_waiting_on
.erase(it
);
12650 << info
.pgid
<< " got reply "
12651 << *m
<< " on unknown tid " << m
->get_tid();
12655 /* Mark all unfound objects as lost.
12657 void PrimaryLogPG::mark_all_unfound_lost(
12659 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
12661 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
12662 list
<hobject_t
> oids
;
12664 dout(30) << __func__
<< ": log before:\n";
12665 recovery_state
.get_pg_log().get_log().print(*_dout
);
12668 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
12670 utime_t mtime
= ceph_clock_now();
12671 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
12672 recovery_state
.get_missing_loc().get_needs_recovery().begin();
12673 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
12674 recovery_state
.get_missing_loc().get_needs_recovery().end();
12676 ObcLockManager manager
;
12677 eversion_t v
= get_next_version();
12678 v
.epoch
= get_osdmap_epoch();
12679 uint64_t num_unfound
= recovery_state
.get_missing_loc().num_unfound();
12680 while (m
!= mend
) {
12681 const hobject_t
&oid(m
->first
);
12682 if (!recovery_state
.get_missing_loc().is_unfound(oid
)) {
12683 // We only care about unfound objects
12688 ObjectContextRef obc
;
12692 case pg_log_entry_t::LOST_MARK
:
12693 ceph_abort_msg("actually, not implemented yet!");
12696 case pg_log_entry_t::LOST_REVERT
:
12697 prev
= pick_newest_available(oid
);
12698 if (prev
> eversion_t()) {
12701 pg_log_entry_t::LOST_REVERT
, oid
, v
,
12702 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
12703 e
.reverting_to
= prev
;
12704 e
.mark_unrollbackable();
12705 log_entries
.push_back(e
);
12706 dout(10) << e
<< dendl
;
12708 // we are now missing the new version; recovery code will sort it out.
12714 case pg_log_entry_t::LOST_DELETE
:
12716 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
12717 0, osd_reqid_t(), mtime
, 0);
12718 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
12719 if (pool
.info
.require_rollback()) {
12720 e
.mod_desc
.try_rmobject(v
.version
);
12722 e
.mark_unrollbackable();
12724 } // otherwise, just do what we used to do
12725 dout(10) << e
<< dendl
;
12726 log_entries
.push_back(e
);
12727 oids
.push_back(oid
);
12729 // If context found mark object as deleted in case
12730 // of racing with new creation. This can happen if
12731 // object lost and EIO at primary.
12732 obc
= object_contexts
.lookup(oid
);
12734 obc
->obs
.exists
= false;
12746 recovery_state
.update_stats(
12747 [](auto &history
, auto &stats
) {
12748 stats
.stats_invalid
= true;
12752 submit_log_entries(
12754 std::move(manager
),
12755 std::optional
<std::function
<void(void)> >(
12756 [this, oids
, num_unfound
, on_finish
]() {
12757 if (recovery_state
.perform_deletes_during_peering()) {
12758 for (auto oid
: oids
) {
12759 // clear old locations - merge_new_log_entries will have
12760 // handled rebuilding missing_loc for each of these
12761 // objects if we have the RECOVERY_DELETES flag
12762 recovery_state
.object_recovered(oid
, object_stat_sum_t());
12766 if (is_recovery_unfound()) {
12767 queue_peering_event(
12769 std::make_shared
<PGPeeringEvent
>(
12770 get_osdmap_epoch(),
12771 get_osdmap_epoch(),
12772 PeeringState::DoRecovery())));
12773 } else if (is_backfill_unfound()) {
12774 queue_peering_event(
12776 std::make_shared
<PGPeeringEvent
>(
12777 get_osdmap_epoch(),
12778 get_osdmap_epoch(),
12779 PeeringState::RequestBackfill())));
12785 ss
<< "pg has " << num_unfound
12786 << " objects unfound and apparently lost marking";
12787 string rs
= ss
.str();
12788 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
12789 osd
->clog
->info() << rs
;
12791 on_finish(0, rs
, empty
);
12796 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
12798 ceph_assert(repop_queue
.empty());
12802 * pg status change notification
12805 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
12807 list
<OpRequestRef
> rq
;
12809 // apply all repops
12810 while (!repop_queue
.empty()) {
12811 RepGather
*repop
= repop_queue
.front();
12812 repop_queue
.pop_front();
12813 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
12814 repop
->rep_aborted
= true;
12815 repop
->on_committed
.clear();
12816 repop
->on_success
.clear();
12820 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
12821 rq
.push_back(repop
->op
);
12822 repop
->op
= OpRequestRef();
12825 // also requeue any dups, interleaved into position
12826 auto p
= waiting_for_ondisk
.find(repop
->v
);
12827 if (p
!= waiting_for_ondisk
.end()) {
12828 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
12829 for (auto& i
: p
->second
) {
12830 rq
.push_back(std::get
<0>(i
));
12832 waiting_for_ondisk
.erase(p
);
12836 remove_repop(repop
);
12839 ceph_assert(repop_queue
.empty());
12843 if (!waiting_for_ondisk
.empty()) {
12844 for (auto& i
: waiting_for_ondisk
) {
12845 for (auto& j
: i
.second
) {
12846 derr
<< __func__
<< ": op " << *(std::get
<0>(j
)->get_req())
12847 << " waiting on " << i
.first
<< dendl
;
12850 ceph_assert(waiting_for_ondisk
.empty());
12854 waiting_for_ondisk
.clear();
12857 void PrimaryLogPG::on_flushed()
12859 requeue_ops(waiting_for_flush
);
12860 if (!is_peered() || !is_primary()) {
12861 pair
<hobject_t
, ObjectContextRef
> i
;
12862 while (object_contexts
.get_next(i
.first
, &i
)) {
12863 derr
<< __func__
<< ": object " << i
.first
<< " obc still alive" << dendl
;
12865 ceph_assert(object_contexts
.empty());
12869 void PrimaryLogPG::on_removal(ObjectStore::Transaction
&t
)
12871 dout(10) << __func__
<< dendl
;
12875 t
.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12878 void PrimaryLogPG::clear_async_reads()
12880 dout(10) << __func__
<< dendl
;
12881 for(auto& i
: in_progress_async_reads
) {
12882 dout(10) << "clear ctx: "
12883 << "OpRequestRef " << i
.first
12884 << " OpContext " << i
.second
12886 close_op_ctx(i
.second
);
12890 void PrimaryLogPG::clear_cache()
12892 object_contexts
.clear();
12895 void PrimaryLogPG::on_shutdown()
12897 dout(10) << __func__
<< dendl
;
12899 if (recovery_queued
) {
12900 recovery_queued
= false;
12901 osd
->clear_queued_recovery(this);
12904 m_scrubber
->scrub_clear_state();
12905 m_scrubber
->rm_from_osd_scrubbing();
12907 vector
<ceph_tid_t
> tids
;
12908 cancel_copy_ops(false, &tids
);
12909 cancel_flush_ops(false, &tids
);
12910 cancel_proxy_ops(false, &tids
);
12911 cancel_manifest_ops(false, &tids
);
12912 cancel_cls_gather_ops(false, &tids
);
12913 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12915 apply_and_flush_repops(false);
12916 cancel_log_updates();
12917 // we must remove PGRefs, so do this this prior to release_backoffs() callers
12919 // clean up snap trim references
12920 snap_trimmer_machine
.process_event(Reset());
12922 pgbackend
->on_change();
12924 context_registry_on_change();
12925 object_contexts
.clear();
12927 clear_async_reads();
12929 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
12930 osd
->local_reserver
.cancel_reservation(info
.pgid
);
12932 clear_primary_state();
12935 if (is_primary()) {
12936 osd
->clear_ready_to_merge(this);
12940 void PrimaryLogPG::on_activate_complete()
12944 if (!recovery_state
.needs_flush()) {
12945 requeue_ops(waiting_for_peered
);
12946 } else if (!waiting_for_peered
.empty()) {
12947 dout(10) << __func__
<< " flushes in progress, moving "
12948 << waiting_for_peered
.size()
12949 << " items to waiting_for_flush"
12951 ceph_assert(waiting_for_flush
.empty());
12952 waiting_for_flush
.swap(waiting_for_peered
);
12957 if (needs_recovery()) {
12958 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
12959 queue_peering_event(
12961 std::make_shared
<PGPeeringEvent
>(
12962 get_osdmap_epoch(),
12963 get_osdmap_epoch(),
12964 PeeringState::DoRecovery())));
12965 } else if (needs_backfill()) {
12966 dout(10) << "activate queueing backfill" << dendl
;
12967 queue_peering_event(
12969 std::make_shared
<PGPeeringEvent
>(
12970 get_osdmap_epoch(),
12971 get_osdmap_epoch(),
12972 PeeringState::RequestBackfill())));
12974 dout(10) << "activate all replicas clean, no recovery" << dendl
;
12975 queue_peering_event(
12977 std::make_shared
<PGPeeringEvent
>(
12978 get_osdmap_epoch(),
12979 get_osdmap_epoch(),
12980 PeeringState::AllReplicasRecovered())));
12983 publish_stats_to_osd();
12985 if (get_backfill_targets().size()) {
12986 last_backfill_started
= recovery_state
.earliest_backfill();
12987 new_backfill
= true;
12988 ceph_assert(!last_backfill_started
.is_max());
12989 dout(5) << __func__
<< ": bft=" << get_backfill_targets()
12990 << " from " << last_backfill_started
<< dendl
;
12991 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12992 i
!= get_backfill_targets().end();
12994 dout(5) << "target shard " << *i
12995 << " from " << recovery_state
.get_peer_info(*i
).last_backfill
13004 void PrimaryLogPG::on_change(ObjectStore::Transaction
&t
)
13006 dout(10) << __func__
<< dendl
;
13008 if (hit_set
&& hit_set
->insert_count() == 0) {
13009 dout(20) << " discarding empty hit_set" << dendl
;
13013 if (recovery_queued
) {
13014 recovery_queued
= false;
13015 osd
->clear_queued_recovery(this);
13018 // requeue everything in the reverse order they should be
13020 requeue_ops(waiting_for_peered
);
13021 requeue_ops(waiting_for_flush
);
13022 requeue_ops(waiting_for_active
);
13023 requeue_ops(waiting_for_readable
);
13025 vector
<ceph_tid_t
> tids
;
13026 cancel_copy_ops(is_primary(), &tids
);
13027 cancel_flush_ops(is_primary(), &tids
);
13028 cancel_proxy_ops(is_primary(), &tids
);
13029 cancel_manifest_ops(is_primary(), &tids
);
13030 cancel_cls_gather_ops(is_primary(), &tids
);
13031 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
13033 // requeue object waiters
13034 for (auto& p
: waiting_for_unreadable_object
) {
13035 release_backoffs(p
.first
);
13037 if (is_primary()) {
13038 requeue_object_waiters(waiting_for_unreadable_object
);
13040 waiting_for_unreadable_object
.clear();
13042 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
13043 p
!= waiting_for_degraded_object
.end();
13044 waiting_for_degraded_object
.erase(p
++)) {
13045 release_backoffs(p
->first
);
13047 requeue_ops(p
->second
);
13050 finish_degraded_object(p
->first
);
13053 // requeues waiting_for_scrub
13054 m_scrubber
->scrub_clear_state();
13056 for (auto p
= waiting_for_blocked_object
.begin();
13057 p
!= waiting_for_blocked_object
.end();
13058 waiting_for_blocked_object
.erase(p
++)) {
13060 requeue_ops(p
->second
);
13064 for (auto i
= callbacks_for_degraded_object
.begin();
13065 i
!= callbacks_for_degraded_object
.end();
13067 finish_degraded_object((i
++)->first
);
13069 ceph_assert(callbacks_for_degraded_object
.empty());
13071 if (is_primary()) {
13072 requeue_ops(waiting_for_cache_not_full
);
13074 waiting_for_cache_not_full
.clear();
13076 objects_blocked_on_cache_full
.clear();
13078 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
13079 in_progress_async_reads
.begin();
13080 i
!= in_progress_async_reads
.end();
13081 in_progress_async_reads
.erase(i
++)) {
13082 close_op_ctx(i
->second
);
13084 requeue_op(i
->first
);
13087 // this will requeue ops we were working on but didn't finish, and
13089 apply_and_flush_repops(is_primary());
13090 cancel_log_updates();
13092 // do this *after* apply_and_flush_repops so that we catch any newly
13093 // registered watches.
13094 context_registry_on_change();
13096 pgbackend
->on_change_cleanup(&t
);
13097 m_scrubber
->cleanup_store(&t
);
13098 pgbackend
->on_change();
13100 // clear snap_trimmer state
13101 snap_trimmer_machine
.process_event(Reset());
13103 debug_op_order
.clear();
13104 unstable_stats
.clear();
13106 // we don't want to cache object_contexts through the interval change
13107 // NOTE: we actually assert that all currently live references are dead
13108 // by the time the flush for the next interval completes.
13109 object_contexts
.clear();
13111 // should have been cleared above by finishing all of the degraded objects
13112 ceph_assert(objects_blocked_on_degraded_snap
.empty());
13115 void PrimaryLogPG::plpg_on_role_change()
13117 dout(10) << __func__
<< dendl
;
13118 if (get_role() != 0 && hit_set
) {
13119 dout(10) << " clearing hit set" << dendl
;
13124 void PrimaryLogPG::plpg_on_pool_change()
13126 dout(10) << __func__
<< dendl
;
13127 // requeue cache full waiters just in case the cache_mode is
13128 // changing away from writeback mode. note that if we are not
13129 // active the normal requeuing machinery is sufficient (and properly
13132 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13133 !waiting_for_cache_not_full
.empty()) {
13134 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
13136 requeue_ops(waiting_for_cache_not_full
);
13137 objects_blocked_on_cache_full
.clear();
13143 // clear state. called on recovery completion AND cancellation.
13144 void PrimaryLogPG::_clear_recovery_state()
13146 #ifdef DEBUG_RECOVERY_OIDS
13147 recovering_oids
.clear();
13149 dout(15) << __func__
<< " flags: " << m_planned_scrub
<< dendl
;
13151 last_backfill_started
= hobject_t();
13152 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
13153 while (i
!= backfills_in_flight
.end()) {
13154 backfills_in_flight
.erase(i
++);
13157 list
<OpRequestRef
> blocked_ops
;
13158 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
13159 i
!= recovering
.end();
13160 recovering
.erase(i
++)) {
13162 i
->second
->drop_recovery_read(&blocked_ops
);
13163 requeue_ops(blocked_ops
);
13166 ceph_assert(backfills_in_flight
.empty());
13167 pending_backfill_updates
.clear();
13168 ceph_assert(recovering
.empty());
13169 pgbackend
->clear_recovery_state();
13172 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
13174 dout(20) << __func__
<< ": " << soid
<< dendl
;
13175 ceph_assert(recovering
.count(soid
));
13176 ObjectContextRef obc
= recovering
[soid
];
13178 list
<OpRequestRef
> blocked_ops
;
13179 obc
->drop_recovery_read(&blocked_ops
);
13180 requeue_ops(blocked_ops
);
13182 recovering
.erase(soid
);
13183 finish_recovery_op(soid
);
13184 release_backoffs(soid
);
13185 if (waiting_for_degraded_object
.count(soid
)) {
13186 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
13187 requeue_ops(waiting_for_degraded_object
[soid
]);
13188 waiting_for_degraded_object
.erase(soid
);
13190 if (waiting_for_unreadable_object
.count(soid
)) {
13191 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
13192 requeue_ops(waiting_for_unreadable_object
[soid
]);
13193 waiting_for_unreadable_object
.erase(soid
);
13195 if (is_missing_object(soid
))
13196 recovery_state
.set_last_requested(0);
13197 finish_degraded_object(soid
);
13200 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
13202 pgbackend
->check_recovery_sources(osdmap
);
13205 bool PrimaryLogPG::start_recovery_ops(
13207 ThreadPool::TPHandle
&handle
,
13208 uint64_t *ops_started
)
13210 uint64_t& started
= *ops_started
;
13212 bool work_in_progress
= false;
13213 bool recovery_started
= false;
13214 ceph_assert(is_primary());
13215 ceph_assert(is_peered());
13216 ceph_assert(!recovery_state
.is_deleting());
13218 ceph_assert(recovery_queued
);
13219 recovery_queued
= false;
13221 if (!state_test(PG_STATE_RECOVERING
) &&
13222 !state_test(PG_STATE_BACKFILLING
)) {
13223 /* TODO: I think this case is broken and will make do_recovery()
13224 * unhappy since we're returning false */
13225 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
13226 return have_unfound();
13229 const auto &missing
= recovery_state
.get_pg_log().get_missing();
13231 uint64_t num_unfound
= get_num_unfound();
13233 if (!recovery_state
.have_missing()) {
13234 recovery_state
.local_recovery_complete();
13237 if (!missing
.have_missing() || // Primary does not have missing
13238 // or all of the missing objects are unfound.
13239 recovery_state
.all_missing_unfound()) {
13240 // Recover the replicas.
13241 started
= recover_replicas(max
, handle
, &recovery_started
);
13244 // We still have missing objects that we should grab from replicas.
13245 started
+= recover_primary(max
, handle
);
13247 if (!started
&& num_unfound
!= get_num_unfound()) {
13248 // second chance to recovery replicas
13249 started
= recover_replicas(max
, handle
, &recovery_started
);
13252 if (started
|| recovery_started
)
13253 work_in_progress
= true;
13255 bool deferred_backfill
= false;
13256 if (recovering
.empty() &&
13257 state_test(PG_STATE_BACKFILLING
) &&
13258 !get_backfill_targets().empty() && started
< max
&&
13259 missing
.num_missing() == 0 &&
13260 waiting_on_backfill
.empty()) {
13261 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
13262 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
13263 deferred_backfill
= true;
13264 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
13266 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
13267 deferred_backfill
= true;
13268 } else if (!recovery_state
.is_backfill_reserved()) {
13269 /* DNMNOTE I think this branch is dead */
13270 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
13271 if (!backfill_reserving
) {
13272 dout(10) << "queueing RequestBackfill" << dendl
;
13273 backfill_reserving
= true;
13274 queue_peering_event(
13276 std::make_shared
<PGPeeringEvent
>(
13277 get_osdmap_epoch(),
13278 get_osdmap_epoch(),
13279 PeeringState::RequestBackfill())));
13281 deferred_backfill
= true;
13283 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
13287 dout(10) << " started " << started
<< dendl
;
13288 osd
->logger
->inc(l_osd_rop
, started
);
13290 if (!recovering
.empty() ||
13291 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
13292 return !work_in_progress
&& have_unfound();
13294 ceph_assert(recovering
.empty());
13295 ceph_assert(recovery_ops_active
== 0);
13297 dout(10) << __func__
<< " needs_recovery: "
13298 << recovery_state
.get_missing_loc().get_needs_recovery()
13300 dout(10) << __func__
<< " missing_loc: "
13301 << recovery_state
.get_missing_loc().get_missing_locs()
13303 int unfound
= get_num_unfound();
13305 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
13309 if (missing
.num_missing() > 0) {
13310 // this shouldn't happen!
13311 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
13312 << missing
.num_missing() << ": " << missing
.get_items();
13316 if (needs_recovery()) {
13317 // this shouldn't happen!
13318 // We already checked num_missing() so we must have missing replicas
13319 osd
->clog
->error() << info
.pgid
13320 << " Unexpected Error: recovery ending with missing replicas";
13324 if (state_test(PG_STATE_RECOVERING
)) {
13325 state_clear(PG_STATE_RECOVERING
);
13326 state_clear(PG_STATE_FORCED_RECOVERY
);
13327 if (needs_backfill()) {
13328 dout(10) << "recovery done, queuing backfill" << dendl
;
13329 queue_peering_event(
13331 std::make_shared
<PGPeeringEvent
>(
13332 get_osdmap_epoch(),
13333 get_osdmap_epoch(),
13334 PeeringState::RequestBackfill())));
13336 dout(10) << "recovery done, no backfill" << dendl
;
13337 state_clear(PG_STATE_FORCED_BACKFILL
);
13338 queue_peering_event(
13340 std::make_shared
<PGPeeringEvent
>(
13341 get_osdmap_epoch(),
13342 get_osdmap_epoch(),
13343 PeeringState::AllReplicasRecovered())));
13345 } else { // backfilling
13346 state_clear(PG_STATE_BACKFILLING
);
13347 state_clear(PG_STATE_FORCED_BACKFILL
);
13348 state_clear(PG_STATE_FORCED_RECOVERY
);
13349 dout(10) << "recovery done, backfill done" << dendl
;
13350 queue_peering_event(
13352 std::make_shared
<PGPeeringEvent
>(
13353 get_osdmap_epoch(),
13354 get_osdmap_epoch(),
13355 PeeringState::Backfilled())));
13362 * do one recovery op.
13363 * return true if done, false if nothing left to do.
13365 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
13367 ceph_assert(is_primary());
13369 const auto &missing
= recovery_state
.get_pg_log().get_missing();
13371 dout(10) << __func__
<< " recovering " << recovering
.size()
13373 << " missing " << missing
<< dendl
;
13375 dout(25) << __func__
<< " " << missing
.get_items() << dendl
;
13378 pg_log_entry_t
*latest
= 0;
13379 unsigned started
= 0;
13382 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13383 map
<version_t
, hobject_t
>::const_iterator p
=
13384 missing
.get_rmissing().lower_bound(recovery_state
.get_pg_log().get_log().last_requested
);
13385 while (p
!= missing
.get_rmissing().end()) {
13386 handle
.reset_tp_timeout();
13388 version_t v
= p
->first
;
13390 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(p
->second
);
13391 if (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end()) {
13392 latest
= it_objects
->second
;
13393 ceph_assert(latest
->is_update() || latest
->is_delete());
13394 soid
= latest
->soid
;
13399 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
13402 hobject_t head
= soid
.get_head();
13404 eversion_t need
= item
.need
;
13406 dout(10) << __func__
<< " "
13407 << soid
<< " " << item
.need
13408 << (missing
.is_missing(soid
) ? " (missing)":"")
13409 << (missing
.is_missing(head
) ? " (missing head)":"")
13410 << (recovering
.count(soid
) ? " (recovering)":"")
13411 << (recovering
.count(head
) ? " (recovering head)":"")
13415 switch (latest
->op
) {
13416 case pg_log_entry_t::CLONE
:
13418 * Handling for this special case removed for now, until we
13419 * can correctly construct an accurate SnapSet from the old
13424 case pg_log_entry_t::LOST_REVERT
:
13426 if (item
.have
== latest
->reverting_to
) {
13427 ObjectContextRef obc
= get_object_context(soid
, true);
13429 if (obc
->obs
.oi
.version
== latest
->version
) {
13430 // I'm already reverting
13431 dout(10) << " already reverting " << soid
<< dendl
;
13433 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
13434 obc
->obs
.oi
.version
= latest
->version
;
13436 ObjectStore::Transaction t
;
13438 obc
->obs
.oi
.encode(
13440 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
13441 ceph_assert(!pool
.info
.require_rollback());
13442 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
13444 recovery_state
.recover_got(
13452 t
.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
13453 t
.register_on_commit(new C_OSD_CommittedPushedObject(
13455 get_osdmap_epoch(),
13456 info
.last_complete
));
13457 osd
->store
->queue_transaction(ch
, std::move(t
));
13462 * Pull the old version of the object. Update missing_loc here to have the location
13463 * of the version we want.
13465 * This doesn't use the usual missing_loc paths, but that's okay:
13466 * - if we have it locally, we hit the case above, and go from there.
13467 * - if we don't, we always pass through this case during recovery and set up the location
13469 * - this way we don't need to mangle the missing code to be general about needing an old
13472 eversion_t alternate_need
= latest
->reverting_to
;
13473 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
13475 set
<pg_shard_t
> good_peers
;
13476 for (auto p
= recovery_state
.get_peer_missing().begin();
13477 p
!= recovery_state
.get_peer_missing().end();
13479 if (p
->second
.is_missing(soid
, need
) &&
13480 p
->second
.get_items().at(soid
).have
== alternate_need
) {
13481 good_peers
.insert(p
->first
);
13484 recovery_state
.set_revert_with_targets(
13487 dout(10) << " will pull " << alternate_need
<< " or " << need
13489 << recovery_state
.get_missing_loc().get_locations(soid
)
13497 if (!recovering
.count(soid
)) {
13498 if (recovering
.count(head
)) {
13501 int r
= recover_missing(
13502 soid
, need
, recovery_state
.get_recovery_op_priority(), h
);
13515 if (started
>= max
)
13520 // only advance last_requested if we haven't skipped anything
13522 recovery_state
.set_last_requested(v
);
13525 pgbackend
->run_recovery_op(h
, recovery_state
.get_recovery_op_priority());
13529 bool PrimaryLogPG::primary_error(
13530 const hobject_t
& soid
, eversion_t v
)
13532 recovery_state
.force_object_missing(pg_whoami
, soid
, v
);
13533 bool uhoh
= recovery_state
.get_missing_loc().is_unfound(soid
);
13535 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13536 << soid
<< ", unfound";
13538 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13540 << ", will try copies on "
13541 << recovery_state
.get_missing_loc().get_locations(soid
);
13545 int PrimaryLogPG::prep_object_replica_deletes(
13546 const hobject_t
& soid
, eversion_t v
,
13547 PGBackend::RecoveryHandle
*h
,
13548 bool *work_started
)
13550 ceph_assert(is_primary());
13551 dout(10) << __func__
<< ": on " << soid
<< dendl
;
13553 ObjectContextRef obc
= get_object_context(soid
, false);
13555 if (!obc
->get_recovery_read()) {
13556 dout(20) << "replica delete delayed on " << soid
13557 << "; could not get rw_manager lock" << dendl
;
13558 *work_started
= true;
13561 dout(20) << "replica delete got recovery read lock on " << soid
13566 start_recovery_op(soid
);
13567 ceph_assert(!recovering
.count(soid
));
13569 recovering
.insert(make_pair(soid
, ObjectContextRef()));
13571 recovering
.insert(make_pair(soid
, obc
));
13573 pgbackend
->recover_delete_object(soid
, v
, h
);
13577 int PrimaryLogPG::prep_object_replica_pushes(
13578 const hobject_t
& soid
, eversion_t v
,
13579 PGBackend::RecoveryHandle
*h
,
13580 bool *work_started
)
13582 ceph_assert(is_primary());
13583 dout(10) << __func__
<< ": on " << soid
<< dendl
;
13585 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
13586 // do we have the head and/or snapdir?
13587 hobject_t head
= soid
.get_head();
13588 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
13589 if (recovering
.count(head
)) {
13590 dout(10) << " missing but already recovering head " << head
<< dendl
;
13593 int r
= recover_missing(
13594 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
,
13595 recovery_state
.get_recovery_op_priority(), h
);
13596 if (r
!= PULL_NONE
)
13603 // NOTE: we know we will get a valid oloc off of disk here.
13604 ObjectContextRef obc
= get_object_context(soid
, false);
13606 primary_error(soid
, v
);
13610 if (!obc
->get_recovery_read()) {
13611 dout(20) << "recovery delayed on " << soid
13612 << "; could not get rw_manager lock" << dendl
;
13613 *work_started
= true;
13616 dout(20) << "recovery got recovery read lock on " << soid
13620 start_recovery_op(soid
);
13621 ceph_assert(!recovering
.count(soid
));
13622 recovering
.insert(make_pair(soid
, obc
));
13624 int r
= pgbackend
->recover_object(
13627 ObjectContextRef(),
13628 obc
, // has snapset context
13631 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
13632 on_failed_pull({ pg_whoami
}, soid
, v
);
13638 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
,
13639 bool *work_started
)
13641 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
13642 uint64_t started
= 0;
13644 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13646 // this is FAR from an optimal recovery order. pretty lame, really.
13647 ceph_assert(!get_acting_recovery_backfill().empty());
13648 // choose replicas to recover, replica has the shortest missing list first
13649 // so we can bring it back to normal ASAP
13650 std::vector
<std::pair
<unsigned int, pg_shard_t
>> replicas_by_num_missing
,
13651 async_by_num_missing
;
13652 replicas_by_num_missing
.reserve(get_acting_recovery_backfill().size() - 1);
13653 for (auto &p
: get_acting_recovery_backfill()) {
13654 if (p
== get_primary()) {
13657 auto pm
= recovery_state
.get_peer_missing().find(p
);
13658 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13659 auto nm
= pm
->second
.num_missing();
13661 if (is_async_recovery_target(p
)) {
13662 async_by_num_missing
.push_back(make_pair(nm
, p
));
13664 replicas_by_num_missing
.push_back(make_pair(nm
, p
));
13668 // sort by number of missing objects, in ascending order.
13669 auto func
= [](const std::pair
<unsigned int, pg_shard_t
> &lhs
,
13670 const std::pair
<unsigned int, pg_shard_t
> &rhs
) {
13671 return lhs
.first
< rhs
.first
;
13673 // acting goes first
13674 std::sort(replicas_by_num_missing
.begin(), replicas_by_num_missing
.end(), func
);
13675 // then async_recovery_targets
13676 std::sort(async_by_num_missing
.begin(), async_by_num_missing
.end(), func
);
13677 replicas_by_num_missing
.insert(replicas_by_num_missing
.end(),
13678 async_by_num_missing
.begin(), async_by_num_missing
.end());
13679 for (auto &replica
: replicas_by_num_missing
) {
13680 pg_shard_t
&peer
= replica
.second
;
13681 ceph_assert(peer
!= get_primary());
13682 auto pm
= recovery_state
.get_peer_missing().find(peer
);
13683 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13684 size_t m_sz
= pm
->second
.num_missing();
13686 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
13687 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
13690 const pg_missing_t
&m(pm
->second
);
13691 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
13692 p
!= m
.get_rmissing().end() && started
< max
;
13694 handle
.reset_tp_timeout();
13695 const hobject_t
soid(p
->second
);
13697 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
13698 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
13702 const pg_info_t
&pi
= recovery_state
.get_peer_info(peer
);
13703 if (soid
> pi
.last_backfill
) {
13704 if (!recovering
.count(soid
)) {
13705 derr
<< __func__
<< ": object " << soid
<< " last_backfill "
13706 << pi
.last_backfill
<< dendl
;
13707 derr
<< __func__
<< ": object added to missing set for backfill, but "
13708 << "is not in recovering, error!" << dendl
;
13714 if (recovering
.count(soid
)) {
13715 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
13719 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
13720 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
13721 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13722 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
, work_started
);
13726 if (soid
.is_snap() &&
13727 recovery_state
.get_pg_log().get_missing().is_missing(
13728 soid
.get_head())) {
13729 dout(10) << __func__
<< ": " << soid
.get_head()
13730 << " still missing on primary" << dendl
;
13734 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
13735 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
13739 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
13740 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13741 started
+= prep_object_replica_pushes(soid
, r
->second
.need
, h
, work_started
);
13745 pgbackend
->run_recovery_op(h
, recovery_state
.get_recovery_op_priority());
13749 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
13751 hobject_t e
= hobject_t::get_max();
13752 for (const pg_shard_t
& peer
: get_backfill_targets()) {
13753 const auto iter
= peer_backfill_info
.find(peer
);
13754 ceph_assert(iter
!= peer_backfill_info
.end());
13755 e
= std::min(e
, iter
->second
.begin
);
13760 bool PrimaryLogPG::all_peer_done() const
13762 // Primary hasn't got any more objects
13763 ceph_assert(backfill_info
.empty());
13765 for (const pg_shard_t
& bt
: get_backfill_targets()) {
13766 const auto piter
= peer_backfill_info
.find(bt
);
13767 ceph_assert(piter
!= peer_backfill_info
.end());
13768 const BackfillInterval
& pbi
= piter
->second
;
13769 // See if peer has more to process
13770 if (!pbi
.extends_to_end() || !pbi
.empty())
13781 * backfilled: fully pushed to replica or present in replica's missing set (both
13782 * our copy and theirs).
13784 * All objects on a backfill_target in
13785 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13786 * objects have been actually deleted and all logically-valid objects are replicated.
13787 * There may be PG objects in this interval yet to be backfilled.
13789 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13790 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13792 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13793 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13794 * interval remain on the backfill target.
13796 * For a backfill target, all objects <= peer_info[target].last_backfill
13797 * have been backfilled to target
13799 * There *MAY* be missing/outdated objects between last_backfill_started and
13800 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13801 * io created objects since the last scan. For this reason, we call
13802 * update_range() again before continuing backfill.
13804 uint64_t PrimaryLogPG::recover_backfill(
13806 ThreadPool::TPHandle
&handle
, bool *work_started
)
13808 dout(10) << __func__
<< " (" << max
<< ")"
13809 << " bft=" << get_backfill_targets()
13810 << " last_backfill_started " << last_backfill_started
13811 << (new_backfill
? " new_backfill":"")
13813 ceph_assert(!get_backfill_targets().empty());
13815 // Initialize from prior backfill state
13816 if (new_backfill
) {
13817 // on_activate() was called prior to getting here
13818 ceph_assert(last_backfill_started
== recovery_state
.earliest_backfill());
13819 new_backfill
= false;
13821 // initialize BackfillIntervals
13822 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13823 i
!= get_backfill_targets().end();
13825 peer_backfill_info
[*i
].reset(
13826 recovery_state
.get_peer_info(*i
).last_backfill
);
13828 backfill_info
.reset(last_backfill_started
);
13830 backfills_in_flight
.clear();
13831 pending_backfill_updates
.clear();
13834 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13835 i
!= get_backfill_targets().end();
13837 dout(10) << "peer osd." << *i
13838 << " info " << recovery_state
.get_peer_info(*i
)
13839 << " interval " << peer_backfill_info
[*i
].begin
13840 << "-" << peer_backfill_info
[*i
].end
13841 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
13845 // update our local interval to cope with recent changes
13846 backfill_info
.begin
= last_backfill_started
;
13847 update_range(&backfill_info
, handle
);
13850 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
13851 set
<hobject_t
> add_to_stat
;
13853 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13854 i
!= get_backfill_targets().end();
13856 peer_backfill_info
[*i
].trim_to(
13858 recovery_state
.get_peer_info(*i
).last_backfill
,
13859 last_backfill_started
));
13861 backfill_info
.trim_to(last_backfill_started
);
13863 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13864 while (ops
< max
) {
13865 if (backfill_info
.begin
<= earliest_peer_backfill() &&
13866 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
13867 hobject_t next
= backfill_info
.end
;
13868 backfill_info
.reset(next
);
13869 backfill_info
.end
= hobject_t::get_max();
13870 update_range(&backfill_info
, handle
);
13871 backfill_info
.trim();
13874 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
13876 bool sent_scan
= false;
13877 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13878 i
!= get_backfill_targets().end();
13880 pg_shard_t bt
= *i
;
13881 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13883 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
13884 if (pbi
.begin
<= backfill_info
.begin
&&
13885 !pbi
.extends_to_end() && pbi
.empty()) {
13886 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
13887 epoch_t e
= get_osdmap_epoch();
13888 MOSDPGScan
*m
= new MOSDPGScan(
13889 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, get_last_peering_reset(),
13890 spg_t(info
.pgid
.pgid
, bt
.shard
),
13891 pbi
.end
, hobject_t());
13893 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
13894 /* This guard preserves legacy WeightedPriorityQueue behavior for
13895 * now, but should be removed after Reef */
13896 m
->set_priority(recovery_state
.get_recovery_op_priority());
13898 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13899 ceph_assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
13900 waiting_on_backfill
.insert(bt
);
13905 // Count simultaneous scans as a single op and let those complete
13908 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13912 if (backfill_info
.empty() && all_peer_done()) {
13913 dout(10) << " reached end for both local and all peers" << dendl
;
13917 // Get object within set of peers to operate on and
13918 // the set of targets for which that object applies.
13919 hobject_t check
= earliest_peer_backfill();
13921 if (check
< backfill_info
.begin
) {
13923 set
<pg_shard_t
> check_targets
;
13924 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13925 i
!= get_backfill_targets().end();
13927 pg_shard_t bt
= *i
;
13928 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13929 if (pbi
.begin
== check
)
13930 check_targets
.insert(bt
);
13932 ceph_assert(!check_targets
.empty());
13934 dout(20) << " BACKFILL removing " << check
13935 << " from peers " << check_targets
<< dendl
;
13936 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
13937 i
!= check_targets
.end();
13939 pg_shard_t bt
= *i
;
13940 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13941 ceph_assert(pbi
.begin
== check
);
13943 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
13947 last_backfill_started
= check
;
13949 // Don't increment ops here because deletions
13950 // are cheap and not replied to unlike real recovery_ops,
13951 // and we can't increment ops without requeueing ourself
13954 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
13956 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
13957 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13958 i
!= get_backfill_targets().end();
13960 pg_shard_t bt
= *i
;
13961 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13962 // Find all check peers that have the wrong version
13963 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
13964 if (pbi
.objects
.begin()->second
!= obj_v
) {
13965 need_ver_targs
.push_back(bt
);
13967 keep_ver_targs
.push_back(bt
);
13970 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13972 // Only include peers that we've caught up to their backfill line
13973 // otherwise, they only appear to be missing this object
13974 // because their pbi.begin > backfill_info.begin.
13975 if (backfill_info
.begin
> pinfo
.last_backfill
)
13976 missing_targs
.push_back(bt
);
13978 skip_targs
.push_back(bt
);
13982 if (!keep_ver_targs
.empty()) {
13983 // These peers have version obj_v
13984 dout(20) << " BACKFILL keeping " << check
13985 << " with ver " << obj_v
13986 << " on peers " << keep_ver_targs
<< dendl
;
13987 //assert(!waiting_for_degraded_object.count(check));
13989 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
13990 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
13992 if (obc
->get_recovery_read()) {
13993 if (!need_ver_targs
.empty()) {
13994 dout(20) << " BACKFILL replacing " << check
13995 << " with ver " << obj_v
13996 << " to peers " << need_ver_targs
<< dendl
;
13998 if (!missing_targs
.empty()) {
13999 dout(20) << " BACKFILL pushing " << backfill_info
.begin
14000 << " with ver " << obj_v
14001 << " to peers " << missing_targs
<< dendl
;
14003 vector
<pg_shard_t
> all_push
= need_ver_targs
;
14004 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
14006 handle
.reset_tp_timeout();
14007 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
14009 *work_started
= true;
14010 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
14015 *work_started
= true;
14016 dout(20) << "backfill blocking on " << backfill_info
.begin
14017 << "; could not get rw_manager lock" << dendl
;
14021 dout(20) << "need_ver_targs=" << need_ver_targs
14022 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
14023 dout(20) << "backfill_targets=" << get_backfill_targets()
14024 << " missing_targs=" << missing_targs
14025 << " skip_targs=" << skip_targs
<< dendl
;
14027 last_backfill_started
= backfill_info
.begin
;
14028 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
14029 backfill_info
.pop_front();
14030 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
14031 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
14032 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
14033 i
!= check_targets
.end();
14035 pg_shard_t bt
= *i
;
14036 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
14042 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
14043 i
!= add_to_stat
.end();
14045 ObjectContextRef obc
= get_object_context(*i
, false);
14048 add_object_context_to_pg_stat(obc
, &stat
);
14049 pending_backfill_updates
[*i
] = stat
;
14051 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
14052 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
14053 handle
.reset_tp_timeout();
14054 const hobject_t
& oid
= to_remove
[i
].get
<0>();
14055 eversion_t v
= to_remove
[i
].get
<1>();
14056 pg_shard_t peer
= to_remove
[i
].get
<2>();
14057 MOSDPGBackfillRemove
*m
;
14058 auto it
= reqs
.find(peer
);
14059 if (it
!= reqs
.end()) {
14062 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
14063 spg_t(info
.pgid
.pgid
, peer
.shard
),
14064 get_osdmap_epoch());
14065 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
14066 /* This guard preserves legacy WeightedPriorityQueue behavior for
14067 * now, but should be removed after Reef */
14068 m
->set_priority(recovery_state
.get_recovery_op_priority());
14071 m
->ls
.push_back(make_pair(oid
, v
));
14073 if (oid
<= last_backfill_started
)
14074 pending_backfill_updates
[oid
]; // add empty stat!
14076 for (auto p
: reqs
) {
14077 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
14078 get_osdmap_epoch());
14081 pgbackend
->run_recovery_op(h
, recovery_state
.get_recovery_op_priority());
14083 hobject_t backfill_pos
=
14084 std::min(backfill_info
.begin
, earliest_peer_backfill());
14085 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
14086 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
14087 i
!= backfills_in_flight
.end();
14089 dout(20) << *i
<< " is still in flight" << dendl
;
14092 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
14093 backfill_pos
: *(backfills_in_flight
.begin());
14094 hobject_t new_last_backfill
= recovery_state
.earliest_backfill();
14095 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
14096 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
14097 pending_backfill_updates
.begin();
14098 i
!= pending_backfill_updates
.end() &&
14099 i
->first
< next_backfill_to_complete
;
14100 pending_backfill_updates
.erase(i
++)) {
14101 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
14102 ceph_assert(i
->first
> new_last_backfill
);
14103 // carried from a previous round – if we are here, then we had to
14104 // be requeued (by e.g. on_global_recover()) and those operations
14106 recovery_state
.update_complete_backfill_object_stats(
14109 new_last_backfill
= i
->first
;
14111 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
14113 ceph_assert(!pending_backfill_updates
.empty() ||
14114 new_last_backfill
== last_backfill_started
);
14115 if (pending_backfill_updates
.empty() &&
14116 backfill_pos
.is_max()) {
14117 ceph_assert(backfills_in_flight
.empty());
14118 new_last_backfill
= backfill_pos
;
14119 last_backfill_started
= backfill_pos
;
14121 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
14123 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
14124 // all the backfill targets. Otherwise, we will move last_backfill up on
14125 // those targets need it and send OP_BACKFILL_PROGRESS to them.
14126 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
14127 i
!= get_backfill_targets().end();
14129 pg_shard_t bt
= *i
;
14130 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
14132 if (new_last_backfill
> pinfo
.last_backfill
) {
14133 recovery_state
.update_peer_last_backfill(bt
, new_last_backfill
);
14134 epoch_t e
= get_osdmap_epoch();
14135 MOSDPGBackfill
*m
= NULL
;
14136 if (pinfo
.last_backfill
.is_max()) {
14137 m
= new MOSDPGBackfill(
14138 MOSDPGBackfill::OP_BACKFILL_FINISH
,
14140 get_last_peering_reset(),
14141 spg_t(info
.pgid
.pgid
, bt
.shard
));
14142 // Use default priority here, must match sub_op priority
14143 start_recovery_op(hobject_t::get_max());
14145 m
= new MOSDPGBackfill(
14146 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
14148 get_last_peering_reset(),
14149 spg_t(info
.pgid
.pgid
, bt
.shard
));
14150 // Use default priority here, must match sub_op priority
14152 m
->last_backfill
= pinfo
.last_backfill
;
14153 m
->stats
= pinfo
.stats
;
14155 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
14156 /* This guard preserves legacy WeightedPriorityQueue behavior for
14157 * now, but should be removed after Reef */
14158 m
->set_priority(recovery_state
.get_recovery_op_priority());
14161 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
14162 dout(10) << " peer " << bt
14163 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
14164 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
14169 *work_started
= true;
14173 int PrimaryLogPG::prep_backfill_object_push(
14174 hobject_t oid
, eversion_t v
,
14175 ObjectContextRef obc
,
14176 vector
<pg_shard_t
> peers
,
14177 PGBackend::RecoveryHandle
*h
)
14179 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
14180 ceph_assert(!peers
.empty());
14182 backfills_in_flight
.insert(oid
);
14183 recovery_state
.prepare_backfill_for_missing(oid
, v
, peers
);
14185 ceph_assert(!recovering
.count(oid
));
14187 start_recovery_op(oid
);
14188 recovering
.insert(make_pair(oid
, obc
));
14190 int r
= pgbackend
->recover_object(
14193 ObjectContextRef(),
14197 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
14198 on_failed_pull({ pg_whoami
}, oid
, v
);
14203 void PrimaryLogPG::update_range(
14204 BackfillInterval
*bi
,
14205 ThreadPool::TPHandle
&handle
)
14207 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
14208 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
14210 if (bi
->version
< info
.log_tail
) {
14211 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
14213 bi
->version
= info
.last_update
;
14214 scan_range(local_min
, local_max
, bi
, handle
);
14217 if (bi
->version
>= projected_last_update
) {
14218 dout(10) << __func__
<< ": bi is current " << dendl
;
14219 ceph_assert(bi
->version
== projected_last_update
);
14220 } else if (bi
->version
>= info
.log_tail
) {
14221 if (recovery_state
.get_pg_log().get_log().empty() && projected_log
.empty()) {
14222 /* Because we don't move log_tail on split, the log might be
14223 * empty even if log_tail != last_update. However, the only
14224 * way to get here with an empty log is if log_tail is actually
14225 * eversion_t(), because otherwise the entry which changed
14226 * last_update since the last scan would have to be present.
14228 ceph_assert(bi
->version
== eversion_t());
14232 dout(10) << __func__
<< ": bi is old, (" << bi
->version
14233 << ") can be updated with log to projected_last_update "
14234 << projected_last_update
<< dendl
;
14236 auto func
= [&](const pg_log_entry_t
&e
) {
14237 dout(10) << __func__
<< ": updating from version " << e
.version
14239 const hobject_t
&soid
= e
.soid
;
14240 if (soid
>= bi
->begin
&&
14242 if (e
.is_update()) {
14243 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
14244 << e
.version
<< dendl
;
14245 bi
->objects
.erase(e
.soid
);
14246 bi
->objects
.insert(
14250 } else if (e
.is_delete()) {
14251 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
14252 bi
->objects
.erase(e
.soid
);
14256 dout(10) << "scanning pg log first" << dendl
;
14257 recovery_state
.get_pg_log().get_log().scan_log_after(bi
->version
, func
);
14258 dout(10) << "scanning projected log" << dendl
;
14259 projected_log
.scan_log_after(bi
->version
, func
);
14260 bi
->version
= projected_last_update
;
14262 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
14266 void PrimaryLogPG::scan_range(
14267 int min
, int max
, BackfillInterval
*bi
,
14268 ThreadPool::TPHandle
&handle
)
14270 ceph_assert(is_locked());
14271 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
14272 bi
->clear_objects();
14274 vector
<hobject_t
> ls
;
14276 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
14277 ceph_assert(r
>= 0);
14278 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
14279 dout(20) << ls
<< dendl
;
14281 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
14282 handle
.reset_tp_timeout();
14283 ObjectContextRef obc
;
14285 obc
= object_contexts
.lookup(*p
);
14287 if (!obc
->obs
.exists
) {
14288 /* If the object does not exist here, it must have been removed
14289 * between the collection_list_partial and here. This can happen
14290 * for the first item in the range, which is usually last_backfill.
14294 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
14295 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
14298 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
14299 /* If the object does not exist here, it must have been removed
14300 * between the collection_list_partial and here. This can happen
14301 * for the first item in the range, which is usually last_backfill.
14306 ceph_assert(r
>= 0);
14307 object_info_t
oi(bl
);
14308 bi
->objects
[*p
] = oi
.version
;
14309 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
14317 * verifies that stray objects have been deleted
14319 void PrimaryLogPG::check_local()
14321 dout(10) << __func__
<< dendl
;
14324 info
.last_update
>=
14325 recovery_state
.get_pg_log().get_tail()); // otherwise we need some help!
14327 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
14330 // just scan the log.
14331 set
<hobject_t
> did
;
14332 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= recovery_state
.get_pg_log().get_log().log
.rbegin();
14333 p
!= recovery_state
.get_pg_log().get_log().log
.rend();
14335 if (did
.count(p
->soid
))
14337 did
.insert(p
->soid
);
14339 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
14340 dout(10) << " checking " << p
->soid
14341 << " at " << p
->version
<< dendl
;
14343 int r
= osd
->store
->stat(
14345 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
14347 if (r
!= -ENOENT
) {
14348 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
14349 << "deleted" << dendl
;
14350 ceph_abort_msg("erroneously present object");
14353 // ignore old(+missing) objects
14360 // ===========================
14363 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
14366 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
14367 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
14368 info
.pgid
.ps(), info
.pgid
.pool(),
14369 cct
->_conf
->osd_hit_set_namespace
);
14370 dout(20) << __func__
<< " " << hoid
<< dendl
;
14374 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
14379 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
14381 start
.gmtime(ss
, true /* legacy pre-octopus form */) << "_";
14382 end
.gmtime(ss
, true /* legacy pre-octopus form */);
14384 start
.localtime(ss
, true /* legacy pre-octopus form */) << "_";
14385 end
.localtime(ss
, true /* legacy pre-octopus form */);
14387 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
14388 info
.pgid
.ps(), info
.pgid
.pool(),
14389 cct
->_conf
->osd_hit_set_namespace
);
14390 dout(20) << __func__
<< " " << hoid
<< dendl
;
14394 void PrimaryLogPG::hit_set_clear()
14396 dout(20) << __func__
<< dendl
;
14398 hit_set_start_stamp
= utime_t();
14401 void PrimaryLogPG::hit_set_setup()
14403 if (!is_active() ||
14409 if (is_active() && is_primary() &&
14410 (!pool
.info
.hit_set_count
||
14411 !pool
.info
.hit_set_period
||
14412 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
14415 // only primary is allowed to remove all the hit set objects
14416 hit_set_remove_all();
14420 // FIXME: discard any previous data for now
14423 // include any writes we know about from the pg log. this doesn't
14424 // capture reads, but it is better than nothing!
14425 hit_set_apply_log();
14428 void PrimaryLogPG::hit_set_remove_all()
14430 // If any archives are degraded we skip this
14431 for (auto p
= info
.hit_set
.history
.begin();
14432 p
!= info
.hit_set
.history
.end();
14434 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14436 // Once we hit a degraded object just skip
14437 if (is_degraded_or_backfilling_object(aoid
))
14439 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14443 if (!info
.hit_set
.history
.empty()) {
14444 auto p
= info
.hit_set
.history
.rbegin();
14445 ceph_assert(p
!= info
.hit_set
.history
.rend());
14446 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14447 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14448 ObjectContextRef obc
= get_object_context(oid
, false);
14451 OpContextUPtr ctx
= simple_opc_create(obc
);
14452 ctx
->at_version
= get_next_version();
14453 ctx
->updated_hset_history
= info
.hit_set
;
14454 utime_t now
= ceph_clock_now();
14456 hit_set_trim(ctx
, 0);
14457 simple_opc_submit(std::move(ctx
));
14460 recovery_state
.update_hset(pg_hit_set_history_t());
14462 agent_state
->discard_hit_sets();
14466 void PrimaryLogPG::hit_set_create()
14468 utime_t now
= ceph_clock_now();
14469 // make a copy of the params to modify
14470 HitSet::Params
params(pool
.info
.hit_set_params
);
14472 dout(20) << __func__
<< " " << params
<< dendl
;
14473 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
14474 BloomHitSet::Params
*p
=
14475 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
14477 // convert false positive rate so it holds up across the full period
14478 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
14479 if (p
->get_fpp() <= 0.0)
14480 p
->set_fpp(.01); // fpp cannot be zero!
14482 // if we don't have specified size, estimate target size based on the
14484 if (p
->target_size
== 0 && hit_set
) {
14485 utime_t dur
= now
- hit_set_start_stamp
;
14486 unsigned unique
= hit_set
->approx_unique_insert_count();
14487 dout(20) << __func__
<< " previous set had approx " << unique
14488 << " unique items over " << dur
<< " seconds" << dendl
;
14489 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
14492 if (p
->target_size
<
14493 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
14494 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
14497 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
14498 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
14500 p
->seed
= now
.sec();
14502 dout(10) << __func__
<< " target_size " << p
->target_size
14503 << " fpp " << p
->get_fpp() << dendl
;
14505 hit_set
.reset(new HitSet(params
));
14506 hit_set_start_stamp
= now
;
14510 * apply log entries to set
14512 * this would only happen after peering, to at least capture writes
14513 * during an interval that was potentially lost.
14515 bool PrimaryLogPG::hit_set_apply_log()
14520 eversion_t to
= info
.last_update
;
14521 eversion_t from
= info
.hit_set
.current_last_update
;
14523 dout(20) << __func__
<< " no update" << dendl
;
14527 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
14528 list
<pg_log_entry_t
>::const_reverse_iterator p
=
14529 recovery_state
.get_pg_log().get_log().log
.rbegin();
14530 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> to
)
14532 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> from
) {
14533 hit_set
->insert(p
->soid
);
14540 void PrimaryLogPG::hit_set_persist()
14542 dout(10) << __func__
<< dendl
;
14544 unsigned max
= pool
.info
.hit_set_count
;
14546 utime_t now
= ceph_clock_now();
14549 // If any archives are degraded we skip this persist request
14550 // account for the additional entry being added below
14551 for (auto p
= info
.hit_set
.history
.begin();
14552 p
!= info
.hit_set
.history
.end();
14554 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14556 // Once we hit a degraded object just skip further trim
14557 if (is_degraded_or_backfilling_object(aoid
))
14559 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14563 // If backfill is in progress and we could possibly overlap with the
14564 // hit_set_* objects, back off. Since these all have
14565 // hobject_t::hash set to pgid.ps(), and those sort first, we can
14566 // look just at that. This is necessary because our transactions
14567 // may include a modify of the new hit_set *and* a delete of the
14568 // old one, and this may span the backfill boundary.
14569 for (set
<pg_shard_t
>::const_iterator p
= get_backfill_targets().begin();
14570 p
!= get_backfill_targets().end();
14572 const pg_info_t
& pi
= recovery_state
.get_peer_info(*p
);
14573 if (pi
.last_backfill
== hobject_t() ||
14574 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
14575 dout(10) << __func__
<< " backfill target osd." << *p
14576 << " last_backfill has not progressed past pgid ps"
14583 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
14584 new_hset
.begin
= hit_set_start_stamp
;
14585 new_hset
.end
= now
;
14586 oid
= get_hit_set_archive_object(
14589 new_hset
.using_gmt
);
14591 // If the current object is degraded we skip this persist request
14592 if (m_scrubber
->write_blocked_by_scrub(oid
))
14596 encode(*hit_set
, bl
);
14597 dout(20) << __func__
<< " archive " << oid
<< dendl
;
14600 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
14601 uint32_t size
= agent_state
->hit_set_map
.size();
14602 if (size
>= pool
.info
.hit_set_count
) {
14603 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
14605 hit_set_in_memory_trim(size
);
14608 ObjectContextRef obc
= get_object_context(oid
, true);
14609 OpContextUPtr ctx
= simple_opc_create(obc
);
14611 ctx
->at_version
= get_next_version();
14612 ctx
->updated_hset_history
= info
.hit_set
;
14613 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
14615 updated_hit_set_hist
.current_last_update
= info
.last_update
;
14616 new_hset
.version
= ctx
->at_version
;
14618 updated_hit_set_hist
.history
.push_back(new_hset
);
14621 // fabricate an object_info_t and SnapSet
14622 obc
->obs
.oi
.version
= ctx
->at_version
;
14623 obc
->obs
.oi
.mtime
= now
;
14624 obc
->obs
.oi
.size
= bl
.length();
14625 obc
->obs
.exists
= true;
14626 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
14628 ctx
->new_obs
= obc
->obs
;
14630 ctx
->new_snapset
= obc
->ssc
->snapset
;
14632 ctx
->delta_stats
.num_objects
++;
14633 ctx
->delta_stats
.num_objects_hit_set_archive
++;
14635 ctx
->delta_stats
.num_bytes
+= bl
.length();
14636 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
14639 encode(ctx
->new_snapset
, bss
);
14640 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
14641 encode(ctx
->new_obs
.oi
, boi
,
14642 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
14644 ctx
->op_t
->create(oid
);
14646 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
14647 write_update_size_and_usage(ctx
->delta_stats
, obc
->obs
.oi
, ctx
->modified_ranges
,
14649 ctx
->clean_regions
.mark_data_region_dirty(0, bl
.length());
14651 map
<string
, bufferlist
, std::less
<>> attrs
= {
14652 {OI_ATTR
, std::move(boi
)},
14653 {SS_ATTR
, std::move(bss
)}
14655 setattrs_maybe_cache(ctx
->obc
, ctx
->op_t
.get(), attrs
);
14656 ctx
->log
.push_back(
14658 pg_log_entry_t::MODIFY
,
14667 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
14669 hit_set_trim(ctx
, max
);
14671 simple_opc_submit(std::move(ctx
));
14674 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
14676 ceph_assert(ctx
->updated_hset_history
);
14677 pg_hit_set_history_t
&updated_hit_set_hist
=
14678 *(ctx
->updated_hset_history
);
14679 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
14680 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
14681 ceph_assert(p
!= updated_hit_set_hist
.history
.end());
14682 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14684 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14686 dout(20) << __func__
<< " removing " << oid
<< dendl
;
14687 ++ctx
->at_version
.version
;
14688 ctx
->log
.push_back(
14689 pg_log_entry_t(pg_log_entry_t::DELETE
,
14698 ctx
->op_t
->remove(oid
);
14699 updated_hit_set_hist
.history
.pop_front();
14701 ObjectContextRef obc
= get_object_context(oid
, false);
14703 --ctx
->delta_stats
.num_objects
;
14704 --ctx
->delta_stats
.num_objects_hit_set_archive
;
14705 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
14706 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
14710 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
14712 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
14713 agent_state
->remove_oldest_hit_set();
14718 // =======================================
14721 void PrimaryLogPG::agent_setup()
14723 ceph_assert(is_locked());
14724 if (!is_active() ||
14726 state_test(PG_STATE_PREMERGE
) ||
14727 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
14728 pool
.info
.tier_of
< 0 ||
14729 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
14733 if (!agent_state
) {
14734 agent_state
.reset(new TierAgentState
);
14736 // choose random starting position
14737 agent_state
->position
= hobject_t();
14738 agent_state
->position
.pool
= info
.pgid
.pool();
14739 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
14742 agent_state
->start
= agent_state
->position
;
14744 dout(10) << __func__
<< " allocated new state, position "
14745 << agent_state
->position
<< dendl
;
14747 dout(10) << __func__
<< " keeping existing state" << dendl
;
14750 if (info
.stats
.stats_invalid
) {
14751 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
14754 agent_choose_mode();
14757 void PrimaryLogPG::agent_clear()
14760 agent_state
.reset(NULL
);
14763 // Return false if no objects operated on since start of object hash space
14764 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
14766 std::scoped_lock locker
{*this};
14767 if (!agent_state
) {
14768 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
14772 ceph_assert(!recovery_state
.is_deleting());
14774 if (agent_state
->is_idle()) {
14775 dout(10) << __func__
<< " idle, stopping" << dendl
;
14779 osd
->logger
->inc(l_osd_agent_wake
);
14781 dout(10) << __func__
14782 << " max " << start_max
14783 << ", flush " << agent_state
->get_flush_mode_name()
14784 << ", evict " << agent_state
->get_evict_mode_name()
14785 << ", pos " << agent_state
->position
14787 ceph_assert(is_primary());
14788 ceph_assert(is_active());
14790 agent_load_hit_sets();
14792 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14793 ceph_assert(base_pool
);
14796 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
14798 // list some objects. this conveniently lists clones (oldest to
14799 // newest) before heads... the same order we want to flush in.
14801 // NOTE: do not flush the Sequencer. we will assume that the
14802 // listing we get back is imprecise.
14803 vector
<hobject_t
> ls
;
14805 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
14807 ceph_assert(r
>= 0);
14808 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
14810 for (vector
<hobject_t
>::iterator p
= ls
.begin();
14813 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
14814 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
14815 osd
->logger
->inc(l_osd_agent_skip
);
14818 if (is_degraded_or_backfilling_object(*p
)) {
14819 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
14820 osd
->logger
->inc(l_osd_agent_skip
);
14823 if (is_missing_object(p
->get_head())) {
14824 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
14825 osd
->logger
->inc(l_osd_agent_skip
);
14828 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
14830 // we didn't flush; we may miss something here.
14831 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
14832 osd
->logger
->inc(l_osd_agent_skip
);
14835 if (!obc
->obs
.exists
) {
14836 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
14837 osd
->logger
->inc(l_osd_agent_skip
);
14840 if (m_scrubber
->range_intersects_scrub(obc
->obs
.oi
.soid
,
14841 obc
->obs
.oi
.soid
.get_head())) {
14842 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14843 osd
->logger
->inc(l_osd_agent_skip
);
14846 if (obc
->is_blocked()) {
14847 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14848 osd
->logger
->inc(l_osd_agent_skip
);
14851 if (obc
->is_request_pending()) {
14852 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
14853 osd
->logger
->inc(l_osd_agent_skip
);
14857 // be careful flushing omap to an EC pool.
14858 if (!base_pool
->supports_omap() &&
14859 obc
->obs
.oi
.is_omap()) {
14860 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
14861 osd
->logger
->inc(l_osd_agent_skip
);
14865 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
14866 agent_maybe_evict(obc
, false))
14868 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
14869 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
14871 --agent_flush_quota
;
14873 if (started
>= start_max
) {
14874 // If finishing early, set "next" to the next object
14875 if (++p
!= ls
.end())
14881 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
14882 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
14883 agent_state
->hist_age
= 0;
14884 agent_state
->temp_hist
.decay();
14887 // Total objects operated on so far
14888 int total_started
= agent_state
->started
+ started
;
14889 bool need_delay
= false;
14891 dout(20) << __func__
<< " start pos " << agent_state
->position
14892 << " next start pos " << next
14893 << " started " << total_started
<< dendl
;
14895 // See if we've made a full pass over the object hash space
14896 // This might check at most ls_max objects a second time to notice that
14897 // we've checked every objects at least once.
14898 if (agent_state
->position
< agent_state
->start
&&
14899 next
>= agent_state
->start
) {
14900 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
14901 if (total_started
== 0)
14905 agent_state
->start
= next
;
14907 agent_state
->started
= total_started
;
14909 // See if we are starting from beginning
14911 agent_state
->position
= hobject_t();
14913 agent_state
->position
= next
;
14915 // Discard old in memory HitSets
14916 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
14919 ceph_assert(agent_state
->delaying
== false);
14923 agent_choose_mode();
14927 void PrimaryLogPG::agent_load_hit_sets()
14929 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
14933 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
14934 dout(10) << __func__
<< dendl
;
14935 for (auto p
= info
.hit_set
.history
.begin();
14936 p
!= info
.hit_set
.history
.end(); ++p
) {
14937 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
14938 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
14939 << p
->end
<< dendl
;
14940 if (!pool
.info
.is_replicated()) {
14941 // FIXME: EC not supported here yet
14942 derr
<< __func__
<< " on non-replicated pool" << dendl
;
14946 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14947 if (is_unreadable_object(oid
)) {
14948 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
14952 ObjectContextRef obc
= get_object_context(oid
, false);
14954 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
14960 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
14961 ceph_assert(r
>= 0);
14963 HitSetRef
hs(new HitSet
);
14964 bufferlist::const_iterator pbl
= bl
.begin();
14966 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
14972 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
14974 if (!obc
->obs
.oi
.is_dirty()) {
14975 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
14976 osd
->logger
->inc(l_osd_agent_skip
);
14979 if (obc
->obs
.oi
.is_cache_pinned()) {
14980 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14981 osd
->logger
->inc(l_osd_agent_skip
);
14985 utime_t now
= ceph_clock_now();
14986 utime_t ob_local_mtime
;
14987 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14988 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14990 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14992 bool evict_mode_full
=
14993 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
14994 if (!evict_mode_full
&&
14995 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
14996 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
14997 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14998 osd
->logger
->inc(l_osd_agent_skip
);
15002 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
15003 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
15004 osd
->logger
->inc(l_osd_agent_skip
);
15008 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
15010 // FIXME: flush anything dirty, regardless of what distribution of
15013 hobject_t oid
= obc
->obs
.oi
.soid
;
15014 osd
->agent_start_op(oid
);
15015 // no need to capture a pg ref, can't outlive fop or ctx
15016 std::function
<void()> on_flush
= [this, oid
]() {
15017 osd
->agent_finish_op(oid
);
15020 int result
= start_flush(
15021 OpRequestRef(), obc
, false, NULL
,
15023 if (result
!= -EINPROGRESS
) {
15025 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
15026 << " with " << result
<< dendl
;
15027 osd
->logger
->inc(l_osd_agent_skip
);
15031 osd
->logger
->inc(l_osd_agent_flush
);
15035 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
15037 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
15038 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
15039 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
15042 // This is already checked by agent_work() which passes after_flush = false
15043 if (after_flush
&& m_scrubber
->range_intersects_scrub(soid
, soid
.get_head())) {
15044 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
15047 if (!obc
->obs
.oi
.watchers
.empty()) {
15048 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
15051 if (obc
->is_blocked()) {
15052 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
15055 if (obc
->obs
.oi
.is_cache_pinned()) {
15056 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
15060 if (soid
.snap
== CEPH_NOSNAP
) {
15061 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
15063 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
15068 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
15069 // is this object old than cache_min_evict_age?
15070 utime_t now
= ceph_clock_now();
15071 utime_t ob_local_mtime
;
15072 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
15073 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
15075 ob_local_mtime
= obc
->obs
.oi
.mtime
;
15077 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
15078 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
15079 osd
->logger
->inc(l_osd_agent_skip
);
15082 // is this object old and/or cold enough?
15084 uint64_t temp_upper
= 0, temp_lower
= 0;
15086 agent_estimate_temp(soid
, &temp
);
15087 agent_state
->temp_hist
.add(temp
);
15088 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
15090 dout(20) << __func__
15091 << " temp " << temp
15092 << " pos " << temp_lower
<< "-" << temp_upper
15093 << ", evict_effort " << agent_state
->evict_effort
15095 dout(30) << "agent_state:\n";
15096 auto f
= Formatter::create_unique("");
15097 f
->open_object_section("agent_state");
15098 agent_state
->dump(f
.get());
15099 f
->close_section();
15103 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
15107 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
15108 OpContextUPtr ctx
= simple_opc_create(obc
);
15110 auto null_op_req
= OpRequestRef();
15111 if (!ctx
->lock_manager
.get_lock_type(
15116 close_op_ctx(ctx
.release());
15117 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
15121 osd
->agent_start_evict_op();
15122 ctx
->register_on_finish(
15124 osd
->agent_finish_evict_op();
15127 ctx
->at_version
= get_next_version();
15128 ceph_assert(ctx
->new_obs
.exists
);
15129 int r
= _delete_oid(ctx
.get(), true, false);
15130 if (obc
->obs
.oi
.is_omap())
15131 ctx
->delta_stats
.num_objects_omap
--;
15132 ctx
->delta_stats
.num_evict
++;
15133 ctx
->delta_stats
.num_evict_kb
+= shift_round_up(obc
->obs
.oi
.size
, 10);
15134 if (obc
->obs
.oi
.is_dirty())
15135 --ctx
->delta_stats
.num_objects_dirty
;
15136 ceph_assert(r
== 0);
15137 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
);
15138 simple_opc_submit(std::move(ctx
));
15139 osd
->logger
->inc(l_osd_tier_evict
);
15140 osd
->logger
->inc(l_osd_agent_evict
);
15144 void PrimaryLogPG::agent_stop()
15146 dout(20) << __func__
<< dendl
;
15147 if (agent_state
&& !agent_state
->is_idle()) {
15148 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
15149 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
15150 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
15154 void PrimaryLogPG::agent_delay()
15156 dout(20) << __func__
<< dendl
;
15157 if (agent_state
&& !agent_state
->is_idle()) {
15158 ceph_assert(agent_state
->delaying
== false);
15159 agent_state
->delaying
= true;
15160 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
15164 void PrimaryLogPG::agent_choose_mode_restart()
15166 dout(20) << __func__
<< dendl
;
15167 std::scoped_lock locker
{*this};
15168 if (agent_state
&& agent_state
->delaying
) {
15169 agent_state
->delaying
= false;
15170 agent_choose_mode(true);
15174 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
15176 bool requeued
= false;
15177 // Let delay play out
15178 if (agent_state
->delaying
) {
15179 dout(20) << __func__
<< " " << this << " delaying, ignored" << dendl
;
15183 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
15184 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
15185 unsigned evict_effort
= 0;
15187 if (info
.stats
.stats_invalid
) {
15188 // idle; stats can't be trusted until we scrub.
15189 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
15194 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
15195 ceph_assert(divisor
> 0);
15197 // adjust (effective) user objects down based on the number
15198 // of HitSet objects, which should not count toward our total since
15199 // they cannot be flushed.
15200 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
15202 // also exclude omap objects if ec backing pool
15203 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
15204 ceph_assert(base_pool
);
15205 if (!base_pool
->supports_omap())
15206 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
15208 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
15209 if (num_user_objects
> unflushable
)
15210 num_user_objects
-= unflushable
;
15212 num_user_objects
= 0;
15214 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
15215 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
15216 num_user_bytes
-= unflushable_bytes
;
15217 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
15218 num_user_bytes
+= num_overhead_bytes
;
15220 // also reduce the num_dirty by num_objects_omap
15221 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
15222 if (!base_pool
->supports_omap()) {
15223 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
15224 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
15229 dout(10) << __func__
15231 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
15233 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
15234 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
15235 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
15236 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
15237 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
15238 << " num_dirty: " << num_dirty
15239 << " num_user_objects: " << num_user_objects
15240 << " num_user_bytes: " << num_user_bytes
15241 << " num_overhead_bytes: " << num_overhead_bytes
15242 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
15243 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
15246 // get dirty, full ratios
15247 uint64_t dirty_micro
= 0;
15248 uint64_t full_micro
= 0;
15249 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
15250 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
15252 num_dirty
* avg_size
* 1000000 /
15253 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
15255 num_user_objects
* avg_size
* 1000000 /
15256 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
15258 if (pool
.info
.target_max_objects
> 0) {
15259 uint64_t dirty_objects_micro
=
15260 num_dirty
* 1000000 /
15261 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
15262 if (dirty_objects_micro
> dirty_micro
)
15263 dirty_micro
= dirty_objects_micro
;
15264 uint64_t full_objects_micro
=
15265 num_user_objects
* 1000000 /
15266 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
15267 if (full_objects_micro
> full_micro
)
15268 full_micro
= full_objects_micro
;
15270 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
15271 << " full " << ((float)full_micro
/ 1000000.0)
15275 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
15276 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
15277 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
15278 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
15279 flush_target
+= flush_slop
;
15280 flush_high_target
+= flush_slop
;
15282 flush_target
-= std::min(flush_target
, flush_slop
);
15283 flush_high_target
-= std::min(flush_high_target
, flush_slop
);
15286 if (dirty_micro
> flush_high_target
) {
15287 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
15288 } else if (dirty_micro
> flush_target
|| (!flush_target
&& num_dirty
> 0)) {
15289 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
15293 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
15294 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
15295 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
15296 evict_target
+= evict_slop
;
15298 evict_target
-= std::min(evict_target
, evict_slop
);
15300 if (full_micro
> 1000000) {
15301 // evict anything clean
15302 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
15303 evict_effort
= 1000000;
15304 } else if (full_micro
> evict_target
) {
15305 // set effort in [0..1] range based on where we are between
15306 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
15307 uint64_t over
= full_micro
- evict_target
;
15308 uint64_t span
= 1000000 - evict_target
;
15309 evict_effort
= std::max(over
* 1000000 / span
,
15310 uint64_t(1000000.0 *
15311 cct
->_conf
->osd_agent_min_evict_effort
));
15313 // quantize effort to avoid too much reordering in the agent_queue.
15314 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
15315 ceph_assert(inc
> 0);
15316 uint64_t was
= evict_effort
;
15317 evict_effort
-= evict_effort
% inc
;
15318 if (evict_effort
< inc
)
15319 evict_effort
= inc
;
15320 ceph_assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
15321 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
15326 bool old_idle
= agent_state
->is_idle();
15327 if (flush_mode
!= agent_state
->flush_mode
) {
15328 dout(5) << __func__
<< " flush_mode "
15329 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
15331 << TierAgentState::get_flush_mode_name(flush_mode
)
15333 recovery_state
.update_stats(
15334 [=, this](auto &history
, auto &stats
) {
15335 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
15336 osd
->agent_inc_high_count();
15337 stats
.stats
.sum
.num_flush_mode_high
= 1;
15338 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
15339 stats
.stats
.sum
.num_flush_mode_low
= 1;
15341 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
15342 osd
->agent_dec_high_count();
15343 stats
.stats
.sum
.num_flush_mode_high
= 0;
15344 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
15345 stats
.stats
.sum
.num_flush_mode_low
= 0;
15349 agent_state
->flush_mode
= flush_mode
;
15351 if (evict_mode
!= agent_state
->evict_mode
) {
15352 dout(5) << __func__
<< " evict_mode "
15353 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
15355 << TierAgentState::get_evict_mode_name(evict_mode
)
15357 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
15361 requeue_ops(waiting_for_flush
);
15362 requeue_ops(waiting_for_active
);
15363 requeue_ops(waiting_for_readable
);
15364 requeue_ops(waiting_for_scrub
);
15365 requeue_ops(waiting_for_cache_not_full
);
15366 objects_blocked_on_cache_full
.clear();
15369 recovery_state
.update_stats(
15370 [=, this](auto &history
, auto &stats
) {
15371 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
15372 stats
.stats
.sum
.num_evict_mode_some
= 1;
15373 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
15374 stats
.stats
.sum
.num_evict_mode_full
= 1;
15376 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
15377 stats
.stats
.sum
.num_evict_mode_some
= 0;
15378 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
15379 stats
.stats
.sum
.num_evict_mode_full
= 0;
15383 agent_state
->evict_mode
= evict_mode
;
15385 uint64_t old_effort
= agent_state
->evict_effort
;
15386 if (evict_effort
!= agent_state
->evict_effort
) {
15387 dout(5) << __func__
<< " evict_effort "
15388 << ((float)agent_state
->evict_effort
/ 1000000.0)
15390 << ((float)evict_effort
/ 1000000.0)
15392 agent_state
->evict_effort
= evict_effort
;
15395 // NOTE: we are using evict_effort as a proxy for *all* agent effort
15396 // (including flush). This is probably fine (they should be
15397 // correlated) but it is not precisely correct.
15398 if (agent_state
->is_idle()) {
15399 if (!restart
&& !old_idle
) {
15400 osd
->agent_disable_pg(this, old_effort
);
15403 if (restart
|| old_idle
) {
15404 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
15405 } else if (old_effort
!= agent_state
->evict_effort
) {
15406 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
15412 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
15414 ceph_assert(hit_set
);
15417 if (hit_set
->contains(oid
))
15420 int last_n
= pool
.info
.hit_set_search_last_n
;
15421 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
15422 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
15423 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
15424 if (p
->second
->contains(oid
)) {
15425 *temp
+= pool
.info
.get_grade(i
);
15431 // Dup op detection
15433 bool PrimaryLogPG::already_complete(eversion_t v
)
15435 dout(20) << __func__
<< ": " << v
<< dendl
;
15436 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
15439 dout(20) << __func__
<< ": " << **i
<< dendl
;
15440 // skip copy from temp object ops
15441 if ((*i
)->v
== eversion_t()) {
15442 dout(20) << __func__
<< ": " << **i
15443 << " version is empty" << dendl
;
15447 dout(20) << __func__
<< ": " << **i
15448 << " (*i)->v past v" << dendl
;
15451 if (!(*i
)->all_committed
) {
15452 dout(20) << __func__
<< ": " << **i
15453 << " not committed, returning false"
15458 dout(20) << __func__
<< ": returning true" << dendl
;
15463 // ==========================================================================================
15466 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op
)
15468 dout(15) << __func__
<< " is scrub active? " << is_scrub_active() << dendl
;
15469 op
->mark_started();
15471 if (!is_scrub_active()) {
15472 dout(10) << __func__
<< " scrub isn't active" << dendl
;
15475 m_scrubber
->map_from_replica(op
);
15478 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t
& begin
,
15479 const hobject_t
& end
)
15481 pair
<hobject_t
, ObjectContextRef
> next
;
15482 next
.second
= object_contexts
.lookup(begin
);
15483 next
.first
= begin
;
15485 while (more
&& next
.first
< end
) {
15486 if (next
.second
&& next
.second
->is_blocked()) {
15487 next
.second
->requeue_scrub_on_unblock
= true;
15488 dout(10) << __func__
<< ": scrub delayed, "
15489 << next
.first
<< " is blocked"
15493 more
= object_contexts
.get_next(next
.first
, &next
);
15499 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpContext
*ctx
)
15501 OpRequestRef op
= ctx
->op
;
15502 // Only supports replicated pools
15503 ceph_assert(!pool
.info
.is_erasure());
15504 ceph_assert(is_primary());
15506 dout(10) << __func__
<< " " << soid
15507 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl
;
15510 block_for_clean(soid
, op
);
15514 ceph_assert(!recovery_state
.get_pg_log().get_missing().is_missing(soid
));
15515 auto& oi
= ctx
->new_obs
.oi
;
15516 eversion_t v
= oi
.version
;
15518 if (primary_error(soid
, v
)) {
15519 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
15520 // XXX: If we knew that there is no down osd which could include this
15521 // object, it would be nice if we could return EIO here.
15522 // If a "never fail" flag was available, that could be used
15523 // for rbd to NOT return EIO until object marked lost.
15525 // Drop through to save this op in case an osd comes up with the object.
15528 // Restart the op after object becomes readable again
15529 waiting_for_unreadable_object
[soid
].push_back(op
);
15530 op
->mark_delayed("waiting for missing object");
15532 ceph_assert(is_clean());
15533 state_set(PG_STATE_REPAIR
);
15534 state_clear(PG_STATE_CLEAN
);
15535 queue_peering_event(
15537 std::make_shared
<PGPeeringEvent
>(
15538 get_osdmap_epoch(),
15539 get_osdmap_epoch(),
15540 PeeringState::DoRecovery())));
15545 /*---SnapTrimmer Logging---*/
15547 #define dout_prefix pg->gen_prefix(*_dout)
15549 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
15551 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
15554 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
15556 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
15559 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15562 !pg
->is_scrub_queued_or_active() &&
15563 !pg
->snap_trimq
.empty();
15566 /*---SnapTrimmer states---*/
15568 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15569 << "SnapTrimmer state<" << get_state_name() << ">: ")
15572 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
15574 NamedState(nullptr, "NotTrimming")
15576 context
< SnapTrimmer
>().log_enter(state_name
);
15579 void PrimaryLogPG::NotTrimming::exit()
15581 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
15584 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
15586 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15587 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
15589 if (!(pg
->is_primary() && pg
->is_active())) {
15590 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
15591 return discard_event();
15593 if (!pg
->is_clean() ||
15594 pg
->snap_trimq
.empty()) {
15595 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
15596 return discard_event();
15598 if (pg
->is_scrub_queued_or_active()) {
15599 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
15600 return transit
< WaitScrub
>();
15602 return transit
< Trimming
>();
15606 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
15608 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15609 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
15612 if (!context
< SnapTrimmer
>().can_trim()) {
15613 post_event(KickTrim());
15614 return transit
< NotTrimming
>();
15617 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
15618 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
15619 << pg
->snap_trimq
.range_start()
15621 return transit
< AwaitAsyncWork
>();
15624 /* AwaitAsyncWork */
15625 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
15627 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15629 auto *pg
= context
< SnapTrimmer
>().pg
;
15630 context
< SnapTrimmer
>().log_enter(state_name
);
15631 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
15632 pg
->state_set(PG_STATE_SNAPTRIM
);
15633 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
15634 pg
->publish_stats_to_osd();
15637 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
15639 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
15640 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
15641 auto &in_flight
= context
<Trimming
>().in_flight
;
15642 ceph_assert(in_flight
.empty());
15644 ceph_assert(pg
->is_primary() && pg
->is_active());
15645 if (!context
< SnapTrimmer
>().can_trim()) {
15646 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
15647 post_event(KickTrim());
15648 return transit
< NotTrimming
>();
15651 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
15653 vector
<hobject_t
> to_trim
;
15654 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
15655 // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
15656 // the ENOENT below and erase snap_to_trim.
15657 ceph_assert(max
> 0);
15658 to_trim
.reserve(max
);
15659 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
15663 if (r
!= 0 && r
!= -ENOENT
) {
15664 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
15665 << cpp_strerror(r
) << dendl
;
15666 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15667 } else if (r
== -ENOENT
) {
15669 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
15671 pg
->snap_trimq
.erase(snap_to_trim
);
15673 if (pg
->snap_trimq_repeat
.count(snap_to_trim
)) {
15674 ldout(pg
->cct
, 10) << " removing from snap_trimq_repeat" << dendl
;
15675 pg
->snap_trimq_repeat
.erase(snap_to_trim
);
15677 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
15678 << " to purged_snaps"
15680 ObjectStore::Transaction t
;
15681 pg
->recovery_state
.adjust_purged_snaps(
15682 [snap_to_trim
](auto &purged_snaps
) {
15683 purged_snaps
.insert(snap_to_trim
);
15685 pg
->write_if_dirty(t
);
15687 ldout(pg
->cct
, 10) << "purged_snaps now "
15688 << pg
->info
.purged_snaps
<< ", snap_trimq now "
15689 << pg
->snap_trimq
<< dendl
;
15691 int tr
= pg
->osd
->store
->queue_transaction(pg
->ch
, std::move(t
), NULL
);
15692 ceph_assert(tr
== 0);
15694 pg
->recovery_state
.share_pg_info();
15696 post_event(KickTrim());
15697 pg
->set_snaptrim_duration();
15698 return transit
< NotTrimming
>();
15700 ceph_assert(!to_trim
.empty());
15702 for (auto &&object
: to_trim
) {
15704 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
15706 int error
= pg
->trim_object(in_flight
.empty(), object
, snap_to_trim
, &ctx
);
15708 if (error
== -ENOLCK
) {
15709 ldout(pg
->cct
, 10) << "could not get write lock on obj "
15710 << object
<< dendl
;
15712 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
15713 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
15715 if (!in_flight
.empty()) {
15716 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
15717 return transit
< WaitRepops
>();
15719 if (error
== -ENOLCK
) {
15720 ldout(pg
->cct
, 10) << "waiting for it to clear"
15722 return transit
< WaitRWLock
>();
15724 return transit
< NotTrimming
>();
15728 in_flight
.insert(object
);
15729 ctx
->register_on_success(
15730 [pg
, object
, &in_flight
]() {
15731 ceph_assert(in_flight
.find(object
) != in_flight
.end());
15732 in_flight
.erase(object
);
15733 if (in_flight
.empty()) {
15734 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
15735 pg
->snap_trimmer_machine
.process_event(Reset());
15737 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
15742 pg
->simple_opc_submit(std::move(ctx
));
15745 return transit
< WaitRepops
>();
15748 void PrimaryLogPG::setattr_maybe_cache(
15749 ObjectContextRef obc
,
15754 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
15757 void PrimaryLogPG::setattrs_maybe_cache(
15758 ObjectContextRef obc
,
15760 map
<string
, bufferlist
, less
<>> &attrs
)
15762 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
15765 void PrimaryLogPG::rmattr_maybe_cache(
15766 ObjectContextRef obc
,
15770 t
->rmattr(obc
->obs
.oi
.soid
, key
);
15773 int PrimaryLogPG::getattr_maybe_cache(
15774 ObjectContextRef obc
,
15778 if (pool
.info
.is_erasure()) {
15779 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
15780 if (i
!= obc
->attr_cache
.end()) {
15785 if (obc
->obs
.exists
) {
15792 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
15795 int PrimaryLogPG::getattrs_maybe_cache(
15796 ObjectContextRef obc
,
15797 map
<string
, bufferlist
, less
<>> *out
)
15801 if (pool
.info
.is_erasure()) {
15802 *out
= obc
->attr_cache
;
15804 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
15806 map
<string
, bufferlist
, less
<>> tmp
;
15807 for (auto& [key
, val
]: *out
) {
15808 if (key
.size() > 1 && key
[0] == '_') {
15809 tmp
[key
.substr(1, key
.size())] = std::move(val
);
// Delegate to the OSD-wide failsafe-full check, passing this PG's
// debug prefix provider so any logging is attributed to this PG.
bool PrimaryLogPG::check_failsafe_full() {
  return osd->check_failsafe_full(get_dpp());
}
// Returns true when the scrubber currently blocks writes to this object,
// in which case (per the name) the replica scrub should presumably be
// preempted in favor of the write — confirm against callers.
bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
{
  return m_scrubber->write_blocked_by_scrub(oid);
}
// boost::intrusive_ptr customization points for PrimaryLogPG: refcount
// through the PG get()/put() hooks, tagged "intptr" so these references
// are identifiable in ref-tracking output.
void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
#ifdef PG_DEBUG_REFS
// Debug-only ref-tracking variants: get_with_id() hands out a unique id
// per acquired reference so a leaked ref can be traced back to its
// acquisition; put_with_id() releases the matching reference.
uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
// Fix: the matching #endif for PG_DEBUG_REFS is missing in the visible
// text, which would leave all following definitions conditionally
// compiled out (or fail the translation unit); restore it here.
#endif
// boost::intrusive_ptr customization points for in-flight repops:
// RepGather carries its own refcount via get()/put().
void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }
->put(); }