1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include "boost/tuple/tuple.hpp"
19 #include "boost/intrusive_ptr.hpp"
21 #include "pg_scrubber.h"
22 #include "PrimaryLogPG.h"
24 #include "PrimaryLogScrub.h"
25 #include "OpRequest.h"
26 #include "ScrubStore.h"
28 #include "objclass/objclass.h"
29 #include "osd/ClassHandler.h"
31 #include "cls/cas/cls_cas_ops.h"
32 #include "common/ceph_crypto.h"
33 #include "common/errno.h"
34 #include "common/scrub_types.h"
35 #include "common/perf_counters.h"
37 #include "messages/MOSDOp.h"
38 #include "messages/MOSDBackoff.h"
39 #include "messages/MOSDPGTrim.h"
40 #include "messages/MOSDPGScan.h"
41 #include "messages/MOSDRepScrub.h"
42 #include "messages/MOSDPGBackfill.h"
43 #include "messages/MOSDPGBackfillRemove.h"
44 #include "messages/MOSDPGUpdateLogMissing.h"
45 #include "messages/MOSDPGUpdateLogMissingReply.h"
46 #include "messages/MCommandReply.h"
47 #include "messages/MOSDScrubReserve.h"
48 #include "common/EventTrace.h"
50 #include "common/config.h"
51 #include "include/compat.h"
52 #include "mon/MonClient.h"
53 #include "osdc/Objecter.h"
54 #include "json_spirit/json_spirit_value.h"
55 #include "json_spirit/json_spirit_reader.h"
56 #include "include/ceph_assert.h" // json_spirit clobbers it
57 #include "include/rados/rados_types.hpp"
60 #include "tracing/osd.h"
62 #define tracepoint(...)
65 #define dout_context cct
66 #define dout_subsys ceph_subsys_osd
67 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
69 #define dout_prefix _prefix(_dout, this)
76 #include "common/tracer.h"
79 #include <common/CDC.h>
81 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
88 using std::ostringstream
;
91 using std::string_view
;
92 using std::stringstream
;
93 using std::unique_ptr
;
96 using ceph::bufferlist
;
97 using ceph::bufferptr
;
98 using ceph::Formatter
;
100 using ceph::decode_noclear
;
102 using ceph::encode_destructively
;
104 using namespace ceph::osd::scheduler
;
105 using TOPNSPC::common::cmd_getval
;
107 template <typename T
>
108 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
109 return pg
->gen_prefix(*_dout
);
113 * The CopyCallback class defines an interface for completions to the
114 * copy_start code. Users of the copy infrastructure must implement
115 * one and give an instance of the class to start_copy.
117 * The implementer is responsible for making sure that the CopyCallback
118 * can associate itself with the correct copy operation.
120 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
124 * results.get<0>() is the return code: 0 for success; -ECANCELED if
125 * the operation was cancelled by the local OSD; -errno for other issues.
126 * results.get<1>() is a pointer to a CopyResults object, which you are
127 * responsible for deleting.
129 void finish(CopyCallbackResults results_
) override
= 0;
132 /// Provide the final size of the copied object to the CopyCallback
133 ~CopyCallback() override
{}
136 template <typename T
>
137 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
139 unique_ptr
<GenContext
<T
>> c
;
142 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
143 : pg(pg
), c(c
), e(e
) {}
144 void finish(T t
) override
{
145 std::scoped_lock locker
{*pg
};
146 if (pg
->pg_has_reset_since(e
))
149 c
.release()->complete(t
);
151 bool sync_finish(T t
) {
152 // we assume here all blessed/wrapped Contexts can complete synchronously.
153 c
.release()->complete(t
);
158 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
159 GenContext
<ThreadPool::TPHandle
&> *c
) {
160 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
161 this, c
, get_osdmap_epoch());
164 template <typename T
>
165 class PrimaryLogPG::UnlockedBlessedGenContext
: public GenContext
<T
> {
167 unique_ptr
<GenContext
<T
>> c
;
170 UnlockedBlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
171 : pg(pg
), c(c
), e(e
) {}
172 void finish(T t
) override
{
173 if (pg
->pg_has_reset_since(e
))
176 c
.release()->complete(t
);
178 bool sync_finish(T t
) {
179 // we assume here all blessed/wrapped Contexts can complete synchronously.
180 c
.release()->complete(t
);
185 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_unlocked_gencontext(
186 GenContext
<ThreadPool::TPHandle
&> *c
) {
187 return new UnlockedBlessedGenContext
<ThreadPool::TPHandle
&>(
188 this, c
, get_osdmap_epoch());
191 class PrimaryLogPG::BlessedContext
: public Context
{
193 unique_ptr
<Context
> c
;
196 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
197 : pg(pg
), c(c
), e(e
) {}
198 void finish(int r
) override
{
199 std::scoped_lock locker
{*pg
};
200 if (pg
->pg_has_reset_since(e
))
203 c
.release()->complete(r
);
205 bool sync_finish(int r
) override
{
206 // we assume here all blessed/wrapped Contexts can complete synchronously.
207 c
.release()->complete(r
);
212 Context
*PrimaryLogPG::bless_context(Context
*c
) {
213 return new BlessedContext(this, c
, get_osdmap_epoch());
216 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
220 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
222 void finish(int r
) override
{
223 pg
->object_context_destructor_callback(obc
);
227 struct OnReadComplete
: public Context
{
229 PrimaryLogPG::OpContext
*opcontext
;
232 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
233 void finish(int r
) override
{
234 opcontext
->finish_read(pg
);
236 ~OnReadComplete() override
{}
239 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
241 ObjectContextRef obc
;
243 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
245 bool sync_finish(int r
) override
{
246 pg
->_applied_recovered_object(obc
);
249 void finish(int r
) override
{
250 std::scoped_lock locker
{*pg
};
251 pg
->_applied_recovered_object(obc
);
255 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
258 eversion_t last_complete
;
260 C_OSD_CommittedPushedObject(
261 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
262 pg(p
), epoch(epoch
), last_complete(lc
) {
264 void finish(int r
) override
{
265 pg
->_committed_pushed_object(epoch
, last_complete
);
269 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
272 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
274 bool sync_finish(int r
) override
{
275 pg
->_applied_recovered_object_replica();
278 void finish(int r
) override
{
279 std::scoped_lock locker
{*pg
};
280 pg
->_applied_recovered_object_replica();
285 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
288 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
289 pair
<bufferlist
*, Context
*> > > in
;
290 in
.swap(pending_async_reads
);
291 pg
->pgbackend
->objects_read_async(
294 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
296 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
298 ceph_assert(inflightreads
> 0);
300 if (async_reads_complete()) {
301 ceph_assert(pg
->in_progress_async_reads
.size());
302 ceph_assert(pg
->in_progress_async_reads
.front().second
== this);
303 pg
->in_progress_async_reads
.pop_front();
305 // Restart the op context now that all reads have been
306 // completed. Read failures will be handled by the op finisher
307 pg
->execute_ctx(this);
311 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
313 PrimaryLogPG::CopyResults
*results
= nullptr;
314 PrimaryLogPG::OpContext
*ctx
;
316 uint32_t truncate_seq
;
317 uint64_t truncate_size
;
318 bool have_truncate
= false;
320 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
321 : ctx(ctx
), osd_op(osd_op
) {
323 ~CopyFromCallback() override
{}
325 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
326 results
= results_
.get
<1>();
327 int r
= results_
.get
<0>();
329 // Only use truncate_{seq,size} from the original object if the client
330 // did not sent us these parameters
331 if (!have_truncate
) {
332 truncate_seq
= results
->truncate_seq
;
333 truncate_size
= results
->truncate_size
;
336 // for finish_copyfrom
337 ctx
->user_at_version
= results
->user_version
;
340 ctx
->pg
->execute_ctx(ctx
);
342 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
344 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
345 } else if (results
->should_requeue
) {
347 ctx
->pg
->requeue_op(ctx
->op
);
349 ctx
->pg
->close_op_ctx(ctx
);
353 bool is_temp_obj_used() {
354 return results
->started_temp_obj
;
356 uint64_t get_data_size() {
357 return results
->object_size
;
359 void set_truncate(uint32_t seq
, uint64_t size
) {
361 truncate_size
= size
;
362 have_truncate
= true;
366 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
367 CopyFromCallback
*copy_from_callback
;
369 explicit CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
370 : copy_from_callback(copy_from_callback
) {
373 int execute() override
{
374 // instance will be destructed after this method completes
375 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
380 // ======================
381 // PGBackend::Listener
383 void PrimaryLogPG::on_local_recover(
384 const hobject_t
&hoid
,
385 const ObjectRecoveryInfo
&_recovery_info
,
386 ObjectContextRef obc
,
388 ObjectStore::Transaction
*t
391 dout(10) << __func__
<< ": " << hoid
<< dendl
;
393 ObjectRecoveryInfo
recovery_info(_recovery_info
);
394 clear_object_snap_mapping(t
, hoid
);
395 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
396 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
398 dout(20) << " snapset " << recovery_info
.ss
<< dendl
;
399 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
400 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
401 snaps
.insert(p
->second
.begin(), p
->second
.end());
402 dout(20) << " snaps " << snaps
<< dendl
;
408 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
411 if (!is_delete
&& recovery_state
.get_pg_log().get_missing().is_missing(recovery_info
.soid
) &&
412 recovery_state
.get_pg_log().get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
413 ceph_assert(is_primary());
414 const pg_log_entry_t
*latest
= recovery_state
.get_pg_log().get_log().objects
.find(recovery_info
.soid
)->second
;
415 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
416 latest
->reverting_to
== recovery_info
.version
) {
417 dout(10) << " got old revert version " << recovery_info
.version
418 << " for " << *latest
<< dendl
;
419 recovery_info
.version
= latest
->version
;
420 // update the attr to the revert event version
421 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
422 recovery_info
.oi
.version
= latest
->version
;
424 encode(recovery_info
.oi
, bl
,
425 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
426 ceph_assert(!pool
.info
.is_erasure());
427 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
429 obc
->attr_cache
[OI_ATTR
] = bl
;
433 // keep track of active pushes for scrub
436 recovery_state
.recover_got(
438 recovery_info
.version
,
444 obc
->obs
.exists
= true;
446 bool got
= obc
->get_recovery_read();
449 ceph_assert(recovering
.count(obc
->obs
.oi
.soid
));
450 recovering
[obc
->obs
.oi
.soid
] = obc
;
451 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
454 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
456 publish_stats_to_osd();
457 release_backoffs(hoid
);
458 if (!is_unreadable_object(hoid
)) {
459 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
460 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
461 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
462 requeue_ops(unreadable_object_entry
->second
);
463 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
467 t
->register_on_applied(
468 new C_OSD_AppliedRecoveredObjectReplica(this));
472 t
->register_on_commit(
473 new C_OSD_CommittedPushedObject(
476 info
.last_complete
));
479 void PrimaryLogPG::on_global_recover(
480 const hobject_t
&soid
,
481 const object_stat_sum_t
&stat_diff
,
484 recovery_state
.object_recovered(soid
, stat_diff
);
485 publish_stats_to_osd();
486 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
487 auto i
= recovering
.find(soid
);
488 ceph_assert(i
!= recovering
.end());
490 if (i
->second
&& i
->second
->rwstate
.recovery_read_marker
) {
491 // recover missing won't have had an obc, but it gets filled in
492 // during on_local_recover
493 ceph_assert(i
->second
);
494 list
<OpRequestRef
> requeue_list
;
495 i
->second
->drop_recovery_read(&requeue_list
);
496 requeue_ops(requeue_list
);
499 backfills_in_flight
.erase(soid
);
502 finish_recovery_op(soid
);
503 release_backoffs(soid
);
504 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
505 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
506 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
507 requeue_ops(degraded_object_entry
->second
);
508 waiting_for_degraded_object
.erase(degraded_object_entry
);
510 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
511 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
512 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
513 requeue_ops(unreadable_object_entry
->second
);
514 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
516 finish_degraded_object(soid
);
519 void PrimaryLogPG::schedule_recovery_work(
520 GenContext
<ThreadPool::TPHandle
&> *c
)
522 osd
->queue_recovery_context(this, c
);
525 void PrimaryLogPG::replica_clear_repop_obc(
526 const vector
<pg_log_entry_t
> &logv
,
527 ObjectStore::Transaction
&t
)
529 for (auto &&e
: logv
) {
530 /* Have to blast all clones, they share a snapset */
531 object_contexts
.clear_range(
532 e
.soid
.get_object_boundary(), e
.soid
.get_head());
534 snapset_contexts
.find(e
.soid
.get_head()) ==
535 snapset_contexts
.end());
539 bool PrimaryLogPG::should_send_op(
541 const hobject_t
&hoid
) {
542 if (peer
== get_primary())
544 ceph_assert(recovery_state
.has_peer_info(peer
));
546 hoid
.pool
!= (int64_t)info
.pgid
.pool() ||
547 hoid
<= last_backfill_started
||
548 hoid
<= recovery_state
.get_peer_info(peer
).last_backfill
;
550 ceph_assert(is_backfill_target(peer
));
551 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
552 << ", object " << hoid
553 << " beyond std::max(last_backfill_started "
554 << ", peer_info[peer].last_backfill "
555 << recovery_state
.get_peer_info(peer
).last_backfill
559 if (is_async_recovery_target(peer
) &&
560 recovery_state
.get_peer_missing(peer
).is_missing(hoid
)) {
562 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
563 << ", object " << hoid
564 << " which is pending recovery in async_recovery_targets" << dendl
;
570 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
571 int peer
, epoch_t from_epoch
)
573 return osd
->get_con_osd_cluster(peer
, from_epoch
);
576 PerfCounters
*PrimaryLogPG::get_logger()
582 // ====================
585 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
587 return recovery_state
.get_pg_log().get_missing().get_items().count(soid
);
590 void PrimaryLogPG::maybe_kick_recovery(
591 const hobject_t
&soid
)
594 bool work_started
= false;
595 if (!recovery_state
.get_missing_loc().needs_recovery(soid
, &v
))
598 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
599 if (p
!= recovering
.end()) {
600 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
601 } else if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
602 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
604 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
605 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
606 if (is_missing_object(soid
)) {
607 recover_missing(soid
, v
, CEPH_MSG_PRIO_HIGH
, h
);
608 } else if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
609 prep_object_replica_deletes(soid
, v
, h
, &work_started
);
611 prep_object_replica_pushes(soid
, v
, h
, &work_started
);
613 pgbackend
->run_recovery_op(h
, CEPH_MSG_PRIO_HIGH
);
617 void PrimaryLogPG::wait_for_unreadable_object(
618 const hobject_t
& soid
, OpRequestRef op
)
620 ceph_assert(is_unreadable_object(soid
));
621 maybe_kick_recovery(soid
);
622 waiting_for_unreadable_object
[soid
].push_back(op
);
623 op
->mark_delayed("waiting for missing object");
626 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
628 /* The conditions below may clear (on_local_recover, before we queue
629 * the transaction) before we actually requeue the degraded waiters
630 * in on_global_recover after the transaction completes.
632 if (waiting_for_degraded_object
.count(soid
))
634 if (recovery_state
.get_pg_log().get_missing().get_items().count(soid
))
636 ceph_assert(!get_acting_recovery_backfill().empty());
637 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
638 i
!= get_acting_recovery_backfill().end();
640 if (*i
== get_primary()) continue;
641 pg_shard_t peer
= *i
;
642 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(peer
);
643 // If an object is missing on an async_recovery_target, return false.
644 // This will not block the op and the object is async recovered later.
645 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
646 peer_missing_entry
->second
.get_items().count(soid
)) {
647 if (is_async_recovery_target(peer
))
652 // Object is degraded if after last_backfill AND
653 // we are backfilling it
654 if (is_backfill_target(peer
) &&
655 recovery_state
.get_peer_info(peer
).last_backfill
<= soid
&&
656 last_backfill_started
>= soid
&&
657 backfills_in_flight
.count(soid
))
663 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t
& soid
)
665 for (auto &i
: get_async_recovery_targets()) {
666 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(i
);
667 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
668 peer_missing_entry
->second
.get_items().count(soid
)) {
669 dout(30) << __func__
<< " " << soid
<< dendl
;
676 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
678 ceph_assert(is_degraded_or_backfilling_object(soid
) || is_degraded_on_async_recovery_target(soid
));
680 maybe_kick_recovery(soid
);
681 waiting_for_degraded_object
[soid
].push_back(op
);
682 op
->mark_delayed("waiting for degraded object");
685 void PrimaryLogPG::block_write_on_full_cache(
686 const hobject_t
& _oid
, OpRequestRef op
)
688 const hobject_t oid
= _oid
.get_head();
689 dout(20) << __func__
<< ": blocking object " << oid
690 << " on full cache" << dendl
;
691 objects_blocked_on_cache_full
.insert(oid
);
692 waiting_for_cache_not_full
.push_back(op
);
693 op
->mark_delayed("waiting for cache not full");
696 void PrimaryLogPG::block_for_clean(
697 const hobject_t
& oid
, OpRequestRef op
)
699 dout(20) << __func__
<< ": blocking object " << oid
700 << " on primary repair" << dendl
;
701 waiting_for_clean_to_primary_repair
.push_back(op
);
702 op
->mark_delayed("waiting for clean to repair");
705 void PrimaryLogPG::block_write_on_snap_rollback(
706 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
708 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
709 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
710 // otherwise, we'd have blocked in do_op
711 ceph_assert(oid
.is_head());
712 ceph_assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
713 objects_blocked_on_snap_promotion
[oid
] = obc
;
714 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
717 void PrimaryLogPG::block_write_on_degraded_snap(
718 const hobject_t
& snap
, OpRequestRef op
)
720 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
721 << " on degraded snap " << snap
<< dendl
;
722 // otherwise, we'd have blocked in do_op
723 ceph_assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
724 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
725 wait_for_degraded_object(snap
, op
);
728 bool PrimaryLogPG::maybe_await_blocked_head(
729 const hobject_t
&hoid
,
732 ObjectContextRef obc
;
733 obc
= object_contexts
.lookup(hoid
.get_head());
735 if (obc
->is_blocked()) {
736 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
745 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
747 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
748 waiting_for_blocked_object
[soid
].push_back(op
);
749 op
->mark_delayed("waiting for blocked object");
752 void PrimaryLogPG::maybe_force_recovery()
754 // no force if not in degraded/recovery/backfill states
755 if (!is_degraded() &&
756 !state_test(PG_STATE_RECOVERING
|
757 PG_STATE_RECOVERY_WAIT
|
758 PG_STATE_BACKFILLING
|
759 PG_STATE_BACKFILL_WAIT
|
760 PG_STATE_BACKFILL_TOOFULL
))
763 if (recovery_state
.get_pg_log().get_log().approx_size() <
764 cct
->_conf
->osd_max_pg_log_entries
*
765 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
768 // find the oldest missing object
769 version_t min_version
= recovery_state
.get_pg_log().get_log().head
.version
;
771 if (!recovery_state
.get_pg_log().get_missing().get_rmissing().empty()) {
772 min_version
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->first
;
773 soid
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->second
;
775 ceph_assert(!get_acting_recovery_backfill().empty());
776 for (set
<pg_shard_t
>::iterator it
= get_acting_recovery_backfill().begin();
777 it
!= get_acting_recovery_backfill().end();
779 if (*it
== get_primary()) continue;
780 pg_shard_t peer
= *it
;
781 auto it_missing
= recovery_state
.get_peer_missing().find(peer
);
782 if (it_missing
!= recovery_state
.get_peer_missing().end() &&
783 !it_missing
->second
.get_rmissing().empty()) {
784 const auto& min_obj
= recovery_state
.get_peer_missing(peer
).get_rmissing().begin();
785 dout(20) << __func__
<< " peer " << peer
<< " min_version " << min_obj
->first
786 << " oid " << min_obj
->second
<< dendl
;
787 if (min_version
> min_obj
->first
) {
788 min_version
= min_obj
->first
;
789 soid
= min_obj
->second
;
795 if (soid
!= hobject_t())
796 maybe_kick_recovery(soid
);
799 bool PrimaryLogPG::check_laggy(OpRequestRef
& op
)
801 if (!HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
803 dout(20) << __func__
<< " not all upacting has SERVER_OCTOPUS" << dendl
;
806 if (state_test(PG_STATE_WAIT
)) {
807 dout(10) << __func__
<< " PG is WAIT state" << dendl
;
808 } else if (!state_test(PG_STATE_LAGGY
)) {
809 auto mnow
= osd
->get_mnow();
810 auto ru
= recovery_state
.get_readable_until();
817 << " > readable_until " << ru
<< dendl
;
820 osd
->reply_op_error(op
, -EAGAIN
);
825 state_set(PG_STATE_LAGGY
);
826 publish_stats_to_osd();
828 dout(10) << __func__
<< " not readable" << dendl
;
829 waiting_for_readable
.push_back(op
);
830 op
->mark_delayed("waiting for readable");
834 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef
& op
)
836 if (!HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
840 if (!state_test(PG_STATE_WAIT
) && !state_test(PG_STATE_LAGGY
)) {
841 return true; // not laggy
843 dout(10) << __func__
<< " not readable" << dendl
;
844 waiting_for_readable
.push_front(op
);
845 op
->mark_delayed("waiting for readable");
849 void PrimaryLogPG::recheck_readable()
851 if (!is_wait() && !is_laggy()) {
852 dout(20) << __func__
<< " wasn't wait or laggy" << dendl
;
855 auto mnow
= osd
->get_mnow();
858 auto prior_readable_until_ub
= recovery_state
.get_prior_readable_until_ub();
859 if (mnow
< prior_readable_until_ub
) {
860 dout(10) << __func__
<< " still wait (mnow " << mnow
861 << " < prior_readable_until_ub " << prior_readable_until_ub
864 dout(10) << __func__
<< " no longer wait (mnow " << mnow
865 << " >= prior_readable_until_ub " << prior_readable_until_ub
867 state_clear(PG_STATE_WAIT
);
868 recovery_state
.clear_prior_readable_until_ub();
873 auto ru
= recovery_state
.get_readable_until();
874 if (ru
== ceph::signedspan::zero()) {
875 dout(10) << __func__
<< " still laggy (mnow " << mnow
876 << ", readable_until zero)" << dendl
;
877 } else if (mnow
>= ru
) {
878 dout(10) << __func__
<< " still laggy (mnow " << mnow
879 << " >= readable_until " << ru
<< ")" << dendl
;
881 dout(10) << __func__
<< " no longer laggy (mnow " << mnow
882 << " < readable_until " << ru
<< ")" << dendl
;
883 state_clear(PG_STATE_LAGGY
);
888 publish_stats_to_osd();
890 if (!is_laggy() && !is_wait()) {
891 requeue_ops(waiting_for_readable
);
895 bool PrimaryLogPG::pgls_filter(const PGLSFilter
& filter
, const hobject_t
& sobj
)
899 // If filter has expressed an interest in an xattr, load it.
900 if (!filter
.get_xattr().empty()) {
901 int ret
= pgbackend
->objects_get_attr(
905 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
.get_xattr() << ") returned " << ret
<< dendl
;
907 if (ret
!= -ENODATA
|| filter
.reject_empty_xattr()) {
913 return filter
.filter(sobj
, bl
);
916 std::pair
<int, std::unique_ptr
<const PGLSFilter
>>
917 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator
& iter
)
920 // storing non-const PGLSFilter for the sake of ::init()
921 std::unique_ptr
<PGLSFilter
> filter
;
926 catch (ceph::buffer::error
& e
) {
927 return { -EINVAL
, nullptr };
930 if (type
.compare("plain") == 0) {
931 filter
= std::make_unique
<PGLSPlainFilter
>();
933 std::size_t dot
= type
.find(".");
934 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
935 return { -EINVAL
, nullptr };
938 const std::string class_name
= type
.substr(0, dot
);
939 const std::string filter_name
= type
.substr(dot
+ 1);
940 ClassHandler::ClassData
*cls
= NULL
;
941 int r
= ClassHandler::get_instance().open_class(class_name
, &cls
);
943 derr
<< "Error opening class '" << class_name
<< "': "
944 << cpp_strerror(r
) << dendl
;
945 if (r
!= -EPERM
) // propagate permission error
947 return { r
, nullptr };
952 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
953 if (class_filter
== NULL
) {
954 derr
<< "Error finding filter '" << filter_name
<< "' in class "
955 << class_name
<< dendl
;
956 return { -EINVAL
, nullptr };
958 filter
.reset(class_filter
->fn());
960 // Object classes are obliged to return us something, but let's
961 // give an error rather than asserting out.
962 derr
<< "Buggy class " << class_name
<< " failed to construct "
963 "filter " << filter_name
<< dendl
;
964 return { -EINVAL
, nullptr };
969 int r
= filter
->init(iter
);
971 derr
<< "Error initializing filter " << type
<< ": "
972 << cpp_strerror(r
) << dendl
;
973 return { -EINVAL
, nullptr };
975 // Successfully constructed and initialized, return it.
976 return std::make_pair(0, std::move(filter
));
981 // ==========================================================
983 void PrimaryLogPG::do_command(
984 const string_view
& orig_prefix
,
985 const cmdmap_t
& cmdmap
,
986 const bufferlist
& idata
,
987 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
990 cmd_getval(cmdmap
, "format", format
);
991 std::unique_ptr
<Formatter
> f(Formatter::create(
992 format
, "json-pretty", "json-pretty"));
994 stringstream ss
; // stderr error message stream
995 bufferlist outbl
; // if empty at end, we'll dump formatter as output
998 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
999 // - ceph tell <pgid> foo -> prefix=foo
1000 string
prefix(orig_prefix
);
1002 cmd_getval(cmdmap
, "cmd", command
);
1003 if (command
.size()) {
1007 if (prefix
== "query") {
1008 f
->open_object_section("pg");
1009 f
->dump_stream("snap_trimq") << snap_trimq
;
1010 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
1011 recovery_state
.dump_peering_state(f
.get());
1013 f
->open_array_section("recovery_state");
1014 handle_query_state(f
.get());
1017 if (is_primary() && is_active() && m_scrubber
) {
1018 m_scrubber
->dump(f
.get());
1021 f
->open_object_section("agent_state");
1023 agent_state
->dump(f
.get());
1029 else if (prefix
== "mark_unfound_lost") {
1031 cmd_getval(cmdmap
, "mulcmd", mulcmd
);
1033 if (mulcmd
== "revert") {
1034 if (pool
.info
.is_erasure()) {
1035 ss
<< "mode must be 'delete' for ec pool";
1039 mode
= pg_log_entry_t::LOST_REVERT
;
1040 } else if (mulcmd
== "delete") {
1041 mode
= pg_log_entry_t::LOST_DELETE
;
1043 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1047 ceph_assert(mode
== pg_log_entry_t::LOST_REVERT
||
1048 mode
== pg_log_entry_t::LOST_DELETE
);
1050 if (!is_primary()) {
1051 ss
<< "not primary";
1056 uint64_t unfound
= recovery_state
.get_missing_loc().num_unfound();
1058 ss
<< "pg has no unfound objects";
1059 goto out
; // make command idempotent
1062 if (!recovery_state
.all_unfound_are_queried_or_lost(get_osdmap())) {
1063 ss
<< "pg has " << unfound
1064 << " unfound objects but we haven't probed all sources, not marking lost";
1069 mark_all_unfound_lost(mode
, on_finish
);
1073 else if (prefix
== "list_unfound") {
1076 bool show_offset
= false;
1077 if (cmd_getval(cmdmap
, "offset", offset_json
)) {
1078 json_spirit::Value v
;
1080 if (!json_spirit::read(offset_json
, v
))
1081 throw std::runtime_error("bad json");
1083 } catch (std::runtime_error
& e
) {
1084 ss
<< "error parsing offset: " << e
.what();
1090 f
->open_object_section("missing");
1092 f
->open_object_section("offset");
1093 offset
.dump(f
.get());
1096 auto &needs_recovery_map
= recovery_state
.get_missing_loc()
1097 .get_needs_recovery();
1098 f
->dump_int("num_missing", needs_recovery_map
.size());
1099 f
->dump_int("num_unfound", get_num_unfound());
1100 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1101 needs_recovery_map
.upper_bound(offset
);
1103 f
->open_array_section("objects");
1105 for (; p
!= needs_recovery_map
.end() &&
1106 num
< cct
->_conf
->osd_command_max_records
;
1108 if (recovery_state
.get_missing_loc().is_unfound(p
->first
)) {
1109 f
->open_object_section("object");
1111 f
->open_object_section("oid");
1112 p
->first
.dump(f
.get());
1115 p
->second
.dump(f
.get()); // have, need keys
1117 f
->open_array_section("locations");
1118 for (auto &&r
: recovery_state
.get_missing_loc().get_locations(
1120 f
->dump_stream("shard") << r
;
1130 // Get possible locations of missing objects from pg information
1131 PeeringState::QueryUnfound
q(f
.get());
1132 recovery_state
.handle_event(q
, 0);
1133 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1137 else if (prefix
== "scrub" ||
1138 prefix
== "deep_scrub") {
1139 bool deep
= (prefix
== "deep_scrub");
1141 cmd_getval(cmdmap
, "time", time
, (int64_t)0);
1144 const pg_pool_t
*p
= &pool
.info
;
1145 double pool_scrub_max_interval
= 0;
1146 double scrub_max_interval
;
1148 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
1149 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1150 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
1152 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
1153 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1154 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
1156 // Instead of marking must_scrub force a schedule scrub
1157 utime_t stamp
= ceph_clock_now();
1159 stamp
-= scrub_max_interval
;
1161 stamp
-= (float)time
;
1162 stamp
-= 100.0; // push back last scrub more for good measure
1164 set_last_deep_scrub_stamp(stamp
);
1166 set_last_scrub_stamp(stamp
);
1168 f
->open_object_section("result");
1169 f
->dump_bool("deep", deep
);
1170 f
->dump_stream("stamp") << stamp
;
1173 ss
<< "Not primary";
1176 outbl
.append(ss
.str());
1181 ss
<< "prefix '" << prefix
<< "' not implemented";
1185 if (ret
>= 0 && outbl
.length() == 0) {
1188 on_finish(ret
, ss
.str(), outbl
);
1192 // ==========================================================
1194 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1196 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1197 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1198 dout(10) << "do_pg_op " << *m
<< dendl
;
1203 string cname
, mname
;
1205 snapid_t snapid
= m
->get_snapid();
1207 vector
<OSDOp
> ops
= m
->ops
;
1209 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1210 std::unique_ptr
<const PGLSFilter
> filter
;
1212 auto bp
= p
->indata
.cbegin();
1214 case CEPH_OSD_OP_PGNLS_FILTER
:
1219 catch (const ceph::buffer::error
& e
) {
1220 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1224 std::tie(result
, filter
) = get_pgls_filter(bp
);
1228 ceph_assert(filter
);
1232 case CEPH_OSD_OP_PGNLS
:
1233 if (snapid
!= CEPH_NOSNAP
) {
1237 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1238 dout(10) << " pgnls pg=" << m
->get_pg()
1239 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1240 << " != " << info
.pgid
<< dendl
;
1243 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1246 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
1248 // read into a buffer
1249 vector
<hobject_t
> sentries
;
1250 pg_nls_response_t response
;
1252 decode(response
.handle
, bp
);
1254 catch (const ceph::buffer::error
& e
) {
1255 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1261 hobject_t lower_bound
= response
.handle
;
1262 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1263 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1264 dout(10) << " pgnls lower_bound " << lower_bound
1265 << " pg_end " << pg_end
<< dendl
;
1266 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1267 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1268 // this should only happen with a buggy client.
1269 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1275 hobject_t current
= lower_bound
;
1276 int r
= pgbackend
->objects_list_partial(
1287 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1288 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1289 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1290 hobject_t _max
= hobject_t::get_max();
1292 const hobject_t
&mcand
=
1293 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1295 missing_iter
->first
;
1296 const hobject_t
&lcand
=
1297 ls_iter
== sentries
.end() ?
1301 hobject_t candidate
;
1302 if (mcand
== lcand
) {
1304 if (!mcand
.is_max()) {
1308 } else if (mcand
< lcand
) {
1310 ceph_assert(!mcand
.is_max());
1314 ceph_assert(!lcand
.is_max());
1318 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1319 << " vs lower bound 0x" << lower_bound
.get_hash()
1320 << std::dec
<< dendl
;
1322 if (candidate
>= next
) {
1326 if (response
.entries
.size() == list_size
) {
1331 if (candidate
.snap
!= CEPH_NOSNAP
)
1334 // skip internal namespace
1335 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1338 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1341 // skip wrong namespace
1342 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1343 candidate
.get_namespace() != m
->get_hobj().nspace
)
1346 if (filter
&& !pgls_filter(*filter
, candidate
))
1349 dout(20) << "pgnls item 0x" << std::hex
1350 << candidate
.get_hash()
1351 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1353 << candidate
.oid
.name
<< dendl
;
1355 librados::ListObjectImpl item
;
1356 item
.nspace
= candidate
.get_namespace();
1357 item
.oid
= candidate
.oid
.name
;
1358 item
.locator
= candidate
.get_key();
1359 response
.entries
.push_back(item
);
1362 if (next
.is_max() &&
1363 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1364 ls_iter
== sentries
.end()) {
1367 // Set response.handle to the start of the next PG according
1368 // to the object sort order.
1369 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1371 response
.handle
= next
;
1373 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1374 encode(response
, osd_op
.outdata
);
1375 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1376 << osd_op
.outdata
.length() << dendl
;
1380 case CEPH_OSD_OP_PGLS_FILTER
:
1385 catch (const ceph::buffer::error
& e
) {
1386 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1390 std::tie(result
, filter
) = get_pgls_filter(bp
);
1394 ceph_assert(filter
);
1398 case CEPH_OSD_OP_PGLS
:
1399 if (snapid
!= CEPH_NOSNAP
) {
1403 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1404 dout(10) << " pgls pg=" << m
->get_pg()
1405 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1406 << " != " << info
.pgid
<< dendl
;
1409 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1412 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1413 // read into a buffer
1414 vector
<hobject_t
> sentries
;
1415 pg_ls_response_t response
;
1417 decode(response
.handle
, bp
);
1419 catch (const ceph::buffer::error
& e
) {
1420 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1426 hobject_t current
= response
.handle
;
1427 int r
= pgbackend
->objects_list_partial(
1438 ceph_assert(snapid
== CEPH_NOSNAP
|| recovery_state
.get_pg_log().get_missing().get_items().empty());
1440 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1441 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1442 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1443 hobject_t _max
= hobject_t::get_max();
1445 const hobject_t
&mcand
=
1446 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1448 missing_iter
->first
;
1449 const hobject_t
&lcand
=
1450 ls_iter
== sentries
.end() ?
1454 hobject_t candidate
;
1455 if (mcand
== lcand
) {
1457 if (!mcand
.is_max()) {
1461 } else if (mcand
< lcand
) {
1463 ceph_assert(!mcand
.is_max());
1467 ceph_assert(!lcand
.is_max());
1471 if (candidate
>= next
) {
1475 if (response
.entries
.size() == list_size
) {
1480 if (candidate
.snap
!= CEPH_NOSNAP
)
1483 // skip wrong namespace
1484 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1487 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1490 if (filter
&& !pgls_filter(*filter
, candidate
))
1493 response
.entries
.push_back(make_pair(candidate
.oid
,
1494 candidate
.get_key()));
1496 if (next
.is_max() &&
1497 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1498 ls_iter
== sentries
.end()) {
1501 response
.handle
= next
;
1502 encode(response
, osd_op
.outdata
);
1503 dout(10) << " pgls result=" << result
<< " outdata.length()="
1504 << osd_op
.outdata
.length() << dendl
;
1508 case CEPH_OSD_OP_PG_HITSET_LS
:
1510 list
< pair
<utime_t
,utime_t
> > ls
;
1511 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1512 p
!= info
.hit_set
.history
.end();
1514 ls
.push_back(make_pair(p
->begin
, p
->end
));
1516 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1517 encode(ls
, osd_op
.outdata
);
1521 case CEPH_OSD_OP_PG_HITSET_GET
:
1523 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1524 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1525 // read the current in-memory HitSet, not the version we've
1531 encode(*hit_set
, osd_op
.outdata
);
1532 result
= osd_op
.outdata
.length();
1534 // read an archived HitSet.
1536 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1537 p
!= info
.hit_set
.history
.end();
1539 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1540 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1544 if (oid
== hobject_t()) {
1548 if (!pool
.info
.is_replicated()) {
1549 // FIXME: EC not supported yet
1550 result
= -EOPNOTSUPP
;
1553 if (is_unreadable_object(oid
)) {
1554 wait_for_unreadable_object(oid
, op
);
1557 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1562 case CEPH_OSD_OP_SCRUBLS
:
1563 result
= do_scrub_ls(m
, &osd_op
);
1576 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(),
1577 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1579 reply
->claim_op_out_data(ops
);
1580 reply
->set_result(result
);
1581 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1582 osd
->send_message_osd_client(reply
, m
->get_connection());
1585 int PrimaryLogPG::do_scrub_ls(const MOSDOp
*m
, OSDOp
*osd_op
)
1587 if (m
->get_pg() != info
.pgid
.pgid
) {
1588 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1589 return -EINVAL
; // hmm?
1591 auto bp
= osd_op
->indata
.cbegin();
1595 } catch (ceph::buffer::error
&) {
1596 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1601 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1603 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1606 bool store_queried
= m_scrubber
&& m_scrubber
->get_store_errors(arg
, result
);
1607 if (store_queried
) {
1608 encode(result
, osd_op
->outdata
);
1610 // the scrubber's store is not initialized
1621 * @param manager [in] manager with locks to release
1623 void PrimaryLogPG::release_object_locks(
1624 ObcLockManager
&lock_manager
) {
1625 std::list
<std::pair
<ObjectContextRef
, std::list
<OpRequestRef
> > > to_req
;
1626 bool requeue_recovery
= false;
1627 bool requeue_snaptrim
= false;
1628 lock_manager
.put_locks(
1632 if (requeue_recovery
)
1634 if (requeue_snaptrim
)
1635 snap_trimmer_machine
.process_event(TrimWriteUnblocked());
1637 if (!to_req
.empty()) {
1638 // requeue at front of scrub blocking queue if we are blocked by scrub
1639 for (auto &&p
: to_req
) {
1640 if (m_scrubber
->write_blocked_by_scrub(p
.first
->obs
.oi
.soid
.get_head())) {
1641 for (auto& op
: p
.second
) {
1642 op
->mark_delayed("waiting for scrub");
1645 waiting_for_scrub
.splice(
1646 waiting_for_scrub
.begin(),
1650 } else if (is_laggy()) {
1651 for (auto& op
: p
.second
) {
1652 op
->mark_delayed("waiting for readable");
1654 waiting_for_readable
.splice(
1655 waiting_for_readable
.begin(),
1660 requeue_ops(p
.second
);
1666 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1667 const PGPool
&_pool
,
1668 const map
<string
,string
>& ec_profile
, spg_t p
) :
1669 PG(o
, curmap
, _pool
, p
),
1671 PGBackend::build_pg_backend(
1672 _pool
.info
, ec_profile
, this, coll_t(p
), ch
, o
->store
, cct
)),
1673 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1674 new_backfill(false),
1676 snap_trimmer_machine(this)
1678 recovery_state
.set_backend_predicates(
1679 pgbackend
->get_is_readable_predicate(),
1680 pgbackend
->get_is_recoverable_predicate());
1681 snap_trimmer_machine
.initiate();
1683 m_scrubber
= make_unique
<PrimaryLogScrub
>(this);
1686 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1689 if (oloc
.key
.empty())
1690 src_oloc
.key
= oid
.name
;
1693 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1695 auto m
= op
->get_req
<MOSDBackoff
>();
1696 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1699 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1700 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1701 if (begin
< m
->begin
) {
1707 dout(10) << __func__
<< " backoff ack id " << m
->id
1708 << " [" << begin
<< "," << end
<< ")" << dendl
;
1709 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1712 void PrimaryLogPG::do_request(
1714 ThreadPool::TPHandle
&handle
)
1716 if (op
->osd_trace
) {
1717 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1718 op
->pg_trace
.event("do request");
1721 if (op
->osd_parent_span
) {
1722 auto do_req_span
= jaeger_tracing::child_span(__func__
, op
->osd_parent_span
);
1725 // make sure we have a new enough map
1726 auto p
= waiting_for_map
.find(op
->get_source());
1727 if (p
!= waiting_for_map
.end()) {
1728 // preserve ordering
1729 dout(20) << __func__
<< " waiting_for_map "
1730 << p
->first
<< " not empty, queueing" << dendl
;
1731 p
->second
.push_back(op
);
1732 op
->mark_delayed("waiting_for_map not empty");
1735 if (!have_same_or_newer_map(op
->min_epoch
)) {
1736 dout(20) << __func__
<< " min " << op
->min_epoch
1737 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1738 waiting_for_map
[op
->get_source()].push_back(op
);
1739 op
->mark_delayed("op must wait for map");
1740 osd
->request_osdmap_update(op
->min_epoch
);
1744 if (can_discard_request(op
)) {
1749 const Message
*m
= op
->get_req();
1750 int msg_type
= m
->get_type();
1751 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1752 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1755 if (msg_type
== CEPH_MSG_OSD_OP
) {
1756 if (session
->check_backoff(cct
, info
.pgid
,
1757 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1764 (!is_active() && is_peered());
1765 if (g_conf()->osd_backoff_on_peering
&& !backoff
) {
1771 add_pg_backoff(session
);
1775 // pg backoff acks at pg-level
1776 if (msg_type
== CEPH_MSG_OSD_BACKOFF
) {
1777 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1778 if (ba
->begin
!= ba
->end
) {
1786 // Delay unless PGBackend says it's ok
1787 if (pgbackend
->can_handle_while_inactive(op
)) {
1788 bool handled
= pgbackend
->handle_message(op
);
1789 ceph_assert(handled
);
1792 waiting_for_peered
.push_back(op
);
1793 op
->mark_delayed("waiting for peered");
1798 if (recovery_state
.needs_flush()) {
1799 dout(20) << "waiting for flush on " << op
<< dendl
;
1800 waiting_for_flush
.push_back(op
);
1801 op
->mark_delayed("waiting for flush");
1805 ceph_assert(is_peered() && !recovery_state
.needs_flush());
1806 if (pgbackend
->handle_message(op
))
1810 case CEPH_MSG_OSD_OP
:
1811 case CEPH_MSG_OSD_BACKOFF
:
1813 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1814 waiting_for_active
.push_back(op
);
1815 op
->mark_delayed("waiting for active");
1819 case CEPH_MSG_OSD_OP
:
1820 // verify client features
1821 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1822 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1823 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1828 case CEPH_MSG_OSD_BACKOFF
:
1829 // object-level backoff acks handled in osdop context
1835 case MSG_OSD_PG_SCAN
:
1836 do_scan(op
, handle
);
1839 case MSG_OSD_PG_BACKFILL
:
1843 case MSG_OSD_PG_BACKFILL_REMOVE
:
1844 do_backfill_remove(op
);
1847 case MSG_OSD_SCRUB_RESERVE
:
1850 osd
->reply_op_error(op
, -EAGAIN
);
1853 auto m
= op
->get_req
<MOSDScrubReserve
>();
1855 case MOSDScrubReserve::REQUEST
:
1856 m_scrubber
->handle_scrub_reserve_request(op
);
1858 case MOSDScrubReserve::GRANT
:
1859 m_scrubber
->handle_scrub_reserve_grant(op
, m
->from
);
1861 case MOSDScrubReserve::REJECT
:
1862 m_scrubber
->handle_scrub_reserve_reject(op
, m
->from
);
1864 case MOSDScrubReserve::RELEASE
:
1865 m_scrubber
->handle_scrub_reserve_release(op
);
1871 case MSG_OSD_REP_SCRUB
:
1872 replica_scrub(op
, handle
);
1875 case MSG_OSD_REP_SCRUBMAP
:
1876 do_replica_scrub_map(op
);
1879 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1880 do_update_log_missing(op
);
1883 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1884 do_update_log_missing_reply(op
);
1888 ceph_abort_msg("bad message type in do_request");
1892 /** do_op - do an op
1893 * pg lock will be held (if multithreaded)
1894 * osd_lock NOT held.
1896 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1899 // NOTE: take a non-const pointer here; we must be careful not to
1900 // change anything that will break other reads on m (operator<<).
1901 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1902 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1903 if (m
->finish_decode()) {
1904 op
->reset_desc(); // for TrackedOp
1908 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1910 const hobject_t head
= m
->get_hobj().get_head();
1912 if (!info
.pgid
.pgid
.contains(
1913 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1914 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1915 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1916 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1917 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1919 ceph_assert(!cct
->_conf
->osd_debug_misdirected_ops
);
1924 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
1925 ceph::ref_t
<Session
> session
;
1927 session
= static_cast<Session
*>(m
->get_connection()->get_priv().get());
1928 if (!session
.get()) {
1929 dout(10) << __func__
<< " no session" << dendl
;
1933 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
1938 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
1940 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
1941 osd
->reply_op_error(op
, -EINVAL
);
1946 int r
= op
->maybe_init_op_info(*get_osdmap());
1948 osd
->reply_op_error(op
, r
);
1953 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
1954 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
1956 !(op
->may_write() || op
->may_cache())) {
1957 // balanced reads; any replica will do
1958 if (!(is_primary() || is_nonprimary())) {
1959 osd
->handle_misdirected_op(this, op
);
1963 // normal case; must be primary
1964 if (!is_primary()) {
1965 osd
->handle_misdirected_op(this, op
);
1970 if (!check_laggy(op
)) {
1974 if (!op_has_sufficient_caps(op
)) {
1975 osd
->reply_op_error(op
, -EPERM
);
1979 if (op
->includes_pg_op()) {
1980 return do_pg_op(op
);
1983 // object name too long?
1984 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
1985 dout(4) << "do_op name is longer than "
1986 << cct
->_conf
->osd_max_object_name_len
1987 << " bytes" << dendl
;
1988 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1991 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
1992 dout(4) << "do_op locator is longer than "
1993 << cct
->_conf
->osd_max_object_name_len
1994 << " bytes" << dendl
;
1995 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1998 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
1999 dout(4) << "do_op namespace is longer than "
2000 << cct
->_conf
->osd_max_object_namespace_len
2001 << " bytes" << dendl
;
2002 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2005 if (m
->get_hobj().oid
.name
.empty()) {
2006 dout(4) << "do_op empty oid name is not allowed" << dendl
;
2007 osd
->reply_op_error(op
, -EINVAL
);
2011 if (int r
= osd
->store
->validate_hobject_key(head
)) {
2012 dout(4) << "do_op object " << head
<< " invalid for backing store: "
2014 osd
->reply_op_error(op
, r
);
2019 if (get_osdmap()->is_blocklisted(m
->get_source_addr())) {
2020 dout(10) << "do_op " << m
->get_source_addr() << " is blocklisted" << dendl
;
2021 osd
->reply_op_error(op
, -EBLOCKLISTED
);
2025 // order this op as a write?
2026 bool write_ordered
= op
->rwordered();
2028 // discard due to cluster full transition? (we discard any op that
2029 // originates before the cluster or pool is marked full; the client
2030 // will resend after the full flag is removed or if they expect the
2031 // op to succeed despite being full). The except is FULL_FORCE and
2032 // FULL_TRY ops, which there is no reason to discard because they
2033 // bypass all full checks anyway. If this op isn't write or
2034 // read-ordered, we skip.
2035 // FIXME: we exclude mds writes for now.
2036 if (write_ordered
&& !(m
->get_source().is_mds() ||
2037 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
2038 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
2039 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
2040 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
2044 // mds should have stopped writing before this point.
2045 // We can't allow OSD to become non-startable even if mds
2046 // could be writing as part of file removals.
2047 if (write_ordered
&& osd
->check_failsafe_full(get_dpp()) &&
2048 !m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
2049 dout(10) << __func__
<< " fail-safe full check failed, dropping request." << dendl
;
2052 int64_t poolid
= get_pgid().pool();
2053 if (op
->may_write()) {
2055 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
2061 if (m
->get_snapid() != CEPH_NOSNAP
) {
2062 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
2063 osd
->reply_op_error(op
, -EINVAL
);
2068 if (cct
->_conf
->osd_max_write_size
&&
2069 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
2070 // journal can't hold commit!
2071 derr
<< "do_op msg data len " << m
->get_data_len()
2072 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
2073 << " on " << *m
<< dendl
;
2074 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
2079 dout(10) << "do_op " << *m
2080 << (op
->may_write() ? " may_write" : "")
2081 << (op
->may_read() ? " may_read" : "")
2082 << (op
->may_cache() ? " may_cache" : "")
2083 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2084 << " flags " << ceph_osd_flag_string(m
->get_flags())
2088 if (op
->osd_parent_span
) {
2089 auto do_op_span
= jaeger_tracing::child_span(__func__
, op
->osd_parent_span
);
2093 if (is_unreadable_object(head
)) {
2094 if (!is_primary()) {
2095 osd
->reply_op_error(op
, -EAGAIN
);
2099 (g_conf()->osd_backoff_on_degraded
||
2100 (g_conf()->osd_backoff_on_unfound
&&
2101 recovery_state
.get_missing_loc().is_unfound(head
)))) {
2102 add_backoff(session
, head
, head
);
2103 maybe_kick_recovery(head
);
2105 wait_for_unreadable_object(head
, op
);
2110 if (write_ordered
) {
2112 if (is_degraded_or_backfilling_object(head
)) {
2113 if (can_backoff
&& g_conf()->osd_backoff_on_degraded
) {
2114 add_backoff(session
, head
, head
);
2115 maybe_kick_recovery(head
);
2117 wait_for_degraded_object(head
, op
);
2122 if (m_scrubber
->is_scrub_active() && m_scrubber
->write_blocked_by_scrub(head
)) {
2123 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2124 waiting_for_scrub
.push_back(op
);
2125 op
->mark_delayed("waiting for scrub");
2128 if (!check_laggy_requeue(op
)) {
2133 if (auto blocked_iter
= objects_blocked_on_degraded_snap
.find(head
);
2134 blocked_iter
!= std::end(objects_blocked_on_degraded_snap
)) {
2135 hobject_t
to_wait_on(head
);
2136 to_wait_on
.snap
= blocked_iter
->second
;
2137 wait_for_degraded_object(to_wait_on
, op
);
2140 if (auto blocked_snap_promote_iter
= objects_blocked_on_snap_promotion
.find(head
);
2141 blocked_snap_promote_iter
!= std::end(objects_blocked_on_snap_promotion
)) {
2142 wait_for_blocked_object(blocked_snap_promote_iter
->second
->obs
.oi
.soid
, op
);
2145 if (objects_blocked_on_cache_full
.count(head
)) {
2146 block_write_on_full_cache(head
, op
);
2152 if (op
->may_write() || op
->may_cache()) {
2153 // warning: we will get back *a* request for this reqid, but not
2154 // necessarily the most recent. this happens with flush and
2155 // promote ops, but we can't possible have both in our log where
2156 // the original request is still not stable on disk, so for our
2157 // purposes here it doesn't matter which one we get.
2159 version_t user_version
;
2160 int return_code
= 0;
2161 vector
<pg_log_op_return_item_t
> op_returns
;
2162 bool got
= check_in_progress_op(
2163 m
->get_reqid(), &version
, &user_version
, &return_code
, &op_returns
);
2165 dout(3) << __func__
<< " dup " << m
->get_reqid()
2166 << " version " << version
<< dendl
;
2167 if (already_complete(version
)) {
2168 osd
->reply_op_error(op
, return_code
, version
, user_version
, op_returns
);
2170 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2171 // always queue ondisk waiters, so that we can requeue if needed
2172 waiting_for_ondisk
[version
].emplace_back(op
, user_version
, return_code
,
2174 op
->mark_delayed("waiting for ondisk");
2180 ObjectContextRef obc
;
2181 bool can_create
= op
->may_write();
2182 hobject_t missing_oid
;
2184 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2185 const hobject_t
& oid
=
2186 m
->get_snapid() == CEPH_SNAPDIR
? head
: m
->get_hobj();
2188 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2189 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2192 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
) {
2193 if (m
->get_snapid() != CEPH_SNAPDIR
) {
2194 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2195 osd
->reply_op_error(op
, -EINVAL
);
2199 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2200 dout(10) << "non-LIST_SNAPS on snapdir" << dendl
;
2201 osd
->reply_op_error(op
, -EINVAL
);
2207 // io blocked on obc?
2208 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2209 maybe_await_blocked_head(oid
, op
)) {
2213 if (!is_primary()) {
2214 if (!recovery_state
.can_serve_replica_read(oid
)) {
2215 dout(20) << __func__
2216 << ": unstable write on replica, bouncing to primary "
2218 osd
->reply_op_error(op
, -EAGAIN
);
2221 dout(20) << __func__
<< ": serving replica read on oid " << oid
2225 int r
= find_object_context(
2226 oid
, &obc
, can_create
,
2227 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2230 // LIST_SNAPS needs the ssc too
2232 m
->get_snapid() == CEPH_SNAPDIR
&&
2234 obc
->ssc
= get_snapset_context(oid
, true);
2238 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2239 // we have to wait for the object.
2241 // missing the specific snap we need; requeue and wait.
2242 ceph_assert(!op
->may_write()); // only happens on a read/cache
2243 wait_for_unreadable_object(missing_oid
, op
);
2246 } else if (r
== 0) {
2247 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2248 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2249 << " is unreadable, waiting" << dendl
;
2250 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2254 // degraded object? (the check above was for head; this could be a clone)
2255 if (write_ordered
&&
2256 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2257 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2258 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2259 << " is degraded, waiting" << dendl
;
2260 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2265 bool in_hit_set
= false;
2268 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2271 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2274 if (!op
->hitset_inserted
) {
2275 hit_set
->insert(oid
);
2276 op
->hitset_inserted
= true;
2277 if (hit_set
->is_full() ||
2278 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2285 if (agent_choose_mode(false, op
))
2289 if (obc
.get() && obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2290 if (recover_adjacent_clones(obc
, op
)) {
2293 if (maybe_handle_manifest(op
,
2299 if (maybe_handle_cache(op
,
2308 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2309 // copy the reqids for copy get on ENOENT
2311 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2312 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2315 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2316 if (op
->may_write() &&
2317 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2318 record_write_error(op
, oid
, nullptr, r
);
2320 osd
->reply_op_error(op
, r
);
2325 // make sure locator is consistent
2326 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2327 if (m
->get_object_locator() != oloc
) {
2328 dout(10) << " provided locator " << m
->get_object_locator()
2329 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2330 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2331 << " on object " << oloc
2335 // io blocked on obc?
2336 if (obc
->is_blocked() &&
2337 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2338 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2342 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2344 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2346 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2347 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2348 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2349 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2351 // verify there is in fact a flush in progress
2352 // FIXME: we could make this a stronger test.
2353 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2354 if (p
== flush_ops
.end()) {
2355 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2356 reply_ctx(ctx
, -EINVAL
);
2359 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2360 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2361 op
->mark_delayed("waiting for rw locks");
2365 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2368 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2369 if (op
->may_write() &&
2370 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2371 record_write_error(op
, oid
, nullptr, r
,
2372 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
2374 osd
->reply_op_error(op
, r
);
2380 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2381 ctx
->ignore_cache
= true;
2384 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2385 // This object is lost. Reading from it returns an error.
2386 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2387 << " is lost" << dendl
;
2388 reply_ctx(ctx
, -ENFILE
);
2391 if (!op
->may_write() &&
2393 (!obc
->obs
.exists
||
2394 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2395 obc
->obs
.oi
.is_whiteout()))) {
2396 // copy the reqids for copy get on ENOENT
2397 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2398 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2402 reply_ctx(ctx
, -ENOENT
);
2409 utime_t prepare_latency
= ceph_clock_now();
2410 prepare_latency
-= op
->get_dequeued_time();
2411 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2412 if (op
->may_read() && op
->may_write()) {
2413 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2414 } else if (op
->may_read()) {
2415 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2416 } else if (op
->may_write() || op
->may_cache()) {
2417 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2420 // force recovery of the oldest missing object if too many logs
2421 maybe_force_recovery();
2424 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2427 ObjectContextRef obc
)
2430 if (op
->get_req
<MOSDOp
>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2431 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2432 return cache_result_t::NOOP
;
2435 // if it is write-ordered and blocked, stop now
2436 if (obc
->is_blocked() && write_ordered
) {
2437 // we're already doing something with this object
2438 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2439 return cache_result_t::NOOP
;
2442 vector
<OSDOp
> ops
= op
->get_req
<MOSDOp
>()->ops
;
2443 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2445 ceph_osd_op
& op
= osd_op
.op
;
2446 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
||
2447 op
.op
== CEPH_OSD_OP_SET_CHUNK
||
2448 op
.op
== CEPH_OSD_OP_UNSET_MANIFEST
||
2449 op
.op
== CEPH_OSD_OP_TIER_PROMOTE
||
2450 op
.op
== CEPH_OSD_OP_TIER_FLUSH
||
2451 op
.op
== CEPH_OSD_OP_TIER_EVICT
) {
2452 return cache_result_t::NOOP
;
2456 switch (obc
->obs
.oi
.manifest
.type
) {
2457 case object_manifest_t::TYPE_REDIRECT
:
2458 if (op
->may_write() || write_ordered
) {
2459 do_proxy_write(op
, obc
);
2462 if (obc
->obs
.oi
.size
!= 0) {
2463 return cache_result_t::NOOP
;
2465 do_proxy_read(op
, obc
);
2467 return cache_result_t::HANDLED_PROXY
;
2468 case object_manifest_t::TYPE_CHUNKED
:
2470 if (can_proxy_chunked_read(op
, obc
)) {
2471 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2472 if (p
!= flush_ops
.end()) {
2473 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, true);
2474 return cache_result_t::HANDLED_PROXY
;
2476 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, write_ordered
);
2477 return cache_result_t::HANDLED_PROXY
;
2480 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2481 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
2482 hobject_t head
= m
->get_hobj();
2484 if (is_degraded_or_backfilling_object(head
)) {
2485 dout(20) << __func__
<< ": " << head
<< " is degraded, waiting" << dendl
;
2486 wait_for_degraded_object(head
, op
);
2487 return cache_result_t::BLOCKED_RECOVERY
;
2490 if (m_scrubber
->write_blocked_by_scrub(head
)) {
2491 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2492 waiting_for_scrub
.push_back(op
);
2493 op
->mark_delayed("waiting for scrub");
2494 return cache_result_t::BLOCKED_RECOVERY
;
2496 if (!check_laggy_requeue(op
)) {
2497 return cache_result_t::BLOCKED_RECOVERY
;
2500 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2501 if (p
.second
.is_missing()) {
2502 auto m
= op
->get_req
<MOSDOp
>();
2503 const object_locator_t oloc
= m
->get_object_locator();
2504 promote_object(obc
, obc
->obs
.oi
.soid
, oloc
, op
, NULL
);
2505 return cache_result_t::BLOCKED_PROMOTE
;
2508 return cache_result_t::NOOP
;
2511 ceph_abort_msg("unrecognized manifest type");
2514 return cache_result_t::NOOP
;
2517 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2518 MOSDOpReply
*orig_reply
, int r
,
2519 OpContext
*ctx_for_op_returns
)
2521 dout(20) << __func__
<< " r=" << r
<< dendl
;
2522 ceph_assert(op
->may_write());
2523 const osd_reqid_t
&reqid
= op
->get_req
<MOSDOp
>()->get_reqid();
2524 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2525 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2526 get_next_version(), eversion_t(), 0,
2527 reqid
, utime_t(), r
));
2528 if (ctx_for_op_returns
) {
2529 entries
.back().set_op_returns(*ctx_for_op_returns
->ops
);
2530 dout(20) << __func__
<< " op_returns=" << entries
.back().op_returns
<< dendl
;
2536 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2541 MOSDOpReply
*orig_reply
,
2544 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2547 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2548 auto m
= op
->get_req
<MOSDOp
>();
2549 MOSDOpReply
*reply
= orig_reply
.detach();
2550 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2551 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2555 ObcLockManager lock_manager
;
2558 std::move(lock_manager
),
2559 std::optional
<std::function
<void(void)> >(
2560 OnComplete(this, op
, orig_reply
, r
)),
2565 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2568 ObjectContextRef obc
,
2569 int r
, hobject_t missing_oid
,
2572 ObjectContextRef
*promote_obc
)
2574 // return quickly if caching is not enabled
2575 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2576 return cache_result_t::NOOP
;
2580 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2581 (op
->get_req
<MOSDOp
>()->get_flags() &
2582 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2583 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2584 return cache_result_t::NOOP
;
2587 must_promote
= must_promote
|| op
->need_promote();
2590 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2591 << (obc
->obs
.exists
? "exists" : "DNE")
2592 << " missing_oid " << missing_oid
2593 << " must_promote " << (int)must_promote
2594 << " in_hit_set " << (int)in_hit_set
2597 dout(25) << __func__
<< " (no obc)"
2598 << " missing_oid " << missing_oid
2599 << " must_promote " << (int)must_promote
2600 << " in_hit_set " << (int)in_hit_set
2603 // if it is write-ordered and blocked, stop now
2604 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2605 // we're already doing something with this object
2606 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2607 return cache_result_t::NOOP
;
2610 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2611 // we know this object is logically absent (e.g., an undefined clone)
2612 return cache_result_t::NOOP
;
2615 if (obc
.get() && obc
->obs
.exists
) {
2616 osd
->logger
->inc(l_osd_op_cache_hit
);
2617 return cache_result_t::NOOP
;
2619 if (!is_primary()) {
2620 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2621 osd
->reply_op_error(op
, -EAGAIN
);
2622 return cache_result_t::REPLIED_WITH_EAGAIN
;
2625 if (missing_oid
== hobject_t() && obc
.get()) {
2626 missing_oid
= obc
->obs
.oi
.soid
;
2629 auto m
= op
->get_req
<MOSDOp
>();
2630 const object_locator_t oloc
= m
->get_object_locator();
2632 if (op
->need_skip_handle_cache()) {
2633 return cache_result_t::NOOP
;
2636 OpRequestRef promote_op
;
2638 switch (pool
.info
.cache_mode
) {
2639 case pg_pool_t::CACHEMODE_WRITEBACK
:
2641 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2642 if (!op
->may_write() && !op
->may_cache() &&
2643 !write_ordered
&& !must_promote
) {
2644 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2646 return cache_result_t::HANDLED_PROXY
;
2648 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2649 block_write_on_full_cache(missing_oid
, op
);
2650 return cache_result_t::BLOCKED_FULL
;
2653 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2654 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2655 return cache_result_t::BLOCKED_PROMOTE
;
2658 if (op
->may_write() || op
->may_cache()) {
2662 if (!op
->need_skip_promote() &&
2663 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2664 pool
.info
.min_write_recency_for_promote
,
2667 return cache_result_t::BLOCKED_PROMOTE
;
2669 return cache_result_t::HANDLED_PROXY
;
2673 // Avoid duplicate promotion
2674 if (obc
.get() && obc
->is_blocked()) {
2677 return cache_result_t::BLOCKED_PROMOTE
;
2681 if (!op
->need_skip_promote()) {
2682 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2683 pool
.info
.min_read_recency_for_promote
,
2684 promote_op
, promote_obc
);
2687 return cache_result_t::HANDLED_PROXY
;
2689 ceph_abort_msg("unreachable");
2690 return cache_result_t::NOOP
;
2692 case pg_pool_t::CACHEMODE_READONLY
:
2693 // TODO: clean this case up
2694 if (!obc
.get() && r
== -ENOENT
) {
2695 // we don't have the object and op's a read
2696 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2697 return cache_result_t::BLOCKED_PROMOTE
;
2699 if (!r
) { // it must be a write
2700 do_cache_redirect(op
);
2701 return cache_result_t::HANDLED_REDIRECT
;
2703 // crap, there was a failure of some kind
2704 return cache_result_t::NOOP
;
2706 case pg_pool_t::CACHEMODE_FORWARD
:
2707 // this mode is deprecated; proxy instead
2708 case pg_pool_t::CACHEMODE_PROXY
:
2709 if (!must_promote
) {
2710 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2712 return cache_result_t::HANDLED_PROXY
;
2715 return cache_result_t::HANDLED_PROXY
;
2718 // ugh, we're forced to promote.
2720 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2721 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2722 block_write_on_full_cache(missing_oid
, op
);
2723 return cache_result_t::BLOCKED_FULL
;
2725 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2726 return cache_result_t::BLOCKED_PROMOTE
;
2728 case pg_pool_t::CACHEMODE_READFORWARD
:
2729 // this mode is deprecated; proxy instead
2730 case pg_pool_t::CACHEMODE_READPROXY
:
2731 // Do writeback to the cache tier for writes
2732 if (op
->may_write() || write_ordered
|| must_promote
) {
2734 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2735 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2736 block_write_on_full_cache(missing_oid
, op
);
2737 return cache_result_t::BLOCKED_FULL
;
2739 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2740 return cache_result_t::BLOCKED_PROMOTE
;
2743 // If it is a read, we can read, we need to proxy it
2745 return cache_result_t::HANDLED_PROXY
;
2748 ceph_abort_msg("unrecognized cache_mode");
2750 return cache_result_t::NOOP
;
2753 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2754 const hobject_t
& missing_oid
,
2755 const object_locator_t
& oloc
,
2758 OpRequestRef promote_op
,
2759 ObjectContextRef
*promote_obc
)
2761 dout(20) << __func__
<< " missing_oid " << missing_oid
2762 << " in_hit_set " << in_hit_set
<< dendl
;
2768 // Check if in the current hit set
2778 unsigned count
= (int)in_hit_set
;
2780 // Check if in other hit sets
2781 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
2782 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2783 agent_state
->hit_set_map
.rbegin();
2784 itor
!= agent_state
->hit_set_map
.rend();
2786 if (!itor
->second
->contains(oid
)) {
2790 if (count
>= recency
) {
2795 if (count
>= recency
) {
2798 return false; // not promoting
2803 if (osd
->promote_throttle()) {
2804 dout(10) << __func__
<< " promote throttled" << dendl
;
2807 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
2811 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2813 auto m
= op
->get_req
<MOSDOp
>();
2814 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
2815 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
, get_osdmap_epoch(),
2817 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2818 reply
->set_redirect(redir
);
2819 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2821 m
->get_connection()->send_message(reply
);
2825 struct C_ProxyRead
: public Context
{
2828 epoch_t last_peering_reset
;
2830 PrimaryLogPG::ProxyReadOpRef prdop
;
2832 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2833 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2834 : pg(p
), oid(o
), last_peering_reset(lpr
),
2835 tid(0), prdop(prd
), start(ceph_clock_now())
2837 void finish(int r
) override
{
2838 if (prdop
->canceled
)
2840 std::scoped_lock locker
{*pg
};
2841 if (prdop
->canceled
) {
2844 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2845 pg
->finish_proxy_read(oid
, tid
, r
);
2846 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2851 struct C_ProxyChunkRead
: public Context
{
2854 epoch_t last_peering_reset
;
2856 PrimaryLogPG::ProxyReadOpRef prdop
;
2858 ObjectOperation
*obj_op
;
2860 uint64_t req_offset
= 0;
2861 ObjectContextRef obc
;
2862 uint64_t req_total_len
= 0;
2863 C_ProxyChunkRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2864 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2865 : pg(p
), oid(o
), last_peering_reset(lpr
),
2866 tid(0), prdop(prd
), start(ceph_clock_now()), obj_op(NULL
)
2868 void finish(int r
) override
{
2869 if (prdop
->canceled
)
2871 std::scoped_lock locker
{*pg
};
2872 if (prdop
->canceled
) {
2875 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2877 if (!prdop
->ops
[op_index
].outdata
.length()) {
2878 ceph_assert(req_total_len
);
2880 bufferptr
bptr(req_total_len
);
2881 list
.push_back(std::move(bptr
));
2882 prdop
->ops
[op_index
].outdata
.append(list
);
2884 ceph_assert(obj_op
);
2885 uint64_t copy_offset
;
2886 if (req_offset
>= prdop
->ops
[op_index
].op
.extent
.offset
) {
2887 copy_offset
= req_offset
- prdop
->ops
[op_index
].op
.extent
.offset
;
2891 prdop
->ops
[op_index
].outdata
.begin(copy_offset
).copy_in(
2892 obj_op
->ops
[0].outdata
.length(),
2893 obj_op
->ops
[0].outdata
.c_str());
2896 pg
->finish_proxy_read(oid
, tid
, r
);
2897 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2905 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
2907 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2908 // stash the result in the request's OSDOp vector
2909 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2910 object_locator_t oloc
;
2912 /* extensible tier */
2913 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2914 switch (obc
->obs
.oi
.manifest
.type
) {
2915 case object_manifest_t::TYPE_REDIRECT
:
2916 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2917 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2920 ceph_abort_msg("unrecognized manifest type");
2924 soid
= m
->get_hobj();
2925 oloc
= object_locator_t(m
->get_object_locator());
2926 oloc
.pool
= pool
.info
.tier_of
;
2928 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
2930 // pass through some original flags that make sense.
2931 // - leave out redirection and balancing flags since we are
2932 // already proxying through the primary
2933 // - leave off read/write/exec flags that are derived from the op
2934 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
2935 CEPH_OSD_FLAG_ORDERSNAP
|
2936 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
2937 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
2939 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
2941 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
2943 ObjectOperation obj_op
;
2944 obj_op
.dup(prdop
->ops
);
2946 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
2947 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
2948 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
2949 ceph_osd_op op
= obj_op
.ops
[i
].op
;
2951 case CEPH_OSD_OP_READ
:
2952 case CEPH_OSD_OP_SYNC_READ
:
2953 case CEPH_OSD_OP_SPARSE_READ
:
2954 case CEPH_OSD_OP_CHECKSUM
:
2955 case CEPH_OSD_OP_CMPEXT
:
2956 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
2957 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
2962 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
2964 ceph_tid_t tid
= osd
->objecter
->read(
2965 soid
.oid
, oloc
, obj_op
,
2966 m
->get_snapid(), NULL
,
2967 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
2968 &prdop
->user_version
,
2969 &prdop
->data_offset
,
2972 prdop
->objecter_tid
= tid
;
2973 proxyread_ops
[tid
] = prdop
;
2974 in_progress_proxy_ops
[soid
].push_back(op
);
2977 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
2979 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2980 << " " << cpp_strerror(r
) << dendl
;
2982 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
2983 if (p
== proxyread_ops
.end()) {
2984 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
2987 ProxyReadOpRef prdop
= p
->second
;
2988 if (tid
!= prdop
->objecter_tid
) {
2989 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
2990 << " tid " << prdop
->objecter_tid
<< dendl
;
2993 if (oid
!= prdop
->soid
) {
2994 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
2995 << " soid " << prdop
->soid
<< dendl
;
2998 proxyread_ops
.erase(tid
);
3000 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
3001 if (q
== in_progress_proxy_ops
.end()) {
3002 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3005 ceph_assert(q
->second
.size());
3006 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
3009 ceph_assert(it
!= q
->second
.end());
3010 OpRequestRef op
= *it
;
3011 q
->second
.erase(it
);
3012 if (q
->second
.size() == 0) {
3013 in_progress_proxy_ops
.erase(oid
);
3014 } else if (std::find(q
->second
.begin(),
3016 prdop
->op
) != q
->second
.end()) {
3017 /* multiple read case */
3018 dout(20) << __func__
<< " " << oid
<< " is not completed " << dendl
;
3022 osd
->logger
->inc(l_osd_tier_proxy_read
);
3024 auto m
= op
->get_req
<MOSDOp
>();
3025 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
3026 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3027 ctx
->user_at_version
= prdop
->user_version
;
3028 ctx
->data_off
= prdop
->data_offset
;
3029 ctx
->ignore_log_op_stats
= true;
3030 complete_read_ctx(r
, ctx
);
3033 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
3035 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
3036 if (p
== in_progress_proxy_ops
.end())
3039 list
<OpRequestRef
>& ls
= p
->second
;
3040 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
3042 in_progress_proxy_ops
.erase(p
);
3045 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
3046 vector
<ceph_tid_t
> *tids
)
3048 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
3049 prdop
->canceled
= true;
3051 // cancel objecter op, if we can
3052 if (prdop
->objecter_tid
) {
3053 tids
->push_back(prdop
->objecter_tid
);
3054 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
3055 prdop
->ops
[i
].outdata
.clear();
3057 proxyread_ops
.erase(prdop
->objecter_tid
);
3058 prdop
->objecter_tid
= 0;
3062 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3064 dout(10) << __func__
<< dendl
;
3066 // cancel proxy reads
3067 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
3068 while (p
!= proxyread_ops
.end()) {
3069 cancel_proxy_read((p
++)->second
, tids
);
3072 // cancel proxy writes
3073 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
3074 while (q
!= proxywrite_ops
.end()) {
3075 cancel_proxy_write((q
++)->second
, tids
);
3079 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
3080 in_progress_proxy_ops
.begin();
3081 while (p
!= in_progress_proxy_ops
.end()) {
3082 list
<OpRequestRef
>& ls
= p
->second
;
3083 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
3084 << " requests" << dendl
;
3086 in_progress_proxy_ops
.erase(p
++);
3089 in_progress_proxy_ops
.clear();
3093 struct C_ProxyWrite_Commit
: public Context
{
3096 epoch_t last_peering_reset
;
3098 PrimaryLogPG::ProxyWriteOpRef pwop
;
3099 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3100 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
3101 : pg(p
), oid(o
), last_peering_reset(lpr
),
3104 void finish(int r
) override
{
3107 std::scoped_lock locker
{*pg
};
3108 if (pwop
->canceled
) {
3111 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3112 pg
->finish_proxy_write(oid
, tid
, r
);
3117 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, ObjectContextRef obc
)
3119 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3120 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3121 object_locator_t oloc
;
3122 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
3124 /* extensible tier */
3125 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3126 switch (obc
->obs
.oi
.manifest
.type
) {
3127 case object_manifest_t::TYPE_REDIRECT
:
3128 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3129 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3132 ceph_abort_msg("unrecognized manifest type");
3136 soid
= m
->get_hobj();
3137 oloc
= object_locator_t(m
->get_object_locator());
3138 oloc
.pool
= pool
.info
.tier_of
;
3141 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3142 if (!(op
->may_write() || op
->may_cache())) {
3143 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3145 if (op
->allows_returnvec()) {
3146 flags
|= CEPH_OSD_FLAG_RETURNVEC
;
3149 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3151 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3152 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3153 pwop
->mtime
= m
->get_mtime();
3155 ObjectOperation obj_op
;
3156 obj_op
.dup(pwop
->ops
);
3158 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3159 this, soid
, get_last_peering_reset(), pwop
);
3160 ceph_tid_t tid
= osd
->objecter
->mutate(
3161 soid
.oid
, oloc
, obj_op
, snapc
,
3162 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3163 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3164 &pwop
->user_version
, pwop
->reqid
);
3166 pwop
->objecter_tid
= tid
;
3167 proxywrite_ops
[tid
] = pwop
;
3168 in_progress_proxy_ops
[soid
].push_back(op
);
3171 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op
, const hobject_t
& missing_oid
,
3172 ObjectContextRef obc
, bool write_ordered
)
3174 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3175 OSDOp
*osd_op
= NULL
;
3176 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3177 osd_op
= &m
->ops
[i
];
3178 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3179 uint64_t op_length
= osd_op
->op
.extent
.offset
+ osd_op
->op
.extent
.length
;
3180 uint64_t chunk_length
= 0, chunk_index
= 0, req_len
= 0;
3181 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3182 map
<uint64_t, map
<uint64_t, uint64_t>> chunk_read
;
3184 while (cursor
< op_length
) {
3187 /* find the right chunk position for cursor */
3188 for (auto &p
: manifest
->chunk_map
) {
3189 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3190 chunk_length
= p
.second
.length
;
3191 chunk_index
= p
.first
;
3196 if (!chunk_index
&& !chunk_length
) {
3197 if (cursor
== osd_op
->op
.extent
.offset
) {
3198 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, this);
3199 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3200 ctx
->data_off
= osd_op
->op
.extent
.offset
;
3201 ctx
->ignore_log_op_stats
= true;
3202 complete_read_ctx(0, ctx
);
3206 uint64_t next_length
= chunk_length
;
3207 /* the size to read -> | op length | */
3209 if (cursor
+ next_length
> op_length
) {
3210 next_length
= op_length
- cursor
;
3212 /* the size to read -> | op length | */
3214 if (cursor
+ next_length
> chunk_index
+ chunk_length
) {
3215 next_length
= chunk_index
+ chunk_length
- cursor
;
3218 chunk_read
[cursor
] = {{chunk_index
, next_length
}};
3219 cursor
+= next_length
;
3222 req_len
= cursor
- osd_op
->op
.extent
.offset
;
3223 for (auto &p
: chunk_read
) {
3224 auto chunks
= p
.second
.begin();
3225 dout(20) << __func__
<< " chunk_index: " << chunks
->first
3226 << " next_length: " << chunks
->second
<< " cursor: "
3227 << p
.first
<< dendl
;
3228 do_proxy_chunked_read(op
, obc
, i
, chunks
->first
, p
.first
, chunks
->second
, req_len
, write_ordered
);
3233 struct RefCountCallback
: public Context
{
3235 PrimaryLogPG::OpContext
*ctx
;
3237 bool requeue
= false;
3239 RefCountCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
3240 : ctx(ctx
), osd_op(osd_op
) {}
3241 void finish(int r
) override
{
3242 // NB: caller must already have pg->lock held
3243 ctx
->obc
->stop_block();
3244 ctx
->pg
->kick_object_context_blocked(ctx
->obc
);
3247 ctx
->pg
->execute_ctx(ctx
);
3249 // on cancel simply toss op out,
3250 // or requeue as requested
3251 if (r
!= -ECANCELED
) {
3253 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
3254 } else if (requeue
) {
3256 ctx
->pg
->requeue_op(ctx
->op
);
3258 ctx
->pg
->close_op_ctx(ctx
);
3261 void set_requeue(bool rq
) {
3266 struct SetManifestFinisher
: public PrimaryLogPG::OpFinisher
{
3269 explicit SetManifestFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
3272 int execute() override
{
3277 struct C_SetManifestRefCountDone
: public Context
{
3279 PrimaryLogPG::ManifestOpRef mop
;
3281 C_SetManifestRefCountDone(PrimaryLogPG
*p
,
3282 PrimaryLogPG::ManifestOpRef mop
, hobject_t soid
) :
3283 pg(p
), mop(mop
), soid(soid
) {}
3284 void finish(int r
) override
{
3285 if (r
== -ECANCELED
)
3287 std::scoped_lock locker
{*pg
};
3288 auto it
= pg
->manifest_ops
.find(soid
);
3289 if (it
== pg
->manifest_ops
.end()) {
3290 // raced with cancel_manifest_ops
3293 if (it
->second
->cb
) {
3294 it
->second
->cb
->complete(r
);
3296 pg
->manifest_ops
.erase(it
);
3301 struct C_SetDedupChunks
: public Context
{
3304 epoch_t last_peering_reset
;
3308 C_SetDedupChunks(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
, uint64_t offset
)
3309 : pg(p
), oid(o
), last_peering_reset(lpr
),
3310 tid(0), offset(offset
)
3312 void finish(int r
) override
{
3313 if (r
== -ECANCELED
)
3315 std::scoped_lock locker
{*pg
};
3316 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
3319 pg
->finish_set_dedup(oid
, r
, tid
, offset
);
3323 void PrimaryLogPG::cancel_manifest_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3325 dout(10) << __func__
<< dendl
;
3326 auto p
= manifest_ops
.begin();
3327 while (p
!= manifest_ops
.end()) {
3328 auto mop
= p
->second
;
3329 // cancel objecter op, if we can
3330 if (mop
->objecter_tid
) {
3331 tids
->push_back(mop
->objecter_tid
);
3332 mop
->objecter_tid
= 0;
3335 mop
->cb
->set_requeue(requeue
);
3336 mop
->cb
->complete(-ECANCELED
);
3338 manifest_ops
.erase(p
++);
3342 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc
, std::string
& fp_oid
)
3346 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3347 if (p
.second
.oid
.oid
.name
== fp_oid
) {
3352 SnapSet
& ss
= obc
->ssc
->snapset
;
3353 const OSDMapRef
& osdmap
= get_osdmap();
3354 for (vector
<snapid_t
>::const_reverse_iterator p
= ss
.clones
.rbegin();
3355 p
!= ss
.clones
.rend();
3357 object_ref_delta_t refs
;
3358 ObjectContextRef obc_l
= nullptr;
3359 ObjectContextRef obc_g
= nullptr;
3360 hobject_t clone_oid
= obc
->obs
.oi
.soid
;
3361 clone_oid
.snap
= *p
;
3362 if (osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
3365 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
3369 get_adjacent_clones(clone_obc
, obc_l
, obc_g
);
3370 clone_obc
->obs
.oi
.manifest
.calc_refs_to_inc_on_set(
3371 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr ,
3374 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3375 if (p
->first
.oid
.name
== fp_oid
&& p
->second
> 0) {
3384 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc
, OpRequestRef op
)
3386 if (!obc
->obs
.oi
.manifest
.is_chunked() || !obc
->ssc
|| !obc
->ssc
->snapset
.clones
.size()) {
3390 const SnapSet
& snapset
= obc
->ssc
->snapset
;
3391 auto s
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), obc
->obs
.oi
.soid
.snap
);
3392 auto is_unreadable_snap
= [this, obc
, &snapset
, op
](auto iter
) -> bool {
3393 hobject_t cid
= obc
->obs
.oi
.soid
;
3394 cid
.snap
= (iter
== snapset
.clones
.end()) ? snapid_t(CEPH_NOSNAP
) : *iter
;
3395 if (is_unreadable_object(cid
)) {
3396 dout(10) << __func__
<< ": clone " << cid
3397 << " is unreadable, waiting" << dendl
;
3398 wait_for_unreadable_object(cid
, op
);
3403 if (s
!= snapset
.clones
.begin()) {
3404 if (is_unreadable_snap(s
- 1)) {
3408 if (s
!= snapset
.clones
.end()) {
3409 if (is_unreadable_snap(s
+ 1)) {
3416 ObjectContextRef
PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc
)
3418 auto s
= std::find(obc
->ssc
->snapset
.clones
.begin(), obc
->ssc
->snapset
.clones
.end(),
3419 obc
->obs
.oi
.soid
.snap
);
3420 if (s
!= obc
->ssc
->snapset
.clones
.begin()) {
3421 auto s_iter
= s
- 1;
3422 hobject_t cid
= obc
->obs
.oi
.soid
;
3423 object_ref_delta_t refs
;
3425 ObjectContextRef cobc
= get_object_context(cid
, false, NULL
);
3432 void PrimaryLogPG::dec_refcount(const hobject_t
& soid
, const object_ref_delta_t
& refs
)
3434 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3435 int dec_ref_count
= p
->second
;
3436 ceph_assert(dec_ref_count
< 0);
3437 while (dec_ref_count
< 0) {
3438 dout(10) << __func__
<< ": decrement reference on offset oid: " << p
->first
<< dendl
;
3439 refcount_manifest(soid
, p
->first
,
3440 refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
3447 void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc
,
3448 ObjectContextRef
& _l
, ObjectContextRef
& _g
)
3450 const SnapSet
& snapset
= src_obc
->ssc
->snapset
;
3451 const object_info_t
& oi
= src_obc
->obs
.oi
;
3453 auto get_context
= [this, &oi
, &snapset
](auto iter
)
3454 -> ObjectContextRef
{
3455 hobject_t cid
= oi
.soid
;
3456 cid
.snap
= (iter
== snapset
.clones
.end()) ? snapid_t(CEPH_NOSNAP
) : *iter
;
3457 ObjectContextRef obc
= get_object_context(cid
, false, NULL
);
3462 // check adjacent clones
3463 auto s
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), oi
.soid
.snap
);
3465 // We *must* find the clone iff it's not head,
3466 // let s == snapset.clones.end() mean head
3467 ceph_assert((s
== snapset
.clones
.end()) == oi
.soid
.is_head());
3469 if (s
!= snapset
.clones
.begin()) {
3470 _l
= get_context(s
- 1);
3473 if (s
!= snapset
.clones
.end()) {
3474 _g
= get_context(s
+ 1);
3478 bool PrimaryLogPG::inc_refcount_by_set(OpContext
* ctx
, object_manifest_t
& set_chunk
,
3481 object_ref_delta_t refs
;
3482 ObjectContextRef obc_l
, obc_g
;
3483 get_adjacent_clones(ctx
->obc
, obc_l
, obc_g
);
3484 set_chunk
.calc_refs_to_inc_on_set(
3485 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
3486 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
3488 if (!refs
.is_empty()) {
3489 /* This is called by set-chunk, so we only consider a single chunk for the time being */
3490 ceph_assert(refs
.size() == 1);
3491 auto p
= refs
.begin();
3492 int inc_ref_count
= p
->second
;
3493 if (inc_ref_count
> 0) {
3495 * In set-chunk case, the first thing we should do is to increment
3496 * the reference the targe object has prior to update object_manifest in object_info_t.
3497 * So, call directly refcount_manifest.
3499 ManifestOpRef mop
= std::make_shared
<ManifestOp
>(new RefCountCallback(ctx
, osd_op
));
3500 C_SetManifestRefCountDone
* fin
= new C_SetManifestRefCountDone(this, mop
, ctx
->obs
->oi
.soid
);
3501 ceph_tid_t tid
= refcount_manifest(ctx
->obs
->oi
.soid
, p
->first
,
3502 refcount_t::INCREMENT_REF
, fin
, std::nullopt
);
3503 mop
->objecter_tid
= tid
;
3504 manifest_ops
[ctx
->obs
->oi
.soid
] = mop
;
3505 ctx
->obc
->start_block();
3507 } else if (inc_ref_count
< 0) {
3508 hobject_t src
= ctx
->obs
->oi
.soid
;
3509 hobject_t tgt
= p
->first
;
3510 ctx
->register_on_commit(
3512 refcount_manifest(src
, tgt
, refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
3521 void PrimaryLogPG::dec_refcount_by_dirty(OpContext
* ctx
)
3523 object_ref_delta_t refs
;
3524 ObjectContextRef cobc
= nullptr;
3525 ObjectContextRef obc
= ctx
->obc
;
3526 for (auto &p
: ctx
->obs
->oi
.manifest
.chunk_map
) {
3527 if (!ctx
->clean_regions
.is_clean_region(p
.first
, p
.second
.length
)) {
3528 ctx
->new_obs
.oi
.manifest
.chunk_map
.erase(p
.first
);
3529 if (ctx
->new_obs
.oi
.manifest
.chunk_map
.empty()) {
3530 ctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
3531 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
3532 ctx
->delta_stats
.num_objects_manifest
--;
3536 // Look over previous snapshot, then figure out whether updated chunk needs to be deleted
3537 cobc
= get_prev_clone_obc(obc
);
3538 obc
->obs
.oi
.manifest
.calc_refs_to_drop_on_modify(
3539 cobc
? &cobc
->obs
.oi
.manifest
: nullptr,
3542 if (!refs
.is_empty()) {
3543 hobject_t soid
= obc
->obs
.oi
.soid
;
3544 ctx
->register_on_commit(
3545 [soid
, this, refs
](){
3546 dec_refcount(soid
, refs
);
3551 void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t
& oi
, OpContext
* ctx
)
3553 ceph_assert(oi
.has_manifest());
3554 ceph_assert(ctx
->obc
->ssc
);
3556 if (oi
.manifest
.is_chunked()) {
3557 object_ref_delta_t refs
;
3558 ObjectContextRef obc_l
, obc_g
;
3559 get_adjacent_clones(ctx
->obc
, obc_l
, obc_g
);
3560 oi
.manifest
.calc_refs_to_drop_on_removal(
3561 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
3562 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
3565 if (!refs
.is_empty()) {
3566 hobject_t soid
= ctx
->obc
->obs
.oi
.soid
;
3567 ctx
->register_on_commit(
3568 [soid
, this, refs
](){
3569 dec_refcount(soid
, refs
);
3572 } else if (oi
.manifest
.is_redirect() &&
3573 oi
.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
3574 ctx
->register_on_commit(
3576 refcount_manifest(oi
.soid
, oi
.manifest
.redirect_target
,
3577 refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
3582 ceph_tid_t
PrimaryLogPG::refcount_manifest(hobject_t src_soid
, hobject_t tgt_soid
, refcount_t type
,
3583 Context
*cb
, std::optional
<bufferlist
> chunk
)
3585 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
|
3586 CEPH_OSD_FLAG_RWORDERED
;
3588 dout(10) << __func__
<< " Start refcount from " << src_soid
3589 << " to " << tgt_soid
<< dendl
;
3591 ObjectOperation obj_op
;
3593 if (type
== refcount_t::INCREMENT_REF
) {
3594 cls_cas_chunk_get_ref_op call
;
3595 call
.source
= src_soid
.get_head();
3597 obj_op
.call("cas", "chunk_get_ref", in
);
3598 } else if (type
== refcount_t::DECREMENT_REF
) {
3599 cls_cas_chunk_put_ref_op call
;
3600 call
.source
= src_soid
.get_head();
3602 obj_op
.call("cas", "chunk_put_ref", in
);
3603 } else if (type
== refcount_t::CREATE_OR_GET_REF
) {
3604 cls_cas_chunk_create_or_get_ref_op get_call
;
3605 get_call
.source
= src_soid
.get_head();
3607 get_call
.data
= move(*chunk
);
3608 ::encode(get_call
, in
);
3609 obj_op
.call("cas", "chunk_create_or_get_ref", in
);
3611 ceph_assert(0 == "unrecognized type");
3614 Context
*c
= nullptr;
3616 c
= new C_OnFinisher(cb
, osd
->get_objecter_finisher(get_pg_shard()));
3619 object_locator_t
oloc(tgt_soid
);
3620 ObjectContextRef src_obc
= get_object_context(src_soid
, false, NULL
);
3621 ceph_assert(src_obc
);
3622 auto tid
= osd
->objecter
->mutate(
3623 tgt_soid
.oid
, oloc
, obj_op
, SnapContext(),
3624 ceph::real_clock::from_ceph_timespec(src_obc
->obs
.oi
.mtime
),
3629 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
, int op_index
,
3630 uint64_t chunk_index
, uint64_t req_offset
, uint64_t req_length
,
3631 uint64_t req_total_len
, bool write_ordered
)
3633 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3634 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3635 if (!manifest
->chunk_map
.count(chunk_index
)) {
3638 uint64_t chunk_length
= manifest
->chunk_map
[chunk_index
].length
;
3639 hobject_t soid
= manifest
->chunk_map
[chunk_index
].oid
;
3640 hobject_t ori_soid
= m
->get_hobj();
3641 object_locator_t
oloc(soid
);
3642 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3643 if (write_ordered
) {
3644 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3647 if (!chunk_length
|| soid
== hobject_t()) {
3651 /* same as do_proxy_read() */
3652 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3653 CEPH_OSD_FLAG_ORDERSNAP
|
3654 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3655 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3657 dout(10) << __func__
<< " Start do chunk proxy read for " << *m
3658 << " index: " << op_index
<< " oid: " << soid
.oid
.name
<< " req_offset: " << req_offset
3659 << " req_length: " << req_length
<< dendl
;
3661 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, ori_soid
, m
->ops
));
3663 ObjectOperation
*pobj_op
= new ObjectOperation
;
3664 OSDOp
&osd_op
= pobj_op
->add_op(m
->ops
[op_index
].op
.op
);
3666 if (chunk_index
<= req_offset
) {
3667 osd_op
.op
.extent
.offset
= manifest
->chunk_map
[chunk_index
].offset
+ req_offset
- chunk_index
;
3669 ceph_abort_msg("chunk_index > req_offset");
3671 osd_op
.op
.extent
.length
= req_length
;
3673 ObjectOperation obj_op
;
3674 obj_op
.dup(pobj_op
->ops
);
3676 C_ProxyChunkRead
*fin
= new C_ProxyChunkRead(this, ori_soid
, get_last_peering_reset(),
3678 fin
->obj_op
= pobj_op
;
3679 fin
->op_index
= op_index
;
3680 fin
->req_offset
= req_offset
;
3682 fin
->req_total_len
= req_total_len
;
3684 ceph_tid_t tid
= osd
->objecter
->read(
3685 soid
.oid
, oloc
, obj_op
,
3686 m
->get_snapid(), NULL
,
3687 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3688 &prdop
->user_version
,
3689 &prdop
->data_offset
,
3692 prdop
->objecter_tid
= tid
;
3693 proxyread_ops
[tid
] = prdop
;
3694 in_progress_proxy_ops
[ori_soid
].push_back(op
);
3697 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
)
3699 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3700 OSDOp
*osd_op
= NULL
;
3702 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3703 osd_op
= &m
->ops
[i
];
3704 ceph_osd_op op
= osd_op
->op
;
3706 case CEPH_OSD_OP_READ
:
3707 case CEPH_OSD_OP_SYNC_READ
: {
3708 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3709 uint64_t remain
= osd_op
->op
.extent
.length
;
3711 /* requested chunks exist in chunk_map ? */
3712 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3713 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3714 if (!p
.second
.is_missing()) {
3717 if (p
.second
.length
>= remain
) {
3721 remain
= remain
- p
.second
.length
;
3723 cursor
+= p
.second
.length
;
3728 dout(20) << __func__
<< " requested chunks don't exist in chunk_map " << dendl
;
3740 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
3742 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3743 << " " << cpp_strerror(r
) << dendl
;
3745 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
3746 if (p
== proxywrite_ops
.end()) {
3747 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
3750 ProxyWriteOpRef pwop
= p
->second
;
3751 ceph_assert(tid
== pwop
->objecter_tid
);
3752 ceph_assert(oid
== pwop
->soid
);
3754 proxywrite_ops
.erase(tid
);
3756 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3757 if (q
== in_progress_proxy_ops
.end()) {
3758 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3763 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3764 ceph_assert(in_progress_op
.size());
3765 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3766 in_progress_op
.end(),
3768 ceph_assert(it
!= in_progress_op
.end());
3769 in_progress_op
.erase(it
);
3770 if (in_progress_op
.size() == 0) {
3771 in_progress_proxy_ops
.erase(oid
);
3772 } else if (std::find(in_progress_op
.begin(),
3773 in_progress_op
.end(),
3774 pwop
->op
) != in_progress_op
.end()) {
3778 dout(20) << __func__
<< " " << oid
<< " tid " << tid
3779 << " in_progress_op size: "
3780 << in_progress_op
.size() << dendl
;
3784 osd
->logger
->inc(l_osd_tier_proxy_write
);
3786 auto m
= pwop
->op
->get_req
<MOSDOp
>();
3787 ceph_assert(m
!= NULL
);
3789 if (!pwop
->sent_reply
) {
3791 assert(pwop
->ctx
->reply
== nullptr);
3792 MOSDOpReply
*reply
= new MOSDOpReply(m
, r
, get_osdmap_epoch(), 0,
3793 true /* we claim it below */);
3794 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3795 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3796 reply
->claim_op_out_data(pwop
->ops
);
3797 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3798 osd
->send_message_osd_client(reply
, m
->get_connection());
3799 pwop
->sent_reply
= true;
3800 pwop
->ctx
->op
->mark_commit_sent();
3807 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
,
3808 vector
<ceph_tid_t
> *tids
)
3810 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3811 pwop
->canceled
= true;
3813 // cancel objecter op, if we can
3814 if (pwop
->objecter_tid
) {
3815 tids
->push_back(pwop
->objecter_tid
);
3818 proxywrite_ops
.erase(pwop
->objecter_tid
);
3819 pwop
->objecter_tid
= 0;
3823 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3824 ObjectContextRef obc
;
3828 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3831 start(ceph_clock_now()) {}
3833 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3834 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3835 int r
= results
.get
<0>();
3836 pg
->finish_promote(r
, results_data
, obc
);
3837 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3841 class PromoteManifestCallback
: public PrimaryLogPG::CopyCallback
{
3842 ObjectContextRef obc
;
3845 PrimaryLogPG::OpContext
*ctx
;
3846 PrimaryLogPG::CopyCallbackResults promote_results
;
3848 PromoteManifestCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
, PrimaryLogPG::OpContext
*ctx
= NULL
)
3851 start(ceph_clock_now()), ctx(ctx
) {}
3853 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3854 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3855 int r
= results
.get
<0>();
3857 promote_results
= results
;
3858 pg
->execute_ctx(ctx
);
3860 pg
->finish_promote_manifest(r
, results_data
, obc
);
3862 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3864 friend struct PromoteFinisher
;
3867 struct PromoteFinisher
: public PrimaryLogPG::OpFinisher
{
3868 PromoteManifestCallback
*promote_callback
;
3870 explicit PromoteFinisher(PromoteManifestCallback
*promote_callback
)
3871 : promote_callback(promote_callback
) {
3874 int execute() override
{
3875 if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
3876 promote_callback
->ctx
->pg
->finish_promote(promote_callback
->promote_results
.get
<0>(),
3877 promote_callback
->promote_results
.get
<1>(),
3878 promote_callback
->obc
);
3879 } else if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
3880 promote_callback
->ctx
->pg
->finish_promote_manifest(promote_callback
->promote_results
.get
<0>(),
3881 promote_callback
->promote_results
.get
<1>(),
3882 promote_callback
->obc
);
3884 ceph_abort_msg("unrecognized manifest type");
3890 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
3891 const hobject_t
& missing_oid
,
3892 const object_locator_t
& oloc
,
3894 ObjectContextRef
*promote_obc
)
3896 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
3897 ceph_assert(hoid
!= hobject_t());
3898 if (m_scrubber
->write_blocked_by_scrub(hoid
)) {
3899 dout(10) << __func__
<< " " << hoid
3900 << " blocked by scrub" << dendl
;
3902 waiting_for_scrub
.push_back(op
);
3903 op
->mark_delayed("waiting for scrub");
3904 dout(10) << __func__
<< " " << hoid
3905 << " placing op in waiting_for_scrub" << dendl
;
3907 dout(10) << __func__
<< " " << hoid
3908 << " no op, dropping on the floor" << dendl
;
3912 if (op
&& !check_laggy_requeue(op
)) {
3915 if (!obc
) { // we need to create an ObjectContext
3916 ceph_assert(missing_oid
!= hobject_t());
3917 obc
= get_object_context(missing_oid
, true);
3923 * Before promote complete, if there are proxy-reads for the object,
3924 * for this case we don't use DONTNEED.
3926 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
3927 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
3928 if (q
== in_progress_proxy_ops
.end()) {
3929 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
3933 object_locator_t my_oloc
;
3935 if (!obc
->obs
.oi
.has_manifest()) {
3937 my_oloc
.pool
= pool
.info
.tier_of
;
3938 src_hoid
= obc
->obs
.oi
.soid
;
3939 cb
= new PromoteCallback(obc
, this);
3941 if (obc
->obs
.oi
.manifest
.is_chunked()) {
3942 src_hoid
= obc
->obs
.oi
.soid
;
3943 cb
= new PromoteManifestCallback(obc
, this);
3944 } else if (obc
->obs
.oi
.manifest
.is_redirect()) {
3945 object_locator_t
src_oloc(obc
->obs
.oi
.manifest
.redirect_target
);
3947 src_hoid
= obc
->obs
.oi
.manifest
.redirect_target
;
3948 cb
= new PromoteCallback(obc
, this);
3950 ceph_abort_msg("unrecognized manifest type");
3954 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
3955 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
3956 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
3957 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
3958 start_copy(cb
, obc
, src_hoid
, my_oloc
, 0, flags
,
3959 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
3960 src_fadvise_flags
, 0);
3962 ceph_assert(obc
->is_blocked());
3965 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
3967 recovery_state
.update_stats(
3968 [](auto &history
, auto &stats
) {
3969 stats
.stats
.sum
.num_promote
++;
3974 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
3977 dout(10) << __func__
<< " " << ctx
<< dendl
;
3978 ctx
->reset_obs(ctx
->obc
);
3979 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
3980 OpRequestRef op
= ctx
->op
;
3981 auto m
= op
->get_req
<MOSDOp
>();
3982 ObjectContextRef obc
= ctx
->obc
;
3983 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
3985 // this method must be idempotent since we may call it several times
3986 // before we finally apply the resulting transaction.
3987 ctx
->op_t
.reset(new PGTransaction
);
3989 if (op
->may_write() || op
->may_cache()) {
3991 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
3992 pool
.info
.is_pool_snaps_mode()) {
3994 ctx
->snapc
= pool
.snapc
;
3996 // client specified snapc
3997 ctx
->snapc
.seq
= m
->get_snap_seq();
3998 ctx
->snapc
.snaps
= m
->get_snaps();
3999 filter_snapc(ctx
->snapc
.snaps
);
4001 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
4002 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
4003 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
4004 << " < snapset seq " << obc
->ssc
->snapset
.seq
4005 << " on " << obc
->obs
.oi
.soid
<< dendl
;
4006 reply_ctx(ctx
, -EOLDSNAPC
);
4011 ctx
->at_version
= get_next_version();
4012 ctx
->mtime
= m
->get_mtime();
4014 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
4015 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
4016 << " snapc " << ctx
->snapc
4017 << " snapset " << obc
->ssc
->snapset
4020 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
4021 << " ov " << obc
->obs
.oi
.version
4025 if (!ctx
->user_at_version
)
4026 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
4027 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
4031 osd_reqid_t reqid
= ctx
->op
->get_reqid();
4033 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
4034 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
4037 if (ctx
->op
->osd_parent_span
) {
4038 auto execute_span
= jaeger_tracing::child_span(__func__
, ctx
->op
->osd_parent_span
);
4042 int result
= prepare_transaction(ctx
);
4046 osd_reqid_t reqid
= ctx
->op
->get_reqid();
4048 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
4049 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
4052 bool pending_async_reads
= !ctx
->pending_async_reads
.empty();
4053 if (result
== -EINPROGRESS
|| pending_async_reads
) {
4055 if (pending_async_reads
) {
4056 ceph_assert(pool
.info
.is_erasure());
4057 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
4058 ctx
->start_async_reads(this);
4063 if (result
== -EAGAIN
) {
4064 // clean up after the ctx
4069 bool ignore_out_data
= false;
4070 if (!ctx
->op_t
->empty() &&
4073 // successful update
4074 if (ctx
->op
->allows_returnvec()) {
4075 // enforce reasonable bound on the return buffer sizes
4076 for (auto& i
: *ctx
->ops
) {
4077 if (i
.outdata
.length() > cct
->_conf
->osd_max_write_op_reply_len
) {
4078 dout(10) << __func__
<< " op " << i
<< " outdata overflow" << dendl
;
4079 result
= -EOVERFLOW
; // overall result is overflow
4080 i
.rval
= -EOVERFLOW
;
4085 // legacy behavior -- zero result and return data etc.
4086 ignore_out_data
= true;
4091 // prepare the reply
4092 ctx
->reply
= new MOSDOpReply(m
, result
, get_osdmap_epoch(), 0,
4094 dout(20) << __func__
<< " alloc reply " << ctx
->reply
4095 << " result " << result
<< dendl
;
4098 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
4099 // finish side-effects
4101 do_osd_op_effects(ctx
, m
->get_connection());
4103 complete_read_ctx(result
, ctx
);
4107 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
4109 ceph_assert(op
->may_write() || op
->may_cache());
4112 recovery_state
.update_trim_to();
4114 // verify that we are doing this in order?
4115 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
4116 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
4117 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
4118 ceph_tid_t t
= m
->get_tid();
4119 client_t n
= m
->get_source().num();
4120 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
4121 if (p
== cm
.end()) {
4122 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
4125 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
4126 if (p
->second
> t
) {
4127 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
4128 ceph_abort_msg("out of order op");
4134 if (ctx
->update_log_only
) {
4136 do_osd_op_effects(ctx
, m
->get_connection());
4138 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
4139 // save just what we need from ctx
4140 MOSDOpReply
*reply
= ctx
->reply
;
4141 ctx
->reply
= nullptr;
4142 reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
4144 if (result
== -ENOENT
) {
4145 reply
->set_enoent_reply_versions(info
.last_update
,
4146 info
.last_user_version
);
4148 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4149 // append to pg log for dup detection - don't save buffers for now
4150 record_write_error(op
, soid
, reply
, result
,
4151 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
4156 // no need to capture PG ref, repop cancel will handle that
4157 // Can capture the ctx by pointer, it's owned by the repop
4158 ctx
->register_on_commit(
4161 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
4163 if (m
&& !ctx
->sent_reply
) {
4164 MOSDOpReply
*reply
= ctx
->reply
;
4165 ctx
->reply
= nullptr;
4166 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4167 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
4168 osd
->send_message_osd_client(reply
, m
->get_connection());
4169 ctx
->sent_reply
= true;
4170 ctx
->op
->mark_commit_sent();
4173 ctx
->register_on_success(
4177 ctx
->op
? ctx
->op
->get_req()->get_connection() :
4180 ctx
->register_on_finish(
4185 // issue replica writes
4186 ceph_tid_t rep_tid
= osd
->get_tid();
4188 RepGather
*repop
= new_repop(ctx
, obc
, rep_tid
);
4190 issue_repop(repop
, ctx
);
4195 void PrimaryLogPG::close_op_ctx(OpContext
*ctx
) {
4196 release_object_locks(ctx
->lock_manager
);
4200 for (auto p
= ctx
->on_finish
.begin(); p
!= ctx
->on_finish
.end();
4201 ctx
->on_finish
.erase(p
++)) {
4207 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
4210 osd
->reply_op_error(ctx
->op
, r
);
4214 void PrimaryLogPG::log_op_stats(const OpRequest
& op
,
4216 const uint64_t outb
)
4218 auto m
= op
.get_req
<MOSDOp
>();
4219 const utime_t now
= ceph_clock_now();
4221 const utime_t latency
= now
- m
->get_recv_stamp();
4222 const utime_t process_latency
= now
- op
.get_dequeued_time();
4224 osd
->logger
->inc(l_osd_op
);
4226 osd
->logger
->inc(l_osd_op_outb
, outb
);
4227 osd
->logger
->inc(l_osd_op_inb
, inb
);
4228 osd
->logger
->tinc(l_osd_op_lat
, latency
);
4229 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
4231 if (op
.may_read() && op
.may_write()) {
4232 osd
->logger
->inc(l_osd_op_rw
);
4233 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
4234 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
4235 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
4236 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
4237 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
4238 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
4239 } else if (op
.may_read()) {
4240 osd
->logger
->inc(l_osd_op_r
);
4241 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
4242 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
4243 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
4244 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
4245 } else if (op
.may_write() || op
.may_cache()) {
4246 osd
->logger
->inc(l_osd_op_w
);
4247 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
4248 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
4249 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
4250 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
4255 dout(15) << "log_op_stats " << *m
4258 << " lat " << latency
<< dendl
;
4260 if (m_dynamic_perf_stats
.is_enabled()) {
4261 m_dynamic_perf_stats
.add(osd
, info
, op
, inb
, outb
, latency
);
4265 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4266 const std::list
<OSDPerfMetricQuery
> &queries
)
4268 m_dynamic_perf_stats
.set_queries(queries
);
4271 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats
*stats
)
4273 std::swap(m_dynamic_perf_stats
, *stats
);
4276 void PrimaryLogPG::do_scan(
4278 ThreadPool::TPHandle
&handle
)
4280 auto m
= op
->get_req
<MOSDPGScan
>();
4281 ceph_assert(m
->get_type() == MSG_OSD_PG_SCAN
);
4282 dout(10) << "do_scan " << *m
<< dendl
;
4287 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
4289 auto dpp
= get_dpp();
4290 if (osd
->check_backfill_full(dpp
)) {
4291 dout(1) << __func__
<< ": Canceling backfill: Full." << dendl
;
4292 queue_peering_event(
4294 std::make_shared
<PGPeeringEvent
>(
4297 PeeringState::BackfillTooFull())));
4301 BackfillInterval bi
;
4302 bi
.begin
= m
->begin
;
4303 // No need to flush, there won't be any in progress writes occuring
4306 cct
->_conf
->osd_backfill_scan_min
,
4307 cct
->_conf
->osd_backfill_scan_max
,
4310 MOSDPGScan
*reply
= new MOSDPGScan(
4311 MOSDPGScan::OP_SCAN_DIGEST
,
4313 get_osdmap_epoch(), m
->query_epoch
,
4314 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
4315 encode(bi
.objects
, reply
->get_data());
4316 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4320 case MOSDPGScan::OP_SCAN_DIGEST
:
4322 pg_shard_t from
= m
->from
;
4324 // Check that from is in backfill_targets vector
4325 ceph_assert(is_backfill_target(from
));
4327 BackfillInterval
& bi
= peer_backfill_info
[from
];
4328 bi
.begin
= m
->begin
;
4330 auto p
= m
->get_data().cbegin();
4332 // take care to preserve ordering!
4334 decode_noclear(bi
.objects
, p
);
4335 dout(10) << __func__
<< " bi.begin=" << bi
.begin
<< " bi.end=" << bi
.end
4336 << " bi.objects.size()=" << bi
.objects
.size() << dendl
;
4338 if (waiting_on_backfill
.erase(from
)) {
4339 if (waiting_on_backfill
.empty()) {
4341 peer_backfill_info
.size() ==
4342 get_backfill_targets().size());
4343 finish_recovery_op(hobject_t::get_max());
4346 // we canceled backfill for a while due to a too full, and this
4347 // is an extra response from a non-too-full peer
4348 dout(20) << __func__
<< " canceled backfill (too full?)" << dendl
;
4355 void PrimaryLogPG::do_backfill(OpRequestRef op
)
4357 auto m
= op
->get_req
<MOSDPGBackfill
>();
4358 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
4359 dout(10) << "do_backfill " << *m
<< dendl
;
4364 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
4366 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
4368 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
4369 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
4372 spg_t(info
.pgid
.pgid
, get_primary().shard
));
4373 reply
->set_priority(get_recovery_op_priority());
4374 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4375 queue_peering_event(
4377 std::make_shared
<PGPeeringEvent
>(
4384 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
4386 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
4388 ObjectStore::Transaction t
;
4389 recovery_state
.update_backfill_progress(
4392 m
->op
== MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
4395 int tr
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4396 ceph_assert(tr
== 0);
4400 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
4402 ceph_assert(is_primary());
4403 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
4404 finish_recovery_op(hobject_t::get_max());
4410 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
4412 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
4414 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
4415 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
4419 ObjectStore::Transaction t
;
4420 for (auto& p
: m
->ls
) {
4421 if (is_remote_backfilling()) {
4423 int r
= osd
->store
->stat(ch
, ghobject_t(p
.first
, ghobject_t::NO_GEN
,
4424 pg_whoami
.shard
) , &st
);
4426 sub_local_num_bytes(st
.st_size
);
4428 if (pool
.info
.is_erasure()) {
4430 int r
= osd
->store
->getattr(
4432 ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
4436 object_info_t
oi(bv
);
4437 usersize
= oi
.size
* pgbackend
->get_ec_data_chunk_count();
4439 dout(0) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4440 << " can't get object info" << dendl
;
4444 usersize
= st
.st_size
;
4446 sub_num_bytes(usersize
);
4447 dout(10) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4448 << " sub actual data by " << st
.st_size
4449 << " sub num_bytes by " << usersize
4453 remove_snap_mapped_object(t
, p
.first
);
4455 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4456 ceph_assert(r
== 0);
4459 int PrimaryLogPG::trim_object(
4460 bool first
, const hobject_t
&coid
, snapid_t snap_to_trim
,
4461 PrimaryLogPG::OpContextUPtr
*ctxp
)
4467 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
4468 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
4469 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4470 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
4474 hobject_t head_oid
= coid
.get_head();
4475 ObjectContextRef head_obc
= get_object_context(head_oid
, false);
4477 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4478 << " repair needed, no snapset obc for " << head_oid
;
4482 SnapSet
& snapset
= obc
->ssc
->snapset
;
4484 object_info_t
&coi
= obc
->obs
.oi
;
4485 auto citer
= snapset
.clone_snaps
.find(coid
.snap
);
4486 if (citer
== snapset
.clone_snaps
.end()) {
4487 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
4488 << " for object " << coid
<< "\n";
4491 set
<snapid_t
> old_snaps(citer
->second
.begin(), citer
->second
.end());
4492 if (old_snaps
.empty()) {
4493 osd
->clog
->error() << "No object info snaps for object " << coid
;
4497 dout(10) << coid
<< " old_snaps " << old_snaps
4498 << " old snapset " << snapset
<< dendl
;
4499 if (snapset
.seq
== 0) {
4500 osd
->clog
->error() << "No snapset.seq for object " << coid
;
4504 set
<snapid_t
> new_snaps
;
4505 const OSDMapRef
& osdmap
= get_osdmap();
4506 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
4507 i
!= old_snaps
.end();
4509 if (!osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *i
) &&
4510 *i
!= snap_to_trim
) {
4511 new_snaps
.insert(*i
);
4515 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
4517 if (new_snaps
.empty()) {
4518 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
4519 if (p
== snapset
.clones
.end()) {
4520 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
4525 OpContextUPtr ctx
= simple_opc_create(obc
);
4526 ctx
->head_obc
= head_obc
;
4528 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4532 close_op_ctx(ctx
.release());
4533 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
4537 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4541 close_op_ctx(ctx
.release());
4542 dout(10) << __func__
<< ": Unable to get a wlock on " << head_oid
<< dendl
;
4546 ctx
->at_version
= get_next_version();
4548 PGTransaction
*t
= ctx
->op_t
.get();
4550 if (new_snaps
.empty()) {
4552 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
4553 << new_snaps
<< " ... deleting" << dendl
;
4556 ceph_assert(p
!= snapset
.clones
.end());
4558 snapid_t last
= coid
.snap
;
4559 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
4561 if (p
!= snapset
.clones
.begin()) {
4562 // not the oldest... merge overlap into next older clone
4563 vector
<snapid_t
>::iterator n
= p
- 1;
4564 hobject_t prev_coid
= coid
;
4565 prev_coid
.snap
= *n
;
4566 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
4568 if (adjust_prev_bytes
)
4569 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
4571 snapset
.clone_overlap
[*n
].intersection_of(
4572 snapset
.clone_overlap
[*p
]);
4574 if (adjust_prev_bytes
)
4575 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
4577 ctx
->delta_stats
.num_objects
--;
4579 ctx
->delta_stats
.num_objects_dirty
--;
4581 ctx
->delta_stats
.num_objects_omap
--;
4582 if (coi
.is_whiteout()) {
4583 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
4584 ctx
->delta_stats
.num_whiteouts
--;
4586 ctx
->delta_stats
.num_object_clones
--;
4587 if (coi
.is_cache_pinned())
4588 ctx
->delta_stats
.num_objects_pinned
--;
4589 if (coi
.has_manifest()) {
4590 dec_all_refcount_manifest(coi
, ctx
.get());
4591 ctx
->delta_stats
.num_objects_manifest
--;
4593 obc
->obs
.exists
= false;
4595 snapset
.clones
.erase(p
);
4596 snapset
.clone_overlap
.erase(last
);
4597 snapset
.clone_size
.erase(last
);
4598 snapset
.clone_snaps
.erase(last
);
4602 pg_log_entry_t::DELETE
,
4605 ctx
->obs
->oi
.version
,
4617 coi
= object_info_t(coid
);
4619 ctx
->at_version
.version
++;
4621 // save adjusted snaps for this object
4622 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
4623 snapset
.clone_snaps
[coid
.snap
] =
4624 vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
4625 // we still do a 'modify' event on this object just to trigger a
4626 // snapmapper.update ... :(
4628 coi
.prior_version
= coi
.version
;
4629 coi
.version
= ctx
->at_version
;
4631 encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4632 t
->setattr(coid
, OI_ATTR
, bl
);
4636 pg_log_entry_t::MODIFY
,
4645 ctx
->at_version
.version
++;
4653 // save head snapset
4654 dout(10) << coid
<< " new snapset " << snapset
<< " on "
4655 << head_obc
->obs
.oi
<< dendl
;
4656 if (snapset
.clones
.empty() &&
4657 (head_obc
->obs
.oi
.is_whiteout() &&
4658 !(head_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
4659 !head_obc
->obs
.oi
.is_cache_pinned())) {
4660 // NOTE: this arguably constitutes minor interference with the
4661 // tiering agent if this is a cache tier since a snap trim event
4662 // is effectively evicting a whiteout we might otherwise want to
4664 dout(10) << coid
<< " removing " << head_oid
<< dendl
;
4667 pg_log_entry_t::DELETE
,
4670 head_obc
->obs
.oi
.version
,
4676 dout(10) << "removing snap head" << dendl
;
4677 object_info_t
& oi
= head_obc
->obs
.oi
;
4678 ctx
->delta_stats
.num_objects
--;
4679 if (oi
.is_dirty()) {
4680 ctx
->delta_stats
.num_objects_dirty
--;
4683 ctx
->delta_stats
.num_objects_omap
--;
4684 if (oi
.is_whiteout()) {
4685 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
4686 ctx
->delta_stats
.num_whiteouts
--;
4688 if (oi
.is_cache_pinned()) {
4689 ctx
->delta_stats
.num_objects_pinned
--;
4691 if (oi
.has_manifest()) {
4692 ctx
->delta_stats
.num_objects_manifest
--;
4693 dec_all_refcount_manifest(oi
, ctx
.get());
4695 head_obc
->obs
.exists
= false;
4696 head_obc
->obs
.oi
= object_info_t(head_oid
);
4697 t
->remove(head_oid
);
4699 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
4700 // filter SnapSet::snaps for the benefit of pre-octopus
4701 // peers. This is perhaps overly conservative in that I'm not
4702 // certain they need this, but let's be conservative here.
4703 dout(10) << coid
<< " filtering snapset on " << head_oid
<< dendl
;
4704 snapset
.filter(pool
.info
);
4706 snapset
.snaps
.clear();
4708 dout(10) << coid
<< " writing updated snapset on " << head_oid
4709 << ", snapset is " << snapset
<< dendl
;
4712 pg_log_entry_t::MODIFY
,
4715 head_obc
->obs
.oi
.version
,
4722 head_obc
->obs
.oi
.prior_version
= head_obc
->obs
.oi
.version
;
4723 head_obc
->obs
.oi
.version
= ctx
->at_version
;
4725 map
<string
, bufferlist
> attrs
;
4727 encode(snapset
, bl
);
4728 attrs
[SS_ATTR
] = std::move(bl
);
4731 encode(head_obc
->obs
.oi
, bl
,
4732 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4733 attrs
[OI_ATTR
] = std::move(bl
);
4734 t
->setattrs(head_oid
, attrs
);
4737 *ctxp
= std::move(ctx
);
4741 void PrimaryLogPG::kick_snap_trim()
4743 ceph_assert(is_active());
4744 ceph_assert(is_primary());
4746 !state_test(PG_STATE_PREMERGE
) &&
4747 !snap_trimq
.empty()) {
4748 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM
)) {
4749 dout(10) << __func__
<< ": nosnaptrim set, not kicking" << dendl
;
4751 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
4752 snap_trimmer_machine
.process_event(KickTrim());
4757 void PrimaryLogPG::snap_trimmer_scrub_complete()
4759 if (is_primary() && is_active() && is_clean()) {
4760 ceph_assert(!snap_trimq
.empty());
4761 snap_trimmer_machine
.process_event(ScrubComplete());
4765 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
4767 if (recovery_state
.is_deleting() || pg_has_reset_since(queued
)) {
4771 ceph_assert(is_primary());
4773 dout(10) << "snap_trimmer posting" << dendl
;
4774 snap_trimmer_machine
.process_event(DoSnapWork());
4775 dout(10) << "snap_trimmer complete" << dendl
;
4779 int PrimaryLogPG::do_xattr_cmp_u64(int op
, __u64 v1
, bufferlist
& xattr
)
4783 string
v2s(xattr
.c_str(), xattr
.length());
4785 v2
= strtoull(v2s
.c_str(), NULL
, 10);
4789 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
4792 case CEPH_OSD_CMPXATTR_OP_EQ
:
4794 case CEPH_OSD_CMPXATTR_OP_NE
:
4796 case CEPH_OSD_CMPXATTR_OP_GT
:
4798 case CEPH_OSD_CMPXATTR_OP_GTE
:
4800 case CEPH_OSD_CMPXATTR_OP_LT
:
4802 case CEPH_OSD_CMPXATTR_OP_LTE
:
4809 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4811 string
v2s(xattr
.c_str(), xattr
.length());
4813 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
4816 case CEPH_OSD_CMPXATTR_OP_EQ
:
4817 return (v1s
.compare(v2s
) == 0);
4818 case CEPH_OSD_CMPXATTR_OP_NE
:
4819 return (v1s
.compare(v2s
) != 0);
4820 case CEPH_OSD_CMPXATTR_OP_GT
:
4821 return (v1s
.compare(v2s
) > 0);
4822 case CEPH_OSD_CMPXATTR_OP_GTE
:
4823 return (v1s
.compare(v2s
) >= 0);
4824 case CEPH_OSD_CMPXATTR_OP_LT
:
4825 return (v1s
.compare(v2s
) < 0);
4826 case CEPH_OSD_CMPXATTR_OP_LTE
:
4827 return (v1s
.compare(v2s
) <= 0);
4833 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4835 ceph_osd_op
& op
= osd_op
.op
;
4836 vector
<OSDOp
> write_ops(1);
4837 OSDOp
& write_op
= write_ops
[0];
4838 uint64_t write_length
= op
.writesame
.length
;
4844 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4847 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4848 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4852 while (write_length
) {
4853 write_op
.indata
.append(osd_op
.indata
);
4854 write_length
-= op
.writesame
.data_length
;
4857 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
4858 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
4859 write_op
.op
.extent
.length
= op
.writesame
.length
;
4860 result
= do_osd_ops(ctx
, write_ops
);
4862 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
4867 // ========================================================================
4868 // low level osd ops
4870 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
4872 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
4873 bufferlist header
, vals
;
4874 int r
= _get_tmap(ctx
, &header
, &vals
);
4876 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
4881 vector
<OSDOp
> ops(3);
4883 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
4884 ops
[0].op
.extent
.offset
= 0;
4885 ops
[0].op
.extent
.length
= 0;
4887 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
4888 ops
[1].indata
= std::move(header
);
4890 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
4891 ops
[2].indata
= std::move(vals
);
4893 return do_osd_ops(ctx
, ops
);
4896 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
4897 OSDOp
& osd_op
, bufferlist
& bl
)
4901 map
<string
, bufferlist
> m
;
4903 auto p
= bl
.cbegin();
4906 ceph_assert(p
.end());
4916 case CEPH_OSD_TMAP_SET
: // insert key
4924 case CEPH_OSD_TMAP_RM
: // remove key
4926 if (!m
.count(key
)) {
4931 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
4935 case CEPH_OSD_TMAP_HDR
: // update header
4947 encode(header
, obl
);
4951 vector
<OSDOp
> nops(1);
4952 OSDOp
& newop
= nops
[0];
4953 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4954 newop
.op
.extent
.offset
= 0;
4955 newop
.op
.extent
.length
= obl
.length();
4957 do_osd_ops(ctx
, nops
);
4961 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::const_iterator
& bp
, OSDOp
& osd_op
)
4963 bufferlist::const_iterator orig_bp
= bp
;
4966 dout(10) << "tmapup is a no-op" << dendl
;
4968 // read the whole object
4969 vector
<OSDOp
> nops(1);
4970 OSDOp
& newop
= nops
[0];
4971 newop
.op
.op
= CEPH_OSD_OP_READ
;
4972 newop
.op
.extent
.offset
= 0;
4973 newop
.op
.extent
.length
= 0;
4974 result
= do_osd_ops(ctx
, nops
);
4976 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
4978 dout(30) << " starting is \n";
4979 newop
.outdata
.hexdump(*_dout
);
4982 auto ip
= newop
.outdata
.cbegin();
4985 dout(30) << "the update command is: \n";
4986 osd_op
.indata
.hexdump(*_dout
);
4992 if (newop
.outdata
.length()) {
4996 dout(10) << "tmapup header " << header
.length() << dendl
;
4998 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
5001 dout(10) << "tmapup new header " << header
.length() << dendl
;
5004 encode(header
, obl
);
5006 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
5009 bufferlist newkeydata
;
5010 string nextkey
, last_in_key
;
5012 bool have_next
= false;
5015 decode(nextkey
, ip
);
5016 decode(nextval
, ip
);
5018 while (!bp
.end() && !result
) {
5025 catch (ceph::buffer::error
& e
) {
5028 if (key
< last_in_key
) {
5029 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
5030 << "', falling back to an inefficient (unsorted) update" << dendl
;
5032 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
5036 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
5038 // skip existing intervening keys
5039 bool key_exists
= false;
5040 while (have_next
&& !key_exists
) {
5041 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
5044 if (nextkey
< key
) {
5046 encode(nextkey
, newkeydata
);
5047 encode(nextval
, newkeydata
);
5048 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5050 // don't copy; discard old value. and stop.
5051 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
5056 decode(nextkey
, ip
);
5057 decode(nextval
, ip
);
5063 if (op
== CEPH_OSD_TMAP_SET
) {
5068 catch (ceph::buffer::error
& e
) {
5071 encode(key
, newkeydata
);
5072 encode(val
, newkeydata
);
5073 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
5075 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
5083 catch (ceph::buffer::error
& e
) {
5086 encode(key
, newkeydata
);
5087 encode(val
, newkeydata
);
5088 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
5090 } else if (op
== CEPH_OSD_TMAP_RM
) {
5095 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
5098 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
5105 encode(nextkey
, newkeydata
);
5106 encode(nextval
, newkeydata
);
5107 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5111 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
5112 dout(20) << " keep trailing " << rest
.length()
5113 << " at " << newkeydata
.length() << dendl
;
5114 newkeydata
.claim_append(rest
);
5117 // encode final key count + key data
5118 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
5120 obl
.claim_append(newkeydata
);
5123 dout(30) << " final is \n";
5124 obl
.hexdump(*_dout
);
5128 auto tp
= obl
.cbegin();
5131 map
<string
,bufferlist
> d
;
5133 ceph_assert(tp
.end());
5134 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
5139 dout(20) << "tmapput write " << obl
.length() << dendl
;
5140 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5141 newop
.op
.extent
.offset
= 0;
5142 newop
.op
.extent
.length
= obl
.length();
5144 do_osd_ops(ctx
, nops
);
5150 static int check_offset_and_length(uint64_t offset
, uint64_t length
,
5151 uint64_t max
, DoutPrefixProvider
*dpp
)
5153 if (offset
>= max
||
5155 offset
+ length
> max
) {
5156 ldpp_dout(dpp
, 10) << __func__
<< " "
5157 << "osd_max_object_size: " << max
5158 << "; Hard limit of object size is 4GB." << dendl
;
5165 struct FillInVerifyExtent
: public Context
{
5168 bufferlist
*outdatap
;
5169 std::optional
<uint32_t> maybe_crc
;
5174 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
5175 std::optional
<uint32_t> mc
, uint64_t size
,
5176 OSDService
*osd
, hobject_t soid
, uint32_t flags
) :
5177 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
5178 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
5179 void finish(int len
) override
{
5187 // whole object? can we verify the checksum?
5188 if (maybe_crc
&& *r
== size
) {
5189 uint32_t crc
= outdatap
->crc32c(-1);
5190 if (maybe_crc
!= crc
) {
5191 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
5192 << " != expected 0x" << *maybe_crc
5193 << std::dec
<< " on " << soid
;
5194 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
5203 struct ToSparseReadResult
: public Context
{
5205 bufferlist
* data_bl
;
5206 uint64_t data_offset
;
5208 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
5210 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
5211 void finish(int r
) override
{
5219 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
5220 encode(extents
, outdata
);
5221 encode_destructively(*data_bl
, outdata
);
5222 data_bl
->swap(outdata
);
5226 template<typename V
>
5227 static string
list_keys(const map
<string
, V
>& m
) {
5229 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5233 s
.append(itr
->first
);
5238 template<typename T
>
5239 static string
list_entries(const T
& m
) {
5241 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5250 void PrimaryLogPG::maybe_create_new_object(
5252 bool ignore_transaction
)
5254 ObjectState
& obs
= ctx
->new_obs
;
5256 ctx
->delta_stats
.num_objects
++;
5258 ceph_assert(!obs
.oi
.is_whiteout());
5259 obs
.oi
.new_object();
5260 if (!ignore_transaction
)
5261 ctx
->op_t
->create(obs
.oi
.soid
);
5262 } else if (obs
.oi
.is_whiteout()) {
5263 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
5264 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
5265 --ctx
->delta_stats
.num_whiteouts
;
5269 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
5272 explicit ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
5275 int execute() override
{
5280 struct C_ChecksumRead
: public Context
{
5281 PrimaryLogPG
*primary_log_pg
;
5283 Checksummer::CSumType csum_type
;
5284 bufferlist init_value_bl
;
5285 ceph_le64 read_length
;
5287 Context
*fill_extent_ctx
;
5289 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5290 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
5291 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5292 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5293 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5294 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
5295 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5296 &read_bl
, maybe_crc
, size
,
5297 osd
, soid
, flags
)) {
5299 ~C_ChecksumRead() override
{
5300 delete fill_extent_ctx
;
5303 void finish(int r
) override
{
5304 fill_extent_ctx
->complete(r
);
5305 fill_extent_ctx
= nullptr;
5307 if (osd_op
.rval
>= 0) {
5308 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5309 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
5310 &init_value_bl_it
, read_bl
);
5315 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
5316 bufferlist::const_iterator
*bl_it
)
5318 dout(20) << __func__
<< dendl
;
5320 auto& op
= osd_op
.op
;
5321 if (op
.checksum
.chunk_size
> 0) {
5322 if (op
.checksum
.length
== 0) {
5323 dout(10) << __func__
<< ": length required when chunk size provided"
5327 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5328 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
5333 auto& oi
= ctx
->new_obs
.oi
;
5334 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
5335 // zeroed offset+length implies checksum whole object
5336 op
.checksum
.length
= oi
.size
;
5337 } else if (op
.checksum
.offset
>= oi
.size
) {
5338 // read size was trimmed to zero, do nothing
5339 // see PrimaryLogPG::do_read
5341 } else if (op
.extent
.offset
+ op
.extent
.length
> oi
.size
) {
5342 op
.extent
.length
= oi
.size
- op
.extent
.offset
;
5343 if (op
.checksum
.chunk_size
> 0 &&
5344 op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5345 dout(10) << __func__
<< ": length (trimmed to 0x"
5346 << std::hex
<< op
.checksum
.length
5347 << ") not aligned to chunk size 0x"
5348 << op
.checksum
.chunk_size
<< std::dec
5354 Checksummer::CSumType csum_type
;
5355 switch (op
.checksum
.type
) {
5356 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
5357 csum_type
= Checksummer::CSUM_XXHASH32
;
5359 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
5360 csum_type
= Checksummer::CSUM_XXHASH64
;
5362 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
5363 csum_type
= Checksummer::CSUM_CRC32C
;
5366 dout(10) << __func__
<< ": unknown crc type ("
5367 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
5371 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
5372 if (bl_it
->get_remaining() < csum_init_value_size
) {
5373 dout(10) << __func__
<< ": init value not provided" << dendl
;
5377 bufferlist init_value_bl
;
5378 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
5379 csum_init_value_size
);
5380 *bl_it
+= csum_init_value_size
;
5382 if (pool
.info
.is_erasure() && op
.checksum
.length
> 0) {
5383 // If there is a data digest and it is possible we are reading
5384 // entire object, pass the digest.
5385 std::optional
<uint32_t> maybe_crc
;
5386 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5387 op
.checksum
.length
>= oi
.size
) {
5388 maybe_crc
= oi
.data_digest
;
5392 auto& soid
= oi
.soid
;
5393 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
5394 std::move(init_value_bl
), maybe_crc
,
5395 oi
.size
, osd
, soid
, op
.flags
);
5397 ctx
->pending_async_reads
.push_back({
5398 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
5399 {&checksum_ctx
->read_bl
, checksum_ctx
}});
5401 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5402 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5403 new ReadFinisher(osd_op
));
5404 return -EINPROGRESS
;
5408 std::vector
<OSDOp
> read_ops(1);
5409 auto& read_op
= read_ops
[0];
5410 if (op
.checksum
.length
> 0) {
5411 read_op
.op
.op
= CEPH_OSD_OP_READ
;
5412 read_op
.op
.flags
= op
.flags
;
5413 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
5414 read_op
.op
.extent
.length
= op
.checksum
.length
;
5415 read_op
.op
.extent
.truncate_size
= 0;
5416 read_op
.op
.extent
.truncate_seq
= 0;
5418 int r
= do_osd_ops(ctx
, read_ops
);
5420 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
5425 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5426 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
5430 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
5431 Checksummer::CSumType csum_type
,
5432 bufferlist::const_iterator
*init_value_bl_it
,
5433 const bufferlist
&read_bl
) {
5434 dout(20) << __func__
<< dendl
;
5436 auto& op
= osd_op
.op
;
5438 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
5439 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
5440 << op
.checksum
.length
<< dendl
;
5444 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
5445 op
.checksum
.chunk_size
: read_bl
.length());
5446 uint32_t csum_count
= (csum_chunk_size
> 0 ?
5447 read_bl
.length() / csum_chunk_size
: 0);
5450 bufferptr csum_data
;
5451 if (csum_count
> 0) {
5452 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
5453 csum_data
= ceph::buffer::create(csum_value_size
* csum_count
);
5455 csum
.append(csum_data
);
5457 switch (csum_type
) {
5458 case Checksummer::CSUM_XXHASH32
:
5460 Checksummer::xxhash32::init_value_t init_value
;
5461 decode(init_value
, *init_value_bl_it
);
5462 Checksummer::calculate
<Checksummer::xxhash32
>(
5463 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5467 case Checksummer::CSUM_XXHASH64
:
5469 Checksummer::xxhash64::init_value_t init_value
;
5470 decode(init_value
, *init_value_bl_it
);
5471 Checksummer::calculate
<Checksummer::xxhash64
>(
5472 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5476 case Checksummer::CSUM_CRC32C
:
5478 Checksummer::crc32c::init_value_t init_value
;
5479 decode(init_value
, *init_value_bl_it
);
5480 Checksummer::calculate
<Checksummer::crc32c
>(
5481 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5490 encode(csum_count
, osd_op
.outdata
);
5491 osd_op
.outdata
.claim_append(csum
);
5495 struct C_ExtentCmpRead
: public Context
{
5496 PrimaryLogPG
*primary_log_pg
;
5498 ceph_le64 read_length
{};
5500 Context
*fill_extent_ctx
;
5502 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5503 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5504 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5505 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5506 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5507 &read_bl
, maybe_crc
, size
,
5508 osd
, soid
, flags
)) {
5510 ~C_ExtentCmpRead() override
{
5511 delete fill_extent_ctx
;
5514 void finish(int r
) override
{
5518 delete fill_extent_ctx
;
5520 fill_extent_ctx
->complete(r
);
5522 fill_extent_ctx
= nullptr;
5524 if (osd_op
.rval
>= 0) {
5525 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
5530 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
5532 dout(20) << __func__
<< dendl
;
5533 ceph_osd_op
& op
= osd_op
.op
;
5535 auto& oi
= ctx
->new_obs
.oi
;
5536 uint64_t size
= oi
.size
;
5537 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5538 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
5539 size
= op
.extent
.truncate_size
;
5542 if (op
.extent
.offset
>= size
) {
5543 op
.extent
.length
= 0;
5544 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5545 op
.extent
.length
= size
- op
.extent
.offset
;
5548 if (op
.extent
.length
== 0) {
5549 dout(20) << __func__
<< " zero length extent" << dendl
;
5550 return finish_extent_cmp(osd_op
, bufferlist
{});
5551 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
5552 dout(20) << __func__
<< " object DNE" << dendl
;
5553 return finish_extent_cmp(osd_op
, {});
5554 } else if (pool
.info
.is_erasure()) {
5555 // If there is a data digest and it is possible we are reading
5556 // entire object, pass the digest.
5557 std::optional
<uint32_t> maybe_crc
;
5558 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5559 op
.checksum
.length
>= oi
.size
) {
5560 maybe_crc
= oi
.data_digest
;
5564 auto& soid
= oi
.soid
;
5565 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
5566 osd
, soid
, op
.flags
);
5567 ctx
->pending_async_reads
.push_back({
5568 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
5569 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
5571 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5573 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5574 new ReadFinisher(osd_op
));
5575 return -EINPROGRESS
;
5579 vector
<OSDOp
> read_ops(1);
5580 OSDOp
& read_op
= read_ops
[0];
5582 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
5583 read_op
.op
.extent
.offset
= op
.extent
.offset
;
5584 read_op
.op
.extent
.length
= op
.extent
.length
;
5585 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
5586 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
5588 int result
= do_osd_ops(ctx
, read_ops
);
5590 derr
<< __func__
<< " failed " << result
<< dendl
;
5593 return finish_extent_cmp(osd_op
, read_op
.outdata
);
5596 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
5598 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
5599 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
5600 if (osd_op
.indata
[idx
] != read_byte
) {
5601 return (-MAX_ERRNO
- idx
);
5608 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5609 dout(20) << __func__
<< dendl
;
5610 auto& op
= osd_op
.op
;
5611 auto& oi
= ctx
->new_obs
.oi
;
5612 auto& soid
= oi
.soid
;
5613 __u32 seq
= oi
.truncate_seq
;
5614 uint64_t size
= oi
.size
;
5615 bool trimmed_read
= false;
5617 dout(30) << __func__
<< " oi.size: " << oi
.size
<< dendl
;
5618 dout(30) << __func__
<< " oi.truncate_seq: " << oi
.truncate_seq
<< dendl
;
5619 dout(30) << __func__
<< " op.extent.truncate_seq: " << op
.extent
.truncate_seq
<< dendl
;
5620 dout(30) << __func__
<< " op.extent.truncate_size: " << op
.extent
.truncate_size
<< dendl
;
5622 // are we beyond truncate_size?
5623 if ( (seq
< op
.extent
.truncate_seq
) &&
5624 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5625 (size
> op
.extent
.truncate_size
) )
5626 size
= op
.extent
.truncate_size
;
5628 if (op
.extent
.length
== 0) //length is zero mean read the whole object
5629 op
.extent
.length
= size
;
5631 if (op
.extent
.offset
>= size
) {
5632 op
.extent
.length
= 0;
5633 trimmed_read
= true;
5634 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5635 op
.extent
.length
= size
- op
.extent
.offset
;
5636 trimmed_read
= true;
5639 dout(30) << __func__
<< "op.extent.length is now " << op
.extent
.length
<< dendl
;
5641 // read into a buffer
5643 if (trimmed_read
&& op
.extent
.length
== 0) {
5644 // read size was trimmed to zero and it is expected to do nothing
5645 // a read operation of 0 bytes does *not* do nothing, this is why
5646 // the trimmed_read boolean is needed
5647 } else if (pool
.info
.is_erasure()) {
5648 // The initialisation below is required to silence a false positive
5649 // -Wmaybe-uninitialized warning
5650 std::optional
<uint32_t> maybe_crc
;
5651 // If there is a data digest and it is possible we are reading
5652 // entire object, pass the digest. FillInVerifyExtent will
5653 // will check the oi.size again.
5654 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
5655 op
.extent
.length
>= oi
.size
)
5656 maybe_crc
= oi
.data_digest
;
5657 ctx
->pending_async_reads
.push_back(
5659 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
5660 make_pair(&osd_op
.outdata
,
5661 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
5662 &osd_op
.outdata
, maybe_crc
, oi
.size
,
5663 osd
, soid
, op
.flags
))));
5664 dout(10) << " async_read noted for " << soid
<< dendl
;
5666 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5667 new ReadFinisher(osd_op
));
5669 int r
= pgbackend
->objects_read_sync(
5670 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
5671 // whole object? can we verify the checksum?
5672 if (r
>= 0 && op
.extent
.offset
== 0 &&
5673 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5674 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
5675 if (oi
.data_digest
!= crc
) {
5676 osd
->clog
->error() << info
.pgid
<< std::hex
5677 << " full-object read crc 0x" << crc
5678 << " != expected 0x" << oi
.data_digest
5679 << std::dec
<< " on " << soid
;
5680 r
= -EIO
; // try repair later
5684 r
= rep_repair_primary_object(soid
, ctx
);
5687 op
.extent
.length
= r
;
5688 else if (r
== -EAGAIN
) {
5692 op
.extent
.length
= 0;
5694 dout(10) << " read got " << r
<< " / " << op
.extent
.length
5695 << " bytes from obj " << soid
<< dendl
;
5698 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5699 ctx
->delta_stats
.num_rd
++;
5704 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5705 dout(20) << __func__
<< dendl
;
5706 auto& op
= osd_op
.op
;
5707 auto& oi
= ctx
->new_obs
.oi
;
5708 auto& soid
= oi
.soid
;
5710 if (op
.extent
.truncate_seq
) {
5711 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
5716 if (pool
.info
.is_erasure()) {
5717 // translate sparse read to a normal one if not supported
5718 uint64_t offset
= op
.extent
.offset
;
5719 uint64_t length
= op
.extent
.length
;
5720 if (offset
> oi
.size
) {
5722 } else if (offset
+ length
> oi
.size
) {
5723 length
= oi
.size
- offset
;
5727 ctx
->pending_async_reads
.push_back(
5729 boost::make_tuple(offset
, length
, op
.flags
),
5732 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
5733 &op
.extent
.length
))));
5734 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
5736 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5737 new ReadFinisher(osd_op
));
5739 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
5740 map
<uint64_t, uint64_t> extents
;
5741 encode(extents
, osd_op
.outdata
);
5744 // read into a buffer
5745 map
<uint64_t, uint64_t> m
;
5746 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5748 op
.extent
.offset
, op
.extent
.length
, m
);
5754 r
= pgbackend
->objects_readv_sync(soid
, std::move(m
), op
.flags
, &data_bl
);
5756 r
= rep_repair_primary_object(soid
, ctx
);
5762 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5763 // Maybe at first, there is no much whole objects. With continued use, more
5764 // and more whole object exist. So from this point, for spare-read add
5765 // checksum make sense.
5766 if ((uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5767 uint32_t crc
= data_bl
.crc32c(-1);
5768 if (oi
.data_digest
!= crc
) {
5769 osd
->clog
->error() << info
.pgid
<< std::hex
5770 << " full-object read crc 0x" << crc
5771 << " != expected 0x" << oi
.data_digest
5772 << std::dec
<< " on " << soid
;
5773 r
= rep_repair_primary_object(soid
, ctx
);
5780 op
.extent
.length
= r
;
5782 encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5783 ::encode_destructively(data_bl
, osd_op
.outdata
);
5785 dout(10) << " sparse_read got " << r
<< " bytes from object "
5789 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5790 ctx
->delta_stats
.num_rd
++;
5794 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5797 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5798 ObjectState
& obs
= ctx
->new_obs
;
5799 object_info_t
& oi
= obs
.oi
;
5800 const hobject_t
& soid
= oi
.soid
;
5801 const bool skip_data_digest
= osd
->store
->has_builtin_csum() &&
5802 osd
->osd_skip_data_digest
;
5804 PGTransaction
* t
= ctx
->op_t
.get();
5806 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5808 if (ctx
->op
->osd_parent_span
) {
5809 auto do_osd_op_span
= jaeger_tracing::child_span(__func__
, ctx
->op
->osd_parent_span
);
5813 ctx
->current_osd_subop_num
= 0;
5814 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5816 ceph_osd_op
& op
= osd_op
.op
;
5818 OpFinisher
* op_finisher
= nullptr;
5820 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5821 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5822 op_finisher
= op_finisher_it
->second
.get();
5826 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5827 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5828 // but the code in this function seems to treat them as native-endian. What should the
5830 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5832 dout(10) << "do_osd_op " << osd_op
<< dendl
;
5834 auto bp
= osd_op
.indata
.cbegin();
5836 // user-visible modifcation?
5838 // non user-visible modifications
5839 case CEPH_OSD_OP_WATCH
:
5840 case CEPH_OSD_OP_CACHE_EVICT
:
5841 case CEPH_OSD_OP_CACHE_FLUSH
:
5842 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5843 case CEPH_OSD_OP_UNDIRTY
:
5844 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
5845 case CEPH_OSD_OP_COPY_FROM2
:
5846 case CEPH_OSD_OP_CACHE_PIN
:
5847 case CEPH_OSD_OP_CACHE_UNPIN
:
5848 case CEPH_OSD_OP_SET_REDIRECT
:
5849 case CEPH_OSD_OP_SET_CHUNK
:
5850 case CEPH_OSD_OP_TIER_PROMOTE
:
5851 case CEPH_OSD_OP_TIER_FLUSH
:
5852 case CEPH_OSD_OP_TIER_EVICT
:
5855 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
5856 ctx
->user_modify
= true;
5859 // munge -1 truncate to 0 truncate
5860 if (ceph_osd_op_uses_extent(op
.op
) &&
5861 op
.extent
.truncate_seq
== 1 &&
5862 op
.extent
.truncate_size
== (-1ULL)) {
5863 op
.extent
.truncate_size
= 0;
5864 op
.extent
.truncate_seq
= 0;
5867 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5868 if (op
.op
== CEPH_OSD_OP_ZERO
&&
5870 op
.extent
.offset
< static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
5871 op
.extent
.length
>= 1 &&
5872 op
.extent
.length
<= static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
5873 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
5874 if (op
.extent
.offset
>= oi
.size
) {
5878 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
5879 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
5880 op
.op
= CEPH_OSD_OP_TRUNCATE
;
5887 case CEPH_OSD_OP_CMPEXT
:
5889 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
5890 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5891 op
.extent
.length
, op
.extent
.truncate_size
,
5892 op
.extent
.truncate_seq
);
5894 if (op_finisher
== nullptr) {
5895 result
= do_extent_cmp(ctx
, osd_op
);
5897 result
= op_finisher
->execute();
5901 case CEPH_OSD_OP_SYNC_READ
:
5902 if (pool
.info
.is_erasure()) {
5903 result
= -EOPNOTSUPP
;
5907 case CEPH_OSD_OP_READ
:
5909 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
5910 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5911 op
.extent
.length
, op
.extent
.truncate_size
,
5912 op
.extent
.truncate_seq
);
5913 if (op_finisher
== nullptr) {
5914 if (!ctx
->data_off
) {
5915 ctx
->data_off
= op
.extent
.offset
;
5917 result
= do_read(ctx
, osd_op
);
5919 result
= op_finisher
->execute();
5923 case CEPH_OSD_OP_CHECKSUM
:
5926 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
5927 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
5928 op
.checksum
.offset
, op
.checksum
.length
,
5929 op
.checksum
.chunk_size
);
5931 if (op_finisher
== nullptr) {
5932 result
= do_checksum(ctx
, osd_op
, &bp
);
5934 result
= op_finisher
->execute();
5940 case CEPH_OSD_OP_MAPEXT
:
5941 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5942 if (pool
.info
.is_erasure()) {
5943 result
= -EOPNOTSUPP
;
5948 // read into a buffer
5950 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5952 op
.extent
.offset
, op
.extent
.length
, bl
);
5953 osd_op
.outdata
= std::move(bl
);
5957 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
5958 ctx
->delta_stats
.num_rd
++;
5959 dout(10) << " map_extents done on object " << soid
<< dendl
;
5964 case CEPH_OSD_OP_SPARSE_READ
:
5965 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
5966 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5967 op
.extent
.length
, op
.extent
.truncate_size
,
5968 op
.extent
.truncate_seq
);
5969 if (op_finisher
== nullptr) {
5970 result
= do_sparse_read(ctx
, osd_op
);
5972 result
= op_finisher
->execute();
5976 case CEPH_OSD_OP_CALL
:
5978 string cname
, mname
;
5981 bp
.copy(op
.cls
.class_len
, cname
);
5982 bp
.copy(op
.cls
.method_len
, mname
);
5983 bp
.copy(op
.cls
.indata_len
, indata
);
5984 } catch (ceph::buffer::error
& e
) {
5985 dout(10) << "call unable to decode class + method + indata" << dendl
;
5986 dout(30) << "in dump: ";
5987 osd_op
.indata
.hexdump(*_dout
);
5990 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
5993 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
5995 ClassHandler::ClassData
*cls
;
5996 result
= ClassHandler::get_instance().open_class(cname
, &cls
);
5997 ceph_assert(result
== 0); // init_op_flags() already verified this works.
5999 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
);
6001 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
6002 result
= -EOPNOTSUPP
;
6006 int flags
= method
->get_flags();
6007 if (flags
& CLS_METHOD_WR
)
6008 ctx
->user_modify
= true;
6011 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
6012 int prev_rd
= ctx
->num_read
;
6013 int prev_wr
= ctx
->num_write
;
6014 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
6016 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
6017 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
6021 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
6022 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
6027 dout(10) << "method called response length=" << outdata
.length() << dendl
;
6028 op
.extent
.length
= outdata
.length();
6029 osd_op
.outdata
.claim_append(outdata
);
6030 dout(30) << "out dump: ";
6031 osd_op
.outdata
.hexdump(*_dout
);
6036 case CEPH_OSD_OP_STAT
:
6037 // note: stat does not require RD
6039 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6041 if (obs
.exists
&& !oi
.is_whiteout()) {
6042 encode(oi
.size
, osd_op
.outdata
);
6043 encode(oi
.mtime
, osd_op
.outdata
);
6044 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
6047 dout(10) << "stat oi object does not exist" << dendl
;
6050 ctx
->delta_stats
.num_rd
++;
6054 case CEPH_OSD_OP_ISDIRTY
:
6057 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6058 bool is_dirty
= obs
.oi
.is_dirty();
6059 encode(is_dirty
, osd_op
.outdata
);
6060 ctx
->delta_stats
.num_rd
++;
6065 case CEPH_OSD_OP_UNDIRTY
:
6069 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6070 if (oi
.is_dirty()) {
6071 ctx
->undirty
= true; // see make_writeable()
6073 ctx
->delta_stats
.num_wr
++;
6078 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6082 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6083 if (ctx
->lock_type
!= RWState::RWNONE
) {
6084 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
6088 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6096 if (oi
.is_cache_pinned()) {
6097 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
6101 if (oi
.is_dirty()) {
6102 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, std::nullopt
);
6103 if (result
== -EINPROGRESS
)
6111 case CEPH_OSD_OP_CACHE_FLUSH
:
6115 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6116 if (ctx
->lock_type
== RWState::RWNONE
) {
6117 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
6121 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6129 if (oi
.is_cache_pinned()) {
6130 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
6135 if (oi
.is_dirty()) {
6136 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, std::nullopt
);
6137 if (result
== -EINPROGRESS
)
6142 // Check special return value which has set missing_return
6143 if (result
== -ENOENT
) {
6144 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
6145 ceph_assert(!missing
.is_min());
6146 wait_for_unreadable_object(missing
, ctx
->op
);
6147 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6153 case CEPH_OSD_OP_CACHE_EVICT
:
6157 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6158 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6166 if (oi
.is_cache_pinned()) {
6167 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
6171 if (oi
.is_dirty()) {
6175 if (!oi
.watchers
.empty()) {
6179 if (soid
.snap
== CEPH_NOSNAP
) {
6180 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
6184 result
= _delete_oid(ctx
, true, false);
6186 // mark that this is a cache eviction to avoid triggering normal
6187 // make_writeable() clone creation in finish_ctx()
6188 ctx
->cache_operation
= true;
6190 osd
->logger
->inc(l_osd_tier_evict
);
6194 case CEPH_OSD_OP_GETXATTR
:
6198 bp
.copy(op
.xattr
.name_len
, aname
);
6199 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6200 string name
= "_" + aname
;
6201 int r
= getattr_maybe_cache(
6206 op
.xattr
.value_len
= osd_op
.outdata
.length();
6208 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
6212 ctx
->delta_stats
.num_rd
++;
6216 case CEPH_OSD_OP_GETXATTRS
:
6219 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6220 map
<string
, bufferlist
> out
;
6221 result
= getattrs_maybe_cache(
6227 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6228 ctx
->delta_stats
.num_rd
++;
6229 osd_op
.outdata
.claim_append(bl
);
6233 case CEPH_OSD_OP_CMPXATTR
:
6237 bp
.copy(op
.xattr
.name_len
, aname
);
6238 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6239 string name
= "_" + aname
;
6240 name
[op
.xattr
.name_len
+ 1] = 0;
6243 result
= getattr_maybe_cache(
6247 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
6250 ctx
->delta_stats
.num_rd
++;
6251 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(xattr
.length(), 10);
6253 switch (op
.xattr
.cmp_mode
) {
6254 case CEPH_OSD_CMPXATTR_MODE_STRING
:
6257 bp
.copy(op
.xattr
.value_len
, val
);
6258 val
[op
.xattr
.value_len
] = 0;
6259 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
6260 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6261 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
6265 case CEPH_OSD_CMPXATTR_MODE_U64
:
6271 catch (ceph::buffer::error
& e
) {
6275 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
6276 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6277 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
6282 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
6287 dout(10) << "comparison returned false" << dendl
;
6288 result
= -ECANCELED
;
6292 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
6296 dout(10) << "comparison returned true" << dendl
;
6300 case CEPH_OSD_OP_ASSERT_VER
:
6303 uint64_t ver
= op
.assert_ver
.ver
;
6304 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
6307 else if (ver
< oi
.user_version
)
6309 else if (ver
> oi
.user_version
)
6310 result
= -EOVERFLOW
;
6314 case CEPH_OSD_OP_LIST_WATCHERS
:
6317 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6318 obj_list_watch_response_t resp
;
6320 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
6321 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
6323 dout(20) << "key cookie=" << oi_iter
->first
.first
6324 << " entity=" << oi_iter
->first
.second
<< " "
6325 << oi_iter
->second
<< dendl
;
6326 ceph_assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
6327 ceph_assert(oi_iter
->first
.second
.is_client());
6329 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
6330 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
6331 resp
.entries
.push_back(wi
);
6334 resp
.encode(osd_op
.outdata
, ctx
->get_features());
6337 ctx
->delta_stats
.num_rd
++;
6341 case CEPH_OSD_OP_LIST_SNAPS
:
6344 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6345 obj_list_snap_response_t resp
;
6348 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
6351 dout(20) << " snapset " << ssc
->snapset
<< dendl
;
6353 int clonecount
= ssc
->snapset
.clones
.size();
6354 clonecount
++; // for head
6355 resp
.clones
.reserve(clonecount
);
6356 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
6357 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
6359 ci
.cloneid
= *clone_iter
;
6361 hobject_t clone_oid
= soid
;
6362 clone_oid
.snap
= *clone_iter
;
6364 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
6365 if (p
== ssc
->snapset
.clone_snaps
.end()) {
6366 osd
->clog
->error() << "osd." << osd
->whoami
6367 << ": inconsistent clone_snaps found for oid "
6368 << soid
<< " clone " << *clone_iter
6369 << " snapset " << ssc
->snapset
;
6373 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
6374 ci
.snaps
.push_back(*q
);
6377 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
6379 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
6380 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
6381 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
6382 osd
->clog
->error() << "osd." << osd
->whoami
6383 << ": inconsistent clone_overlap found for oid "
6384 << soid
<< " clone " << *clone_iter
;
6388 const interval_set
<uint64_t> &o
= coi
->second
;
6389 ci
.overlap
.reserve(o
.num_intervals());
6390 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
6391 r
!= o
.end(); ++r
) {
6392 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
6396 map
<snapid_t
, uint64_t>::const_iterator si
;
6397 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
6398 if (si
== ssc
->snapset
.clone_size
.end()) {
6399 osd
->clog
->error() << "osd." << osd
->whoami
6400 << ": inconsistent clone_size found for oid "
6401 << soid
<< " clone " << *clone_iter
;
6405 ci
.size
= si
->second
;
6407 resp
.clones
.push_back(ci
);
6412 if (!ctx
->obc
->obs
.oi
.is_whiteout()) {
6413 ceph_assert(obs
.exists
);
6415 ci
.cloneid
= CEPH_NOSNAP
;
6417 //Size for HEAD is oi.size
6420 resp
.clones
.push_back(ci
);
6422 resp
.seq
= ssc
->snapset
.seq
;
6424 resp
.encode(osd_op
.outdata
);
6427 ctx
->delta_stats
.num_rd
++;
6431 case CEPH_OSD_OP_NOTIFY
:
6438 uint32_t ver
; // obsolete
6440 decode(timeout
, bp
);
6442 } catch (const ceph::buffer::error
&e
) {
6445 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
6447 timeout
= cct
->_conf
->osd_default_notify_timeout
;
6450 n
.timeout
= timeout
;
6451 n
.notify_id
= osd
->get_next_id(get_osdmap_epoch());
6452 n
.cookie
= op
.notify
.cookie
;
6454 ctx
->notifies
.push_back(n
);
6456 // return our unique notify id to the client
6457 encode(n
.notify_id
, osd_op
.outdata
);
6461 case CEPH_OSD_OP_NOTIFY_ACK
:
6465 uint64_t notify_id
= 0;
6466 uint64_t watch_cookie
= 0;
6467 decode(notify_id
, bp
);
6468 decode(watch_cookie
, bp
);
6469 bufferlist reply_bl
;
6471 decode(reply_bl
, bp
);
6473 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
6474 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
6475 ctx
->notify_acks
.push_back(ack
);
6476 } catch (const ceph::buffer::error
&e
) {
6477 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
6478 OpContext::NotifyAck
ack(
6479 // op.watch.cookie is actually the notify_id for historical reasons
6482 ctx
->notify_acks
.push_back(ack
);
6487 case CEPH_OSD_OP_SETALLOCHINT
:
6491 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
6492 maybe_create_new_object(ctx
);
6493 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
6494 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
6495 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
6496 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
6497 op
.alloc_hint
.expected_write_size
,
6498 op
.alloc_hint
.flags
);
6505 // -- object data --
6507 case CEPH_OSD_OP_WRITE
:
6511 __u32 seq
= oi
.truncate_seq
;
6512 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6513 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6518 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6519 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6521 if (pool
.info
.requires_aligned_append() &&
6522 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
6523 result
= -EOPNOTSUPP
;
6528 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
6529 result
= -EOPNOTSUPP
;
6532 } else if (op
.extent
.offset
!= oi
.size
&&
6533 pool
.info
.requires_aligned_append()) {
6534 result
= -EOPNOTSUPP
;
6538 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
6539 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
6540 // old write, arrived after trimtrunc
6541 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
6542 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
6543 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
6545 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
6546 osd_op
.indata
.swap(t
);
6548 if (op
.extent
.truncate_seq
> seq
) {
6549 // write arrives before trimtrunc
6550 if (obs
.exists
&& !oi
.is_whiteout()) {
6551 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6552 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
6553 t
->truncate(soid
, op
.extent
.truncate_size
);
6554 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6555 oi
.truncate_size
= op
.extent
.truncate_size
;
6556 if (oi
.size
> op
.extent
.truncate_size
) {
6557 interval_set
<uint64_t> trim
;
6558 trim
.insert(op
.extent
.truncate_size
,
6559 oi
.size
- op
.extent
.truncate_size
);
6560 ctx
->modified_ranges
.union_of(trim
);
6561 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.truncate_size
, oi
.size
- op
.extent
.truncate_size
);
6562 oi
.clear_data_digest();
6564 if (op
.extent
.truncate_size
!= oi
.size
) {
6565 truncate_update_size_and_usage(ctx
->delta_stats
,
6567 op
.extent
.truncate_size
);
6570 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6571 << ", but object is new" << dendl
;
6572 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6573 oi
.truncate_size
= op
.extent
.truncate_size
;
6576 result
= check_offset_and_length(
6577 op
.extent
.offset
, op
.extent
.length
,
6578 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6582 maybe_create_new_object(ctx
);
6584 if (op
.extent
.length
== 0) {
6585 if (op
.extent
.offset
> oi
.size
) {
6587 soid
, op
.extent
.offset
);
6588 truncate_update_size_and_usage(ctx
->delta_stats
, oi
,
6595 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6598 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
6599 && !skip_data_digest
) {
6600 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6601 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
6602 if (skip_data_digest
) {
6603 obs
.oi
.clear_data_digest();
6605 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
6608 obs
.oi
.clear_data_digest();
6610 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6611 op
.extent
.offset
, op
.extent
.length
);
6612 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6613 dout(10) << "clean_regions modified" << ctx
->clean_regions
<< dendl
;
6617 case CEPH_OSD_OP_WRITEFULL
:
6620 { // write full object
6621 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
6623 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6627 result
= check_offset_and_length(
6628 0, op
.extent
.length
,
6629 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6633 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6634 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6636 maybe_create_new_object(ctx
);
6637 if (pool
.info
.is_erasure()) {
6638 t
->truncate(soid
, 0);
6639 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
6640 t
->truncate(soid
, op
.extent
.length
);
6642 if (op
.extent
.length
) {
6643 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6645 if (!skip_data_digest
) {
6646 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6648 obs
.oi
.clear_data_digest();
6650 ctx
->clean_regions
.mark_data_region_dirty(0,
6651 std::max((uint64_t)op
.extent
.length
, oi
.size
));
6652 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6653 0, op
.extent
.length
, true);
6657 case CEPH_OSD_OP_WRITESAME
:
6659 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
6660 result
= do_writesame(ctx
, osd_op
);
6663 case CEPH_OSD_OP_ROLLBACK
:
6665 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6666 result
= _rollback_to(ctx
, op
);
6669 case CEPH_OSD_OP_ZERO
:
6670 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6671 if (pool
.info
.requires_aligned_append()) {
6672 result
= -EOPNOTSUPP
;
6677 result
= check_offset_and_length(
6678 op
.extent
.offset
, op
.extent
.length
,
6679 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6683 ceph_assert(op
.extent
.length
);
6684 if (obs
.exists
&& !oi
.is_whiteout()) {
6685 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
6686 interval_set
<uint64_t> ch
;
6687 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
6688 ctx
->modified_ranges
.union_of(ch
);
6689 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6690 ctx
->delta_stats
.num_wr
++;
6691 oi
.clear_data_digest();
6697 case CEPH_OSD_OP_CREATE
:
6701 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6702 if (obs
.exists
&& !oi
.is_whiteout() &&
6703 (op
.flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
6704 result
= -EEXIST
; /* this is an exclusive create */
6706 if (osd_op
.indata
.length()) {
6707 auto p
= osd_op
.indata
.cbegin();
6710 decode(category
, p
);
6712 catch (ceph::buffer::error
& e
) {
6716 // category is no longer implemented.
6718 maybe_create_new_object(ctx
);
6724 case CEPH_OSD_OP_TRIMTRUNC
:
6725 op
.extent
.offset
= op
.extent
.truncate_size
;
6728 case CEPH_OSD_OP_TRUNCATE
:
6729 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6730 if (pool
.info
.requires_aligned_append()) {
6731 result
= -EOPNOTSUPP
;
6738 if (!obs
.exists
|| oi
.is_whiteout()) {
6739 dout(10) << " object dne, truncate is a no-op" << dendl
;
6743 result
= check_offset_and_length(
6744 op
.extent
.offset
, op
.extent
.length
,
6745 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6749 if (op
.extent
.truncate_seq
) {
6750 ceph_assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6751 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6752 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6753 << ", no-op" << dendl
;
6756 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6757 << ", truncating" << dendl
;
6758 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6759 oi
.truncate_size
= op
.extent
.truncate_size
;
6762 maybe_create_new_object(ctx
);
6763 t
->truncate(soid
, op
.extent
.offset
);
6764 if (oi
.size
> op
.extent
.offset
) {
6765 interval_set
<uint64_t> trim
;
6766 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6767 ctx
->modified_ranges
.union_of(trim
);
6768 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, oi
.size
- op
.extent
.offset
);
6769 } else if (oi
.size
< op
.extent
.offset
) {
6770 ctx
->clean_regions
.mark_data_region_dirty(oi
.size
, op
.extent
.offset
- oi
.size
);
6772 if (op
.extent
.offset
!= oi
.size
) {
6773 truncate_update_size_and_usage(ctx
->delta_stats
,
6777 ctx
->delta_stats
.num_wr
++;
6778 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6780 oi
.clear_data_digest();
6784 case CEPH_OSD_OP_DELETE
:
6787 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6789 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6793 case CEPH_OSD_OP_WATCH
:
6797 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6798 op
.watch
.cookie
, op
.watch
.op
);
6804 uint64_t cookie
= op
.watch
.cookie
;
6805 entity_name_t entity
= ctx
->reqid
.name
;
6806 ObjectContextRef obc
= ctx
->obc
;
6808 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6809 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6810 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6811 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6812 dout(10) << "watch: peer_addr="
6813 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6815 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6816 if (op
.watch
.timeout
!= 0) {
6817 timeout
= op
.watch
.timeout
;
6820 watch_info_t
w(cookie
, timeout
,
6821 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6822 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6823 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6824 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6825 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6827 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6828 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6829 t
->nop(soid
); // make sure update the object_info on disk!
6831 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6832 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6833 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
6834 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6838 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6839 ctx
->watch_connects
.push_back(make_pair(w
, true));
6840 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
6841 /* Note: WATCH with PING doesn't cause may_write() to return true,
6842 * so if there is nothing else in the transaction, this is going
6843 * to run do_osd_op_effects, but not write out a log entry */
6844 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6848 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
6849 obc
->watchers
.find(make_pair(cookie
, entity
));
6850 if (p
== obc
->watchers
.end() ||
6851 !p
->second
->is_connected()) {
6852 // client needs to reconnect
6853 result
= -ETIMEDOUT
;
6856 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6857 p
->second
->got_ping(ceph_clock_now());
6859 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
6860 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
6861 oi
.watchers
.find(make_pair(cookie
, entity
));
6862 if (oi_iter
!= oi
.watchers
.end()) {
6863 dout(10) << " removed watch " << oi_iter
->second
<< " by "
6865 oi
.watchers
.erase(oi_iter
);
6866 t
->nop(soid
); // update oi on disk
6867 ctx
->watch_disconnects
.push_back(
6868 watch_disconnect_t(cookie
, entity
, false));
6870 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
6876 case CEPH_OSD_OP_CACHE_PIN
:
6877 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6878 if ((!pool
.info
.is_tier() ||
6879 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6881 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6887 if (!obs
.exists
|| oi
.is_whiteout()) {
6892 if (!oi
.is_cache_pinned()) {
6893 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
6895 ctx
->delta_stats
.num_objects_pinned
++;
6896 ctx
->delta_stats
.num_wr
++;
6901 case CEPH_OSD_OP_CACHE_UNPIN
:
6902 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6903 if ((!pool
.info
.is_tier() ||
6904 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6906 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6912 if (!obs
.exists
|| oi
.is_whiteout()) {
6917 if (oi
.is_cache_pinned()) {
6918 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
6920 ctx
->delta_stats
.num_objects_pinned
--;
6921 ctx
->delta_stats
.num_wr
++;
6926 case CEPH_OSD_OP_SET_REDIRECT
:
6930 if (pool
.info
.is_tier()) {
6938 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
6939 result
= -EOPNOTSUPP
;
6943 object_t target_name
;
6944 object_locator_t target_oloc
;
6945 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
6946 version_t target_version
= op
.copy_from
.src_version
;
6948 decode(target_name
, bp
);
6949 decode(target_oloc
, bp
);
6951 catch (ceph::buffer::error
& e
) {
6956 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
6957 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
6958 raw_pg
.ps(), raw_pg
.pool(),
6959 target_oloc
.nspace
);
6960 if (target
== soid
) {
6961 dout(20) << " set-redirect self is invalid" << dendl
;
6966 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
6967 bool has_reference
= (oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
6968 if (has_reference
) {
6970 dout(5) << " the object is already a manifest " << dendl
;
6973 if (op_finisher
== nullptr && need_reference
) {
6975 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
6976 new SetManifestFinisher(osd_op
));
6977 ManifestOpRef mop
= std::make_shared
<ManifestOp
>(new RefCountCallback(ctx
, osd_op
));
6978 C_SetManifestRefCountDone
* fin
= new C_SetManifestRefCountDone(this, mop
, soid
);
6979 ceph_tid_t tid
= refcount_manifest(soid
, target
,
6980 refcount_t::INCREMENT_REF
, fin
, std::nullopt
);
6981 mop
->objecter_tid
= tid
;
6982 manifest_ops
[soid
] = mop
;
6983 ctx
->obc
->start_block();
6984 result
= -EINPROGRESS
;
6988 result
= op_finisher
->execute();
6989 ceph_assert(result
== 0);
6992 if (!oi
.has_manifest() && !oi
.manifest
.is_redirect())
6993 ctx
->delta_stats
.num_objects_manifest
++;
6995 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
6996 oi
.manifest
.redirect_target
= target
;
6997 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
6998 t
->truncate(soid
, 0);
6999 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
7000 if (oi
.is_omap() && pool
.info
.supports_omap()) {
7001 t
->omap_clear(soid
);
7002 obs
.oi
.clear_omap_digest();
7003 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7004 ctx
->clean_regions
.mark_omap_dirty();
7006 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
7008 ctx
->delta_stats
.num_bytes
-= oi
.size
;
7011 oi
.user_version
= target_version
;
7012 ctx
->user_at_version
= target_version
;
7014 map
<string
,bufferlist
> rmattrs
;
7015 result
= getattrs_maybe_cache(ctx
->obc
, &rmattrs
);
7017 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
7020 map
<string
, bufferlist
>::iterator iter
;
7021 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
7022 const string
& name
= iter
->first
;
7023 t
->rmattr(soid
, name
);
7025 if (!has_reference
&& need_reference
) {
7026 oi
.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7028 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
7030 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7037 case CEPH_OSD_OP_SET_CHUNK
:
7041 if (pool
.info
.is_tier()) {
7049 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7050 result
= -EOPNOTSUPP
;
7053 if (oi
.manifest
.is_redirect()) {
7058 object_locator_t tgt_oloc
;
7059 uint64_t src_offset
, src_length
, tgt_offset
;
7062 decode(src_offset
, bp
);
7063 decode(src_length
, bp
);
7064 decode(tgt_oloc
, bp
);
7065 decode(tgt_name
, bp
);
7066 decode(tgt_offset
, bp
);
7068 catch (ceph::buffer::error
& e
) {
7077 if (src_offset
+ src_length
> oi
.size
) {
7081 if (!(osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
)) {
7082 result
= -EOPNOTSUPP
;
7085 if (pool
.info
.is_erasure()) {
7086 result
= -EOPNOTSUPP
;
7090 for (auto &p
: oi
.manifest
.chunk_map
) {
7091 interval_set
<uint64_t> chunk
;
7092 chunk
.insert(p
.first
, p
.second
.length
);
7093 if (chunk
.intersects(src_offset
, src_length
)) {
7094 dout(20) << __func__
<< " overlapped !! offset: " << src_offset
<< " length: " << src_length
7095 << " chunk_info: " << p
<< dendl
;
7096 result
= -EOPNOTSUPP
;
7102 chunk_info_t chunk_info
;
7103 get_osdmap()->object_locator_to_pg(tgt_name
, tgt_oloc
, raw_pg
);
7104 hobject_t
target(tgt_name
, tgt_oloc
.key
, snapid_t(),
7105 raw_pg
.ps(), raw_pg
.pool(),
7107 bool has_reference
= (oi
.manifest
.chunk_map
.find(src_offset
) != oi
.manifest
.chunk_map
.end()) &&
7108 (oi
.manifest
.chunk_map
[src_offset
].test_flag(chunk_info_t::FLAG_HAS_REFERENCE
));
7109 if (has_reference
) {
7111 dout(5) << " the object is already a manifest " << dendl
;
7114 chunk_info
.oid
= target
;
7115 chunk_info
.offset
= tgt_offset
;
7116 chunk_info
.length
= src_length
;
7117 if (op_finisher
== nullptr) {
7119 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7120 new SetManifestFinisher(osd_op
));
7121 object_manifest_t set_chunk
;
7122 bool need_inc_ref
= false;
7123 set_chunk
.chunk_map
[src_offset
] = chunk_info
;
7124 need_inc_ref
= inc_refcount_by_set(ctx
, set_chunk
, osd_op
);
7126 result
= -EINPROGRESS
;
7131 result
= op_finisher
->execute();
7132 ceph_assert(result
== 0);
7135 oi
.manifest
.chunk_map
[src_offset
] = chunk_info
;
7136 if (!oi
.has_manifest() && !oi
.manifest
.is_chunked())
7137 ctx
->delta_stats
.num_objects_manifest
++;
7138 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7139 oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
7140 if (!has_reference
) {
7141 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_REFERENCE
);
7144 ctx
->cache_operation
= true;
7146 dout(10) << "set-chunked oid:" << oi
.soid
<< " user_version: " << oi
.user_version
7147 << " chunk_info: " << chunk_info
<< dendl
;
7149 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7155 case CEPH_OSD_OP_TIER_PROMOTE
:
7159 if (pool
.info
.is_tier()) {
7167 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7168 result
= -EOPNOTSUPP
;
7171 if (!obs
.oi
.has_manifest()) {
7176 if (op_finisher
== nullptr) {
7177 PromoteManifestCallback
*cb
;
7178 object_locator_t my_oloc
;
7181 if (obs
.oi
.manifest
.is_chunked()) {
7182 src_hoid
= obs
.oi
.soid
;
7183 } else if (obs
.oi
.manifest
.is_redirect()) {
7184 object_locator_t
src_oloc(obs
.oi
.manifest
.redirect_target
);
7186 src_hoid
= obs
.oi
.manifest
.redirect_target
;
7188 ceph_abort_msg("unrecognized manifest type");
7190 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7191 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7192 new PromoteFinisher(cb
));
7193 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
7194 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
7195 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
7196 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
7197 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
7198 start_copy(cb
, ctx
->obc
, src_hoid
, my_oloc
, 0, flags
,
7199 obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
7200 src_fadvise_flags
, 0);
7202 dout(10) << "tier-promote oid:" << oi
.soid
<< " manifest: " << obs
.oi
.manifest
<< dendl
;
7203 result
= -EINPROGRESS
;
7205 result
= op_finisher
->execute();
7206 ceph_assert(result
== 0);
7207 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7213 case CEPH_OSD_OP_TIER_FLUSH
:
7217 if (pool
.info
.is_tier()) {
7225 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7226 result
= -EOPNOTSUPP
;
7229 if (!obs
.oi
.has_manifest()) {
7234 if (oi
.is_dirty()) {
7235 result
= start_flush(ctx
->op
, ctx
->obc
, true, NULL
, std::nullopt
);
7236 if (result
== -EINPROGRESS
)
7245 case CEPH_OSD_OP_TIER_EVICT
:
7249 if (pool
.info
.is_tier()) {
7257 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7258 result
= -EOPNOTSUPP
;
7261 if (!obs
.oi
.has_manifest()) {
7266 // The chunks already has a reference, so it is just enough to invoke truncate if necessary
7267 uint64_t chunk_length
= 0;
7268 for (auto p
: obs
.oi
.manifest
.chunk_map
) {
7269 chunk_length
+= p
.second
.length
;
7271 if (chunk_length
== obs
.oi
.size
) {
7272 for (auto &p
: obs
.oi
.manifest
.chunk_map
) {
7273 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
7276 t
->zero(soid
, 0, oi
.size
);
7277 oi
.clear_data_digest();
7278 ctx
->delta_stats
.num_wr
++;
7279 ctx
->cache_operation
= true;
7281 osd
->logger
->inc(l_osd_tier_evict
);
7286 case CEPH_OSD_OP_UNSET_MANIFEST
:
7290 if (pool
.info
.is_tier()) {
7298 if (!oi
.has_manifest()) {
7299 result
= -EOPNOTSUPP
;
7302 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7303 result
= -EOPNOTSUPP
;
7307 dec_all_refcount_manifest(oi
, ctx
);
7309 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
7310 oi
.manifest
= object_manifest_t();
7311 ctx
->delta_stats
.num_objects_manifest
--;
7312 ctx
->delta_stats
.num_wr
++;
7318 // -- object attrs --
7320 case CEPH_OSD_OP_SETXATTR
:
7324 if (cct
->_conf
->osd_max_attr_size
> 0 &&
7325 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
7326 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7330 unsigned max_name_len
=
7331 std::min
<uint64_t>(osd
->store
->get_max_attr_name_length(),
7332 cct
->_conf
->osd_max_attr_name_len
);
7333 if (op
.xattr
.name_len
> max_name_len
) {
7334 result
= -ENAMETOOLONG
;
7337 maybe_create_new_object(ctx
);
7339 bp
.copy(op
.xattr
.name_len
, aname
);
7340 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7341 string name
= "_" + aname
;
7343 bp
.copy(op
.xattr
.value_len
, bl
);
7344 t
->setattr(soid
, name
, bl
);
7345 ctx
->delta_stats
.num_wr
++;
7349 case CEPH_OSD_OP_RMXATTR
:
7354 bp
.copy(op
.xattr
.name_len
, aname
);
7355 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7356 if (!obs
.exists
|| oi
.is_whiteout()) {
7360 string name
= "_" + aname
;
7361 t
->rmattr(soid
, name
);
7362 ctx
->delta_stats
.num_wr
++;
7367 // -- fancy writers --
7368 case CEPH_OSD_OP_APPEND
:
7370 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
7371 // just do it inline; this works because we are happy to execute
7372 // fancy op on replicas as well.
7373 vector
<OSDOp
> nops(1);
7374 OSDOp
& newop
= nops
[0];
7375 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
7376 newop
.op
.extent
.offset
= oi
.size
;
7377 newop
.op
.extent
.length
= op
.extent
.length
;
7378 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
7379 newop
.indata
= osd_op
.indata
;
7380 result
= do_osd_ops(ctx
, nops
);
7381 osd_op
.outdata
= std::move(newop
.outdata
);
7385 case CEPH_OSD_OP_STARTSYNC
:
7390 // -- trivial map --
7391 case CEPH_OSD_OP_TMAPGET
:
7392 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7393 if (pool
.info
.is_erasure()) {
7394 result
= -EOPNOTSUPP
;
7398 vector
<OSDOp
> nops(1);
7399 OSDOp
& newop
= nops
[0];
7400 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
7401 newop
.op
.extent
.offset
= 0;
7402 newop
.op
.extent
.length
= 0;
7403 result
= do_osd_ops(ctx
, nops
);
7404 osd_op
.outdata
= std::move(newop
.outdata
);
7408 case CEPH_OSD_OP_TMAPPUT
:
7409 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7410 if (pool
.info
.is_erasure()) {
7411 result
= -EOPNOTSUPP
;
7415 //_dout_lock.Lock();
7416 //osd_op.data.hexdump(*_dout);
7417 //_dout_lock.Unlock();
7419 // verify sort order
7420 bool unsorted
= false;
7430 dout(10) << "tmapput key " << key
<< dendl
;
7433 if (key
< last_key
) {
7434 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
7443 vector
<OSDOp
> nops(1);
7444 OSDOp
& newop
= nops
[0];
7445 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
7446 newop
.op
.extent
.offset
= 0;
7447 newop
.op
.extent
.length
= osd_op
.indata
.length();
7448 newop
.indata
= osd_op
.indata
;
7451 bp
= osd_op
.indata
.begin();
7453 map
<string
, bufferlist
> m
;
7456 ceph_assert(bp
.end());
7458 encode(header
, newbl
);
7460 newop
.indata
= newbl
;
7462 result
= do_osd_ops(ctx
, nops
);
7463 ceph_assert(result
== 0);
7467 case CEPH_OSD_OP_TMAPUP
:
7468 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7469 if (pool
.info
.is_erasure()) {
7470 result
= -EOPNOTSUPP
;
7474 result
= do_tmapup(ctx
, bp
, osd_op
);
7477 case CEPH_OSD_OP_TMAP2OMAP
:
7479 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7480 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
7484 case CEPH_OSD_OP_OMAPGETKEYS
:
7488 uint64_t max_return
;
7490 decode(start_after
, bp
);
7491 decode(max_return
, bp
);
7493 catch (ceph::buffer::error
& e
) {
7495 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
7498 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7499 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7501 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
7505 bool truncated
= false;
7507 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7508 ch
, ghobject_t(soid
)
7511 iter
->upper_bound(start_after
);
7512 for (num
= 0; iter
->valid(); ++num
, iter
->next()) {
7513 if (num
>= max_return
||
7514 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7518 encode(iter
->key(), bl
);
7520 } // else return empty out_set
7521 encode(num
, osd_op
.outdata
);
7522 osd_op
.outdata
.claim_append(bl
);
7523 encode(truncated
, osd_op
.outdata
);
7524 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7525 ctx
->delta_stats
.num_rd
++;
7529 case CEPH_OSD_OP_OMAPGETVALS
:
7533 uint64_t max_return
;
7534 string filter_prefix
;
7536 decode(start_after
, bp
);
7537 decode(max_return
, bp
);
7538 decode(filter_prefix
, bp
);
7540 catch (ceph::buffer::error
& e
) {
7542 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
7545 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7546 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7548 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
7551 bool truncated
= false;
7554 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7555 ch
, ghobject_t(soid
)
7561 iter
->upper_bound(start_after
);
7562 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
7565 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
7566 ++num
, iter
->next()) {
7567 dout(20) << "Found key " << iter
->key() << dendl
;
7568 if (num
>= max_return
||
7569 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7573 encode(iter
->key(), bl
);
7574 encode(iter
->value(), bl
);
7576 } // else return empty out_set
7577 encode(num
, osd_op
.outdata
);
7578 osd_op
.outdata
.claim_append(bl
);
7579 encode(truncated
, osd_op
.outdata
);
7580 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7581 ctx
->delta_stats
.num_rd
++;
7585 case CEPH_OSD_OP_OMAPGETHEADER
:
7586 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7587 if (!oi
.is_omap()) {
7588 // return empty header
7593 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
7594 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7595 ctx
->delta_stats
.num_rd
++;
7599 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
7602 set
<string
> keys_to_get
;
7604 decode(keys_to_get
, bp
);
7606 catch (ceph::buffer::error
& e
) {
7608 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7611 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
7612 map
<string
, bufferlist
> out
;
7614 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
7615 } // else return empty omap entries
7616 encode(out
, osd_op
.outdata
);
7617 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7618 ctx
->delta_stats
.num_rd
++;
7622 case CEPH_OSD_OP_OMAP_CMP
:
7625 if (!obs
.exists
|| oi
.is_whiteout()) {
7627 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7630 map
<string
, pair
<bufferlist
, int> > assertions
;
7632 decode(assertions
, bp
);
7634 catch (ceph::buffer::error
& e
) {
7636 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7639 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
7641 map
<string
, bufferlist
> out
;
7645 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7646 i
!= assertions
.end();
7648 to_get
.insert(i
->first
);
7649 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
7655 } // else leave out empty
7657 //Should set num_rd_kb based on encode length of map
7658 ctx
->delta_stats
.num_rd
++;
7662 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7663 i
!= assertions
.end();
7665 auto out_entry
= out
.find(i
->first
);
7666 bufferlist
&bl
= (out_entry
!= out
.end()) ?
7667 out_entry
->second
: empty
;
7668 switch (i
->second
.second
) {
7669 case CEPH_OSD_CMPXATTR_OP_EQ
:
7670 if (!(bl
== i
->second
.first
)) {
7674 case CEPH_OSD_CMPXATTR_OP_LT
:
7675 if (!(bl
< i
->second
.first
)) {
7679 case CEPH_OSD_CMPXATTR_OP_GT
:
7680 if (!(bl
> i
->second
.first
)) {
7698 case CEPH_OSD_OP_OMAPSETVALS
:
7699 if (!pool
.info
.supports_omap()) {
7700 result
= -EOPNOTSUPP
;
7701 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7707 maybe_create_new_object(ctx
);
7708 bufferlist to_set_bl
;
7710 decode_str_str_map_to_bl(bp
, &to_set_bl
);
7712 catch (ceph::buffer::error
& e
) {
7714 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7717 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7718 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 20>()) {
7719 dout(20) << "setting vals: " << dendl
;
7720 map
<string
,bufferlist
> to_set
;
7721 bufferlist::const_iterator pt
= to_set_bl
.begin();
7723 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
7726 dout(20) << "\t" << i
->first
<< dendl
;
7729 t
->omap_setkeys(soid
, to_set_bl
);
7730 ctx
->clean_regions
.mark_omap_dirty();
7731 ctx
->delta_stats
.num_wr
++;
7732 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(to_set_bl
.length(), 10);
7734 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7735 obs
.oi
.clear_omap_digest();
7738 case CEPH_OSD_OP_OMAPSETHEADER
:
7739 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7740 if (!pool
.info
.supports_omap()) {
7741 result
= -EOPNOTSUPP
;
7747 maybe_create_new_object(ctx
);
7748 t
->omap_setheader(soid
, osd_op
.indata
);
7749 ctx
->clean_regions
.mark_omap_dirty();
7750 ctx
->delta_stats
.num_wr
++;
7752 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7753 obs
.oi
.clear_omap_digest();
7756 case CEPH_OSD_OP_OMAPCLEAR
:
7757 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7758 if (!pool
.info
.supports_omap()) {
7759 result
= -EOPNOTSUPP
;
7765 if (!obs
.exists
|| oi
.is_whiteout()) {
7770 t
->omap_clear(soid
);
7771 ctx
->clean_regions
.mark_omap_dirty();
7772 ctx
->delta_stats
.num_wr
++;
7773 obs
.oi
.clear_omap_digest();
7774 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7779 case CEPH_OSD_OP_OMAPRMKEYS
:
7780 if (!pool
.info
.supports_omap()) {
7781 result
= -EOPNOTSUPP
;
7782 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7788 if (!obs
.exists
|| oi
.is_whiteout()) {
7790 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7793 bufferlist to_rm_bl
;
7795 decode_str_set_to_bl(bp
, &to_rm_bl
);
7797 catch (ceph::buffer::error
& e
) {
7799 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7802 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7803 t
->omap_rmkeys(soid
, to_rm_bl
);
7804 ctx
->clean_regions
.mark_omap_dirty();
7805 ctx
->delta_stats
.num_wr
++;
7807 obs
.oi
.clear_omap_digest();
7810 case CEPH_OSD_OP_OMAPRMKEYRANGE
:
7811 tracepoint(osd
, do_osd_op_pre_omaprmkeyrange
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7812 if (!pool
.info
.supports_omap()) {
7813 result
= -EOPNOTSUPP
;
7819 if (!obs
.exists
|| oi
.is_whiteout()) {
7823 std::string key_begin
, key_end
;
7825 decode(key_begin
, bp
);
7826 decode(key_end
, bp
);
7827 } catch (ceph::buffer::error
& e
) {
7831 t
->omap_rmkeyrange(soid
, key_begin
, key_end
);
7832 ctx
->delta_stats
.num_wr
++;
7834 obs
.oi
.clear_omap_digest();
7837 case CEPH_OSD_OP_COPY_GET
:
7839 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
7841 if (op_finisher
== nullptr) {
7842 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
7844 result
= op_finisher
->execute();
7848 case CEPH_OSD_OP_COPY_FROM
:
7849 case CEPH_OSD_OP_COPY_FROM2
:
7854 object_locator_t src_oloc
;
7855 uint32_t truncate_seq
= 0;
7856 uint64_t truncate_size
= 0;
7857 bool have_truncate
= false;
7858 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
7859 version_t src_version
= op
.copy_from
.src_version
;
7861 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
7862 (op
.copy_from
.flags
& ~CEPH_OSD_COPY_FROM_FLAGS
)) {
7863 dout(20) << "invalid copy-from2 flags 0x"
7864 << std::hex
<< (int)op
.copy_from
.flags
<< std::dec
<< dendl
;
7869 decode(src_name
, bp
);
7870 decode(src_oloc
, bp
);
7871 // check if client sent us truncate_seq and truncate_size
7872 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
7873 (op
.copy_from
.flags
& CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ
)) {
7874 decode(truncate_seq
, bp
);
7875 decode(truncate_size
, bp
);
7876 have_truncate
= true;
7879 catch (ceph::buffer::error
& e
) {
7882 do_osd_op_pre_copy_from
,
7883 soid
.oid
.name
.c_str(),
7895 do_osd_op_pre_copy_from
,
7896 soid
.oid
.name
.c_str(),
7898 src_name
.name
.c_str(),
7900 src_oloc
.key
.c_str(),
7901 src_oloc
.nspace
.c_str(),
7905 if (op_finisher
== nullptr) {
7908 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
7909 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
7910 raw_pg
.ps(), raw_pg
.pool(),
7913 dout(20) << " copy from self is invalid" << dendl
;
7917 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
7919 cb
->set_truncate(truncate_seq
, truncate_size
);
7920 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7921 new CopyFromFinisher(cb
));
7922 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
7925 op
.copy_from
.src_fadvise_flags
,
7927 result
= -EINPROGRESS
;
7930 result
= op_finisher
->execute();
7931 ceph_assert(result
== 0);
7933 // COPY_FROM cannot be executed multiple times -- it must restart
7934 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7940 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
7941 dout(1) << "unrecognized osd op " << op
.op
7942 << " " << ceph_osd_op_name(op
.op
)
7944 result
= -EOPNOTSUPP
;
7948 osd_op
.rval
= result
;
7949 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
7950 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
) &&
7951 result
!= -EAGAIN
&& result
!= -EINPROGRESS
)
7958 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
7963 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
7965 if (ctx
->new_obs
.oi
.size
== 0) {
7966 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
7969 vector
<OSDOp
> nops(1);
7970 OSDOp
&newop
= nops
[0];
7971 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
7972 do_osd_ops(ctx
, nops
);
7974 bufferlist::const_iterator i
= newop
.outdata
.begin();
7976 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
7978 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
7982 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
7987 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
7990 // verify that all clones have been evicted
7991 dout(20) << __func__
<< " verifying clones are absent "
7993 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
7994 p
!= ss
.clones
.end();
7996 hobject_t clone_oid
= soid
;
7997 clone_oid
.snap
= *p
;
7998 if (is_missing_object(clone_oid
))
8000 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
8001 if (clone_obc
&& clone_obc
->obs
.exists
) {
8002 dout(10) << __func__
<< " cannot evict head before clone "
8003 << clone_oid
<< dendl
;
8006 if (copy_ops
.count(clone_oid
)) {
8007 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
8008 << clone_oid
<< dendl
;
8015 inline int PrimaryLogPG::_delete_oid(
8017 bool no_whiteout
, // no whiteouts, no matter what.
8018 bool try_no_whiteout
) // try not to whiteout
8020 SnapSet
& snapset
= ctx
->new_snapset
;
8021 ObjectState
& obs
= ctx
->new_obs
;
8022 object_info_t
& oi
= obs
.oi
;
8023 const hobject_t
& soid
= oi
.soid
;
8024 PGTransaction
* t
= ctx
->op_t
.get();
8026 // cache: cache: set whiteout on delete?
8027 bool whiteout
= false;
8028 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
8030 && !try_no_whiteout
) {
8034 // in luminous or later, we can't delete the head if there are
8035 // clones. we trust the caller passing no_whiteout has already
8036 // verified they don't exist.
8037 if (!snapset
.clones
.empty() ||
8038 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
8040 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
8043 dout(20) << __func__
<< " has or will have clones; will whiteout"
8048 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
8049 << " no_whiteout=" << (int)no_whiteout
8050 << " try_no_whiteout=" << (int)try_no_whiteout
8052 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
8058 interval_set
<uint64_t> ch
;
8059 ch
.insert(0, oi
.size
);
8060 ctx
->modified_ranges
.union_of(ch
);
8061 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
8064 ctx
->clean_regions
.mark_omap_dirty();
8065 ctx
->delta_stats
.num_wr
++;
8066 if (soid
.is_snap()) {
8067 ceph_assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
8068 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
8070 ctx
->delta_stats
.num_bytes
-= oi
.size
;
8075 // disconnect all watchers
8076 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
8077 oi
.watchers
.begin();
8078 p
!= oi
.watchers
.end();
8080 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
8081 ctx
->watch_disconnects
.push_back(
8082 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
8084 oi
.watchers
.clear();
8086 if (oi
.has_manifest()) {
8087 ctx
->delta_stats
.num_objects_manifest
--;
8088 dec_all_refcount_manifest(oi
, ctx
);
8092 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
8093 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
8094 ctx
->delta_stats
.num_whiteouts
++;
8096 osd
->logger
->inc(l_osd_tier_whiteout
);
8101 ctx
->delta_stats
.num_objects
--;
8103 ctx
->delta_stats
.num_object_clones
--;
8104 if (oi
.is_whiteout()) {
8105 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
8106 ctx
->delta_stats
.num_whiteouts
--;
8107 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
8109 if (oi
.is_cache_pinned()) {
8110 ctx
->delta_stats
.num_objects_pinned
--;
8116 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, ceph_osd_op
& op
)
8118 SnapSet
& snapset
= ctx
->new_snapset
;
8119 ObjectState
& obs
= ctx
->new_obs
;
8120 object_info_t
& oi
= obs
.oi
;
8121 const hobject_t
& soid
= oi
.soid
;
8122 PGTransaction
* t
= ctx
->op_t
.get();
8123 snapid_t snapid
= (uint64_t)op
.snap
.snapid
;
8124 hobject_t missing_oid
;
8126 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
8128 ObjectContextRef rollback_to
;
8130 int ret
= find_object_context(
8131 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
8132 soid
.get_namespace()),
8133 &rollback_to
, false, false, &missing_oid
);
8134 if (ret
== -EAGAIN
) {
8135 /* clone must be missing */
8136 ceph_assert(is_degraded_or_backfilling_object(missing_oid
) || is_degraded_on_async_recovery_target(missing_oid
));
8137 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8138 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
8139 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
8143 ObjectContextRef promote_obc
;
8144 cache_result_t tier_mode_result
;
8145 if (obs
.exists
&& obs
.oi
.has_manifest()) {
8147 maybe_handle_manifest_detail(
8153 maybe_handle_cache_detail(
8163 switch (tier_mode_result
) {
8164 case cache_result_t::NOOP
:
8166 case cache_result_t::BLOCKED_PROMOTE
:
8167 ceph_assert(promote_obc
);
8168 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
8170 case cache_result_t::BLOCKED_FULL
:
8171 block_write_on_full_cache(soid
, ctx
->op
);
8173 case cache_result_t::REPLIED_WITH_EAGAIN
:
8174 ceph_abort_msg("this can't happen, no rollback on replica");
8176 ceph_abort_msg("must promote was set, other values are not valid");
8181 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
8182 // there's no snapshot here, or there's no object.
8183 // if there's no snapshot, we delete the object; otherwise, do nothing.
8184 dout(20) << "_rollback_to deleting head on " << soid
.oid
8185 << " because got ENOENT|whiteout on find_object_context" << dendl
;
8186 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
8187 // Cannot delete an object with watchers
8190 _delete_oid(ctx
, false, false);
8194 // ummm....huh? It *can't* return anything else at time of writing.
8195 ceph_abort_msg("unexpected error code in _rollback_to");
8196 } else { //we got our context, let's use it to do the rollback!
8197 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
8198 if (is_degraded_or_backfilling_object(rollback_to_sobject
) ||
8199 is_degraded_on_async_recovery_target(rollback_to_sobject
)) {
8200 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8201 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
8202 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
8204 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
8205 // rolling back to the head; we just need to clone it.
8208 /* 1) Delete current head
8209 * 2) Clone correct snapshot into head
8210 * 3) Calculate clone_overlaps by following overlaps
8211 * forward from rollback snapshot */
8212 dout(10) << "_rollback_to deleting " << soid
.oid
8213 << " and rolling back to old snap" << dendl
;
8218 t
->clone(soid
, rollback_to_sobject
);
8219 t
->add_obc(rollback_to
);
8221 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
8222 snapset
.clone_overlap
.lower_bound(snapid
);
8223 ceph_assert(iter
!= snapset
.clone_overlap
.end());
8224 interval_set
<uint64_t> overlaps
= iter
->second
;
8226 iter
!= snapset
.clone_overlap
.end();
8228 overlaps
.intersection_of(iter
->second
);
8230 if (obs
.oi
.size
> 0) {
8231 interval_set
<uint64_t> modified
;
8232 modified
.insert(0, obs
.oi
.size
);
8233 overlaps
.intersection_of(modified
);
8234 modified
.subtract(overlaps
);
8235 ctx
->modified_ranges
.union_of(modified
);
8238 // Adjust the cached objectcontext
8239 maybe_create_new_object(ctx
, true);
8240 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8241 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
8242 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, rollback_to
->obs
.oi
.size
));
8243 ctx
->clean_regions
.mark_omap_dirty();
8244 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
8245 if (rollback_to
->obs
.oi
.is_data_digest())
8246 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
8248 obs
.oi
.clear_data_digest();
8249 if (rollback_to
->obs
.oi
.is_omap_digest())
8250 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
8252 obs
.oi
.clear_omap_digest();
8254 if (rollback_to
->obs
.oi
.is_omap()) {
8255 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8256 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8258 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8259 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
8266 void PrimaryLogPG::_make_clone(
8269 ObjectContextRef obc
,
8270 const hobject_t
& head
, const hobject_t
& coid
,
8274 encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8276 t
->clone(coid
, head
);
8277 setattr_maybe_cache(obc
, t
, OI_ATTR
, bv
);
8278 rmattr_maybe_cache(obc
, t
, SS_ATTR
);
8281 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
8283 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8284 SnapContext
& snapc
= ctx
->snapc
;
8287 ceph_assert(soid
.snap
== CEPH_NOSNAP
);
8288 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
8289 << " snapc=" << snapc
<< dendl
;
8291 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
8292 if (ctx
->new_obs
.exists
) {
8293 // we will mark the object dirty
8294 if (ctx
->undirty
&& was_dirty
) {
8295 dout(20) << " clearing DIRTY flag" << dendl
;
8296 ceph_assert(ctx
->new_obs
.oi
.is_dirty());
8297 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8298 --ctx
->delta_stats
.num_objects_dirty
;
8299 osd
->logger
->inc(l_osd_tier_clean
);
8300 } else if (!was_dirty
&& !ctx
->undirty
) {
8301 dout(20) << " setting DIRTY flag" << dendl
;
8302 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
8303 ++ctx
->delta_stats
.num_objects_dirty
;
8304 osd
->logger
->inc(l_osd_tier_dirty
);
8308 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
8309 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8310 --ctx
->delta_stats
.num_objects_dirty
;
8314 if ((ctx
->new_obs
.exists
&&
8315 ctx
->new_obs
.oi
.is_omap()) &&
8316 (!ctx
->obc
->obs
.exists
||
8317 !ctx
->obc
->obs
.oi
.is_omap())) {
8318 ++ctx
->delta_stats
.num_objects_omap
;
8320 if ((!ctx
->new_obs
.exists
||
8321 !ctx
->new_obs
.oi
.is_omap()) &&
8322 (ctx
->obc
->obs
.exists
&&
8323 ctx
->obc
->obs
.oi
.is_omap())) {
8324 --ctx
->delta_stats
.num_objects_omap
;
8327 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
8328 dout(10) << " op snapset is old" << dendl
;
8331 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
8332 snapc
.snaps
.size() && // there are snaps
8333 !ctx
->cache_operation
&&
8334 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
8336 hobject_t coid
= soid
;
8337 coid
.snap
= snapc
.seq
;
8341 l
< snapc
.snaps
.size() && snapc
.snaps
[l
] > ctx
->new_snapset
.seq
;
8344 vector
<snapid_t
> snaps(l
);
8345 for (unsigned i
=0; i
<l
; i
++)
8346 snaps
[i
] = snapc
.snaps
[i
];
8349 object_info_t
static_snap_oi(coid
);
8350 object_info_t
*snap_oi
;
8352 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
8353 ctx
->clone_obc
->destructor_callback
=
8354 new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
8355 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
8356 ctx
->clone_obc
->obs
.exists
= true;
8357 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
8358 ctx
->clone_obc
->ssc
->ref
++;
8359 if (pool
.info
.is_erasure())
8360 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
8361 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
8362 if (ctx
->obc
->obs
.oi
.has_manifest()) {
8363 if ((ctx
->obc
->obs
.oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
) &&
8364 ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
8365 snap_oi
->set_flag(object_info_t::FLAG_MANIFEST
);
8366 snap_oi
->manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
8367 snap_oi
->manifest
.redirect_target
= ctx
->obc
->obs
.oi
.manifest
.redirect_target
;
8368 } else if (ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
8369 snap_oi
->set_flag(object_info_t::FLAG_MANIFEST
);
8370 snap_oi
->manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
8371 snap_oi
->manifest
.chunk_map
= ctx
->obc
->obs
.oi
.manifest
.chunk_map
;
8373 ceph_abort_msg("unrecognized manifest type");
8376 bool got
= ctx
->lock_manager
.get_write_greedy(
8381 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
8383 snap_oi
= &static_snap_oi
;
8385 snap_oi
->version
= ctx
->at_version
;
8386 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
8387 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
8389 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
8391 ctx
->delta_stats
.num_objects
++;
8392 if (snap_oi
->is_dirty()) {
8393 ctx
->delta_stats
.num_objects_dirty
++;
8394 osd
->logger
->inc(l_osd_tier_dirty
);
8396 if (snap_oi
->is_omap())
8397 ctx
->delta_stats
.num_objects_omap
++;
8398 if (snap_oi
->is_cache_pinned())
8399 ctx
->delta_stats
.num_objects_pinned
++;
8400 if (snap_oi
->has_manifest())
8401 ctx
->delta_stats
.num_objects_manifest
++;
8402 ctx
->delta_stats
.num_object_clones
++;
8403 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
8404 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
8405 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
8407 // clone_overlap should contain an entry for each clone
8408 // (an empty interval_set if there is no overlap)
8409 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
8410 if (ctx
->obs
->oi
.size
)
8411 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
8414 dout(10) << " cloning v " << ctx
->obs
->oi
.version
8415 << " to " << coid
<< " v " << ctx
->at_version
8416 << " snaps=" << snaps
8417 << " snapset=" << ctx
->new_snapset
<< dendl
;
8418 ctx
->log
.push_back(pg_log_entry_t(
8419 pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
8420 ctx
->obs
->oi
.version
,
8421 ctx
->obs
->oi
.user_version
,
8422 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
8423 encode(snaps
, ctx
->log
.back().snaps
);
8425 ctx
->at_version
.version
++;
8428 // update most recent clone_overlap and usage stats
8429 if (ctx
->new_snapset
.clones
.size() > 0) {
8430 // the clone_overlap is difference of range between head and clones.
8431 // we need to check whether the most recent clone exists, if it's
8432 // been evicted, it's not included in the stats, but the clone_overlap
8433 // is still exist in the snapset, so we should update the
8434 // clone_overlap to make it sense.
8435 hobject_t last_clone_oid
= soid
;
8436 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
8437 interval_set
<uint64_t> &newest_overlap
=
8438 ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
8439 ctx
->modified_ranges
.intersection_of(newest_overlap
);
8440 if (is_present_clone(last_clone_oid
)) {
8441 // modified_ranges is still in use by the clone
8442 ctx
->delta_stats
.num_bytes
+= ctx
->modified_ranges
.size();
8444 newest_overlap
.subtract(ctx
->modified_ranges
);
8447 if (snapc
.seq
> ctx
->new_snapset
.seq
) {
8448 // update snapset with latest snap context
8449 ctx
->new_snapset
.seq
= snapc
.seq
;
8450 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
8451 ctx
->new_snapset
.snaps
= snapc
.snaps
;
8453 ctx
->new_snapset
.snaps
.clear();
8456 dout(20) << "make_writeable " << soid
8457 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
8461 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
8462 interval_set
<uint64_t>& modified
, uint64_t offset
,
8463 uint64_t length
, bool write_full
)
8465 interval_set
<uint64_t> ch
;
8468 ch
.insert(0, oi
.size
);
8470 ch
.insert(offset
, length
);
8471 modified
.union_of(ch
);
8473 (offset
+ length
> oi
.size
&& length
)) {
8474 uint64_t new_size
= offset
+ length
;
8475 delta_stats
.num_bytes
-= oi
.size
;
8476 delta_stats
.num_bytes
+= new_size
;
8480 delta_stats
.num_wr
++;
8481 delta_stats
.num_wr_kb
+= shift_round_up(length
, 10);
8484 void PrimaryLogPG::truncate_update_size_and_usage(
8485 object_stat_sum_t
& delta_stats
,
8487 uint64_t truncate_size
)
8489 if (oi
.size
!= truncate_size
) {
8490 delta_stats
.num_bytes
-= oi
.size
;
8491 delta_stats
.num_bytes
+= truncate_size
;
8492 oi
.size
= truncate_size
;
8496 void PrimaryLogPG::complete_disconnect_watches(
8497 ObjectContextRef obc
,
8498 const list
<watch_disconnect_t
> &to_disconnect
)
8500 for (list
<watch_disconnect_t
>::const_iterator i
=
8501 to_disconnect
.begin();
8502 i
!= to_disconnect
.end();
8504 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
8505 auto watchers_entry
= obc
->watchers
.find(watcher
);
8506 if (watchers_entry
!= obc
->watchers
.end()) {
8507 WatchRef watch
= watchers_entry
->second
;
8508 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
8509 obc
->watchers
.erase(watcher
);
8510 watch
->remove(i
->send_disconnect
);
8512 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8513 << watcher
<< dendl
;
8518 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
8520 entity_name_t entity
= ctx
->reqid
.name
;
8521 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
8523 // disconnects first
8524 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
8528 auto session
= conn
->get_priv();
8532 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
8533 i
!= ctx
->watch_connects
.end();
8535 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
8536 dout(15) << "do_osd_op_effects applying watch connect on session "
8537 << session
.get() << " watcher " << watcher
<< dendl
;
8539 if (ctx
->obc
->watchers
.count(watcher
)) {
8540 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8542 watch
= ctx
->obc
->watchers
[watcher
];
8544 dout(15) << "do_osd_op_effects new watcher " << watcher
8546 watch
= Watch::makeWatchRef(
8547 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
8548 i
->first
.cookie
, entity
, conn
->get_peer_addr());
8549 ctx
->obc
->watchers
.insert(
8554 watch
->connect(conn
, i
->second
);
8557 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
8558 p
!= ctx
->notifies
.end();
8560 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
8561 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
8563 Notify::makeNotifyRef(
8565 ctx
->reqid
.name
.num(),
8570 ctx
->obc
->obs
.oi
.user_version
,
8572 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8573 ctx
->obc
->watchers
.begin();
8574 i
!= ctx
->obc
->watchers
.end();
8576 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
8577 i
->second
->start_notify(notif
);
8582 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
8583 p
!= ctx
->notify_acks
.end();
8585 if (p
->watch_cookie
)
8586 dout(10) << "notify_ack " << make_pair(*(p
->watch_cookie
), p
->notify_id
) << dendl
;
8588 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
8589 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8590 ctx
->obc
->watchers
.begin();
8591 i
!= ctx
->obc
->watchers
.end();
8593 if (i
->first
.second
!= entity
) continue;
8594 if (p
->watch_cookie
&&
8595 *(p
->watch_cookie
) != i
->first
.first
) continue;
8596 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
8597 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
8602 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
8605 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
8606 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
8607 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8608 dout(20) << __func__
<< " " << hoid
<< dendl
;
8612 hobject_t
PrimaryLogPG::get_temp_recovery_object(
8613 const hobject_t
& target
,
8617 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
8619 << "_" << info
.history
.same_interval_since
8620 << "_" << target
.snap
;
8621 // pgid + version + interval + snapid is unique, and short
8622 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8623 dout(20) << __func__
<< " " << hoid
<< dendl
;
8627 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
8629 ceph_assert(!ctx
->ops
->empty());
8631 // valid snap context?
8632 if (!ctx
->snapc
.is_valid()) {
8633 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
8637 // prepare the actual mutation
8638 int result
= do_osd_ops(ctx
, *ctx
->ops
);
8640 if (ctx
->op
->may_write() &&
8641 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8642 // need to save the error code in the pg log, to detect dup ops,
8643 // but do nothing else
8644 ctx
->update_log_only
= true;
8649 // read-op? write-op noop? done?
8650 if (ctx
->op_t
->empty() && !ctx
->modify
) {
8651 if (ctx
->pending_async_reads
.empty())
8652 unstable_stats
.add(ctx
->delta_stats
);
8653 if (ctx
->op
->may_write() &&
8654 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8655 ctx
->update_log_only
= true;
8661 if ((ctx
->delta_stats
.num_bytes
> 0 ||
8662 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
8663 pool
.info
.has_flag(pg_pool_t::FLAG_FULL
)) {
8664 auto m
= ctx
->op
->get_req
<MOSDOp
>();
8665 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
8666 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
8667 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
8669 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
8670 // they tried, they failed.
8671 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
8672 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
) ? -EDQUOT
: -ENOSPC
;
8675 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
8680 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8681 // clone, if necessary
8682 if (soid
.snap
== CEPH_NOSNAP
)
8683 make_writeable(ctx
);
8686 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
8687 pg_log_entry_t::DELETE
,
8693 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, int result
)
8695 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8696 dout(20) << __func__
<< " " << soid
<< " " << ctx
8697 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
8699 utime_t now
= ceph_clock_now();
8702 if (ctx
->op
->osd_parent_span
) {
8703 auto finish_ctx_span
= jaeger_tracing::child_span(__func__
, ctx
->op
->osd_parent_span
);
8706 // Drop the reference if deduped chunk is modified
8707 if (ctx
->new_obs
.oi
.is_dirty() &&
8708 (ctx
->obs
->oi
.has_manifest() && ctx
->obs
->oi
.manifest
.is_chunked()) &&
8709 // If a clone is creating, ignore dropping the reference for manifest object
8710 !ctx
->delta_stats
.num_object_clones
&&
8711 ctx
->new_obs
.oi
.size
!= 0 && // missing, redirect and delete
8712 !ctx
->cache_operation
&&
8713 log_op_type
!= pg_log_entry_t::PROMOTE
) {
8714 dec_refcount_by_dirty(ctx
);
8717 // finish and log the op.
8718 if (ctx
->user_modify
) {
8719 // update the user_version for any modify ops, except for the watch op
8720 ctx
->user_at_version
= std::max(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
8721 /* In order for new clients and old clients to interoperate properly
8722 * when exchanging versions, we need to lower bound the user_version
8723 * (which our new clients pay proper attention to)
8724 * by the at_version (which is all the old clients can ever see). */
8725 if (ctx
->at_version
.version
> ctx
->user_at_version
)
8726 ctx
->user_at_version
= ctx
->at_version
.version
;
8727 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
8729 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
8731 if (ctx
->new_obs
.exists
) {
8732 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
8733 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
8734 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
8735 if (ctx
->mtime
!= utime_t()) {
8736 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
8737 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8738 ctx
->new_obs
.oi
.local_mtime
= now
;
8740 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8744 map
<string
, bufferlist
> attrs
;
8745 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
8746 encode(ctx
->new_obs
.oi
, bv
,
8747 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8748 attrs
[OI_ATTR
] = std::move(bv
);
8751 if (soid
.snap
== CEPH_NOSNAP
) {
8752 dout(10) << " final snapset " << ctx
->new_snapset
8753 << " in " << soid
<< dendl
;
8755 encode(ctx
->new_snapset
, bss
);
8756 attrs
[SS_ATTR
] = std::move(bss
);
8758 dout(10) << " no snapset (this is a clone)" << dendl
;
8760 ctx
->op_t
->setattrs(soid
, attrs
);
8763 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
8768 pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
8769 ctx
->obs
->oi
.version
,
8770 ctx
->user_at_version
, ctx
->reqid
,
8772 (ctx
->op
&& ctx
->op
->allows_returnvec()) ? result
: 0));
8773 if (ctx
->op
&& ctx
->op
->allows_returnvec()) {
8774 // also the per-op values
8775 ctx
->log
.back().set_op_returns(*ctx
->ops
);
8776 dout(20) << __func__
<< " op_returns " << ctx
->log
.back().op_returns
8780 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
8781 dout(20) << __func__
<< " object " << soid
<< " marks clean_regions " << ctx
->log
.back().clean_regions
<< dendl
;
8783 if (soid
.snap
< CEPH_NOSNAP
) {
8784 switch (log_op_type
) {
8785 case pg_log_entry_t::MODIFY
:
8786 case pg_log_entry_t::PROMOTE
:
8787 case pg_log_entry_t::CLEAN
:
8788 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
8790 encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
8797 if (!ctx
->extra_reqids
.empty()) {
8798 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< " "
8799 << ctx
->extra_reqid_return_codes
<< dendl
;
8800 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
8801 ctx
->log
.back().extra_reqid_return_codes
.swap(ctx
->extra_reqid_return_codes
);
8804 // apply new object state.
8805 ctx
->obc
->obs
= ctx
->new_obs
;
8807 if (soid
.is_head() && !ctx
->obc
->obs
.exists
) {
8808 ctx
->obc
->ssc
->exists
= false;
8809 ctx
->obc
->ssc
->snapset
= SnapSet();
8811 ctx
->obc
->ssc
->exists
= true;
8812 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
8816 void PrimaryLogPG::apply_stats(
8817 const hobject_t
&soid
,
8818 const object_stat_sum_t
&delta_stats
) {
8820 recovery_state
.apply_op_stats(soid
, delta_stats
);
8821 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
8822 i
!= get_backfill_targets().end();
8825 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
8826 if (soid
> pinfo
.last_backfill
&& soid
<= last_backfill_started
) {
8827 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
8831 m_scrubber
->stats_of_handled_objects(delta_stats
, soid
);
8834 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
8836 auto m
= ctx
->op
->get_req
<MOSDOp
>();
8837 ceph_assert(ctx
->async_reads_complete());
8839 for (auto p
= ctx
->ops
->begin();
8840 p
!= ctx
->ops
->end() && result
>= 0; ++p
) {
8841 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
8845 ctx
->bytes_read
+= p
->outdata
.length();
8847 ctx
->reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
8849 MOSDOpReply
*reply
= ctx
->reply
;
8850 ctx
->reply
= nullptr;
8853 if (!ctx
->ignore_log_op_stats
) {
8854 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
8856 publish_stats_to_osd();
8859 // on read, return the current object version
8861 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
8863 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
8865 } else if (result
== -ENOENT
) {
8866 // on ENOENT, set a floor for what the next user version will be.
8867 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
8870 reply
->set_result(result
);
8871 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
8872 osd
->send_message_osd_client(reply
, m
->get_connection());
8876 // ========================================================================
8879 struct C_Copyfrom
: public Context
{
8882 epoch_t last_peering_reset
;
8884 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
8885 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
8886 const PrimaryLogPG::CopyOpRef
& c
)
8887 : pg(p
), oid(o
), last_peering_reset(lpr
),
8890 void finish(int r
) override
{
8891 if (r
== -ECANCELED
)
8893 std::scoped_lock l
{*pg
};
8894 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8895 pg
->process_copy_chunk(oid
, tid
, r
);
8901 struct C_CopyFrom_AsyncReadCb
: public Context
{
8903 object_copy_data_t reply_obj
;
8906 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
8907 osd_op(osd_op
), features(features
), len(0) {}
8908 void finish(int r
) override
{
8914 ceph_assert(len
> 0);
8915 ceph_assert(len
<= reply_obj
.data
.length());
8917 bl
.substr_of(reply_obj
.data
, 0, len
);
8918 reply_obj
.data
.swap(bl
);
8919 encode(reply_obj
, osd_op
->outdata
, features
);
8923 struct C_CopyChunk
: public Context
{
8926 epoch_t last_peering_reset
;
8928 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
8929 uint64_t offset
= 0;
8930 C_CopyChunk(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
8931 const PrimaryLogPG::CopyOpRef
& c
)
8932 : pg(p
), oid(o
), last_peering_reset(lpr
),
8935 void finish(int r
) override
{
8936 if (r
== -ECANCELED
)
8938 std::scoped_lock l
{*pg
};
8939 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8940 pg
->process_copy_chunk_manifest(oid
, tid
, r
, offset
);
8946 int PrimaryLogPG::do_copy_get(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
8947 OSDOp
& osd_op
, ObjectContextRef
&obc
)
8949 object_info_t
& oi
= obc
->obs
.oi
;
8950 hobject_t
& soid
= oi
.soid
;
8952 object_copy_cursor_t cursor
;
8956 decode(out_max
, bp
);
8958 catch (ceph::buffer::error
& e
) {
8963 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
8964 uint64_t features
= op
->get_features();
8966 bool async_read_started
= false;
8967 object_copy_data_t _reply_obj
;
8968 C_CopyFrom_AsyncReadCb
*cb
= nullptr;
8969 if (pool
.info
.is_erasure()) {
8970 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
8972 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
8974 reply_obj
.size
= oi
.size
;
8975 reply_obj
.mtime
= oi
.mtime
;
8976 ceph_assert(obc
->ssc
);
8977 if (soid
.snap
< CEPH_NOSNAP
) {
8978 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
8979 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
8980 reply_obj
.snaps
= p
->second
;
8982 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
8984 if (oi
.is_data_digest()) {
8985 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
8986 reply_obj
.data_digest
= oi
.data_digest
;
8988 if (oi
.is_omap_digest()) {
8989 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
8990 reply_obj
.omap_digest
= oi
.omap_digest
;
8992 reply_obj
.truncate_seq
= oi
.truncate_seq
;
8993 reply_obj
.truncate_size
= oi
.truncate_size
;
8996 map
<string
,bufferlist
>& out_attrs
= reply_obj
.attrs
;
8997 if (!cursor
.attr_complete
) {
8998 result
= getattrs_maybe_cache(
9007 cursor
.attr_complete
= true;
9008 dout(20) << " got attrs" << dendl
;
9011 int64_t left
= out_max
- osd_op
.outdata
.length();
9014 bufferlist
& bl
= reply_obj
.data
;
9015 if (left
> 0 && !cursor
.data_complete
) {
9016 if (cursor
.data_offset
< oi
.size
) {
9017 uint64_t max_read
= std::min(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
9019 async_read_started
= true;
9020 ctx
->pending_async_reads
.push_back(
9022 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
9023 make_pair(&bl
, cb
)));
9026 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
9027 new ReadFinisher(osd_op
));
9028 result
= -EINPROGRESS
;
9030 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
9032 result
= pgbackend
->objects_read_sync(
9033 oi
.soid
, cursor
.data_offset
, max_read
, osd_op
.op
.flags
, &bl
);
9038 cursor
.data_offset
+= max_read
;
9040 if (cursor
.data_offset
== oi
.size
) {
9041 cursor
.data_complete
= true;
9042 dout(20) << " got data" << dendl
;
9044 ceph_assert(cursor
.data_offset
<= oi
.size
);
9048 uint32_t omap_keys
= 0;
9049 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
9050 cursor
.omap_complete
= true;
9052 if (left
> 0 && !cursor
.omap_complete
) {
9053 ceph_assert(cursor
.data_complete
);
9054 if (cursor
.omap_offset
.empty()) {
9055 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
9056 &reply_obj
.omap_header
);
9058 bufferlist omap_data
;
9059 ObjectMap::ObjectMapIterator iter
=
9060 osd
->store
->get_omap_iterator(ch
, ghobject_t(oi
.soid
));
9062 iter
->upper_bound(cursor
.omap_offset
);
9063 for (; iter
->valid(); iter
->next()) {
9065 encode(iter
->key(), omap_data
);
9066 encode(iter
->value(), omap_data
);
9067 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
9072 encode(omap_keys
, reply_obj
.omap_data
);
9073 reply_obj
.omap_data
.claim_append(omap_data
);
9075 if (iter
->valid()) {
9076 cursor
.omap_offset
= iter
->key();
9078 cursor
.omap_complete
= true;
9079 dout(20) << " got omap" << dendl
;
9084 if (cursor
.is_complete()) {
9085 // include reqids only in the final step. this is a bit fragile
9087 recovery_state
.get_pg_log().get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10,
9089 &reply_obj
.reqid_return_codes
);
9090 dout(20) << " got reqids" << dendl
;
9093 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
9094 << " " << out_attrs
.size() << " attrs"
9095 << " " << bl
.length() << " bytes"
9096 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
9097 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
9098 << omap_keys
<< " keys"
9099 << " " << reply_obj
.reqids
.size() << " reqids"
9101 reply_obj
.cursor
= cursor
;
9102 if (!async_read_started
) {
9103 encode(reply_obj
, osd_op
.outdata
, features
);
9105 if (cb
&& !async_read_started
) {
9115 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
9118 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
9119 uint64_t features
= m
->get_features();
9120 object_copy_data_t reply_obj
;
9122 recovery_state
.get_pg_log().get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
,
9123 &reply_obj
.reqid_return_codes
);
9124 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
9125 encode(reply_obj
, osd_op
.outdata
, features
);
9126 osd_op
.rval
= -ENOENT
;
9127 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
9128 reply
->set_result(-ENOENT
);
9129 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
9130 osd
->send_message_osd_client(reply
, m
->get_connection());
9133 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
9134 hobject_t src
, object_locator_t oloc
,
9135 version_t version
, unsigned flags
,
9136 bool mirror_snapset
,
9137 unsigned src_obj_fadvise_flags
,
9138 unsigned dest_obj_fadvise_flags
)
9140 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
9141 dout(10) << __func__
<< " " << dest
9142 << " from " << src
<< " " << oloc
<< " v" << version
9143 << " flags " << flags
9144 << (mirror_snapset
? " mirror_snapset" : "")
9147 ceph_assert(!mirror_snapset
|| src
.snap
== CEPH_NOSNAP
);
9149 // cancel a previous in-progress copy?
9150 if (copy_ops
.count(dest
)) {
9151 // FIXME: if the src etc match, we could avoid restarting from the
9153 CopyOpRef cop
= copy_ops
[dest
];
9154 vector
<ceph_tid_t
> tids
;
9155 cancel_copy(cop
, false, &tids
);
9156 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9159 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
9160 mirror_snapset
, src_obj_fadvise_flags
,
9161 dest_obj_fadvise_flags
));
9162 copy_ops
[dest
] = cop
;
9165 if (!obc
->obs
.oi
.has_manifest()) {
9166 _copy_some(obc
, cop
);
9168 if (obc
->obs
.oi
.manifest
.is_redirect()) {
9169 _copy_some(obc
, cop
);
9170 } else if (obc
->obs
.oi
.manifest
.is_chunked()) {
9171 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
9172 _copy_some_manifest(obc
, cop
, p
->first
);
9174 ceph_abort_msg("unrecognized manifest type");
9179 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
9181 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9184 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9185 flags
|= CEPH_OSD_FLAG_FLUSH
;
9186 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9187 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9188 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9189 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9190 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9191 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9192 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9193 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9195 C_GatherBuilder
gather(cct
);
9197 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
9199 ceph_assert(cop
->src
.snap
== CEPH_NOSNAP
);
9201 op
.list_snaps(&cop
->results
.snapset
, NULL
);
9202 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9204 flags
, gather
.new_sub(), NULL
);
9205 cop
->objecter_tid2
= tid
;
9209 if (cop
->results
.user_version
) {
9210 op
.assert_version(cop
->results
.user_version
);
9212 // we should learn the version after the first chunk, if we didn't know
9214 ceph_assert(cop
->cursor
.is_initial());
9216 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
9217 &cop
->results
.object_size
, &cop
->results
.mtime
,
9218 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
9219 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
9220 &cop
->results
.flags
,
9221 &cop
->results
.source_data_digest
,
9222 &cop
->results
.source_omap_digest
,
9223 &cop
->results
.reqids
,
9224 &cop
->results
.reqid_return_codes
,
9225 &cop
->results
.truncate_seq
,
9226 &cop
->results
.truncate_size
,
9228 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9230 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
9231 get_last_peering_reset(), cop
);
9232 gather
.set_finisher(new C_OnFinisher(fin
,
9233 osd
->get_objecter_finisher(get_pg_shard())));
9235 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9236 cop
->src
.snap
, NULL
,
9239 // discover the object version if we don't know it yet
9240 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
9242 cop
->objecter_tid
= tid
;
9246 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc
, CopyOpRef cop
, uint64_t start_offset
)
9248 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9251 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9252 flags
|= CEPH_OSD_FLAG_FLUSH
;
9253 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9254 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9255 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9256 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9257 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9258 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9259 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9260 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9263 uint64_t last_offset
= 0, chunks_size
= 0;
9264 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
9265 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
->chunk_map
.find(start_offset
);
9266 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9268 chunks_size
+= iter
->second
.length
;
9269 last_offset
= iter
->first
;
9270 if (get_copy_chunk_size() < chunks_size
) {
9275 cop
->num_chunk
= num_chunks
;
9276 cop
->start_offset
= start_offset
;
9277 cop
->last_offset
= last_offset
;
9278 dout(20) << __func__
<< " oid " << obc
->obs
.oi
.soid
<< " num_chunks: " << num_chunks
9279 << " start_offset: " << start_offset
<< " chunks_size: " << chunks_size
9280 << " last_offset: " << last_offset
<< dendl
;
9282 iter
= manifest
->chunk_map
.find(start_offset
);
9283 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9284 uint64_t obj_offset
= iter
->first
;
9285 uint64_t length
= manifest
->chunk_map
[iter
->first
].length
;
9286 hobject_t soid
= manifest
->chunk_map
[iter
->first
].oid
;
9287 object_locator_t
oloc(soid
);
9288 CopyCallback
* cb
= NULL
;
9289 CopyOpRef
sub_cop(std::make_shared
<CopyOp
>(cb
, ObjectContextRef(), cop
->src
, oloc
,
9290 cop
->results
.user_version
, cop
->flags
, cop
->mirror_snapset
,
9291 cop
->src_obj_fadvise_flags
, cop
->dest_obj_fadvise_flags
));
9292 sub_cop
->cursor
.data_offset
= obj_offset
;
9293 cop
->chunk_cops
[obj_offset
] = sub_cop
;
9295 int s
= sub_cop
->chunk_ops
.size();
9296 sub_cop
->chunk_ops
.resize(s
+1);
9297 sub_cop
->chunk_ops
[s
].op
.op
= CEPH_OSD_OP_READ
;
9298 sub_cop
->chunk_ops
[s
].op
.extent
.offset
= manifest
->chunk_map
[iter
->first
].offset
;
9299 sub_cop
->chunk_ops
[s
].op
.extent
.length
= length
;
9302 op
.dup(sub_cop
->chunk_ops
);
9304 if (cop
->results
.user_version
) {
9305 op
.assert_version(cop
->results
.user_version
);
9307 // we should learn the version after the first chunk, if we didn't know
9309 ceph_assert(cop
->cursor
.is_initial());
9311 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9313 C_CopyChunk
*fin
= new C_CopyChunk(this, obc
->obs
.oi
.soid
,
9314 get_last_peering_reset(), cop
);
9315 fin
->offset
= obj_offset
;
9317 ceph_tid_t tid
= osd
->objecter
->read(
9319 sub_cop
->src
.snap
, NULL
,
9321 new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
9322 // discover the object version if we don't know it yet
9323 sub_cop
->results
.user_version
? NULL
: &sub_cop
->results
.user_version
);
9325 sub_cop
->objecter_tid
= tid
;
9327 dout(20) << __func__
<< " tgt_oid: " << soid
.oid
<< " tgt_offset: "
9328 << manifest
->chunk_map
[iter
->first
].offset
9329 << " length: " << length
<< " pool id: " << oloc
.pool
9330 << " tid: " << tid
<< dendl
;
9332 if (last_offset
< iter
->first
) {
9338 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
9340 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9341 << " " << cpp_strerror(r
) << dendl
;
9342 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9343 if (p
== copy_ops
.end()) {
9344 dout(10) << __func__
<< " no copy_op found" << dendl
;
9347 CopyOpRef cop
= p
->second
;
9348 if (tid
!= cop
->objecter_tid
) {
9349 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
9350 << " tid " << cop
->objecter_tid
<< dendl
;
9354 if (cop
->omap_data
.length() || cop
->omap_header
.length())
9355 cop
->results
.has_omap
= true;
9357 if (r
>= 0 && !pool
.info
.supports_omap() &&
9358 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
9361 cop
->objecter_tid
= 0;
9362 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9363 ObjectContextRef
& cobc
= cop
->obc
;
9368 ceph_assert(cop
->rval
>= 0);
9370 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
9371 // verify snap hasn't been deleted
9372 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
9373 while (p
!= cop
->results
.snaps
.end()) {
9374 // make best effort to sanitize snaps/clones.
9375 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
9376 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
9378 for (vector
<snapid_t
>::iterator q
= p
+ 1;
9379 q
!= cop
->results
.snaps
.end();
9382 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
9387 if (cop
->results
.snaps
.empty()) {
9388 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
9394 ceph_assert(cop
->rval
>= 0);
9396 if (!cop
->temp_cursor
.data_complete
) {
9397 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
9399 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
9400 if (cop
->omap_header
.length()) {
9401 cop
->results
.omap_digest
=
9402 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
9404 if (cop
->omap_data
.length()) {
9406 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
9407 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
9411 if (!cop
->temp_cursor
.attr_complete
) {
9412 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
9413 p
!= cop
->attrs
.end();
9415 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
9420 if (!cop
->cursor
.is_complete()) {
9421 // write out what we have so far
9422 if (cop
->temp_cursor
.is_initial()) {
9423 ceph_assert(!cop
->results
.started_temp_obj
);
9424 cop
->results
.started_temp_obj
= true;
9425 cop
->results
.temp_oid
= generate_temp_object(oid
);
9426 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
9428 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9429 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9430 if (cop
->temp_cursor
.is_initial()) {
9431 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
9433 _write_copy_chunk(cop
, ctx
->op_t
.get());
9434 simple_opc_submit(std::move(ctx
));
9435 dout(10) << __func__
<< " fetching more" << dendl
;
9436 _copy_some(cobc
, cop
);
9441 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
9442 dout(20) << __func__
<< std::hex
9443 << " got digest: rx data 0x" << cop
->results
.data_digest
9444 << " omap 0x" << cop
->results
.omap_digest
9445 << ", source: data 0x" << cop
->results
.source_data_digest
9446 << " omap 0x" << cop
->results
.source_omap_digest
9448 << " flags " << cop
->results
.flags
9451 if (cop
->results
.is_data_digest() &&
9452 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
9453 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
9454 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
9456 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9457 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9458 << " data digest 0x" << cop
->results
.data_digest
9459 << " != source 0x" << cop
->results
.source_data_digest
9464 if (cop
->results
.is_omap_digest() &&
9465 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
9466 derr
<< __func__
<< std::hex
9467 << " omap digest 0x" << cop
->results
.omap_digest
9468 << " != source 0x" << cop
->results
.source_omap_digest
9469 << std::dec
<< dendl
;
9470 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9471 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9472 << " omap digest 0x" << cop
->results
.omap_digest
9473 << " != source 0x" << cop
->results
.source_omap_digest
9478 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
9479 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
9484 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
9485 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
9486 ObjectState
& obs
= cop
->obc
->obs
;
9487 if (cop
->temp_cursor
.is_initial()) {
9488 dout(20) << "fill_in_final_tx: writing "
9489 << "directly to final object" << dendl
;
9490 // write directly to final object
9491 cop
->results
.temp_oid
= obs
.oi
.soid
;
9492 _write_copy_chunk(cop
, t
);
9494 // finish writing to temp object, then move into place
9495 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
9496 if (obs
.oi
.has_manifest() && obs
.oi
.manifest
.is_redirect() && obs
.exists
) {
9497 /* In redirect manifest case, the object exists in the upper tier.
9498 * So, to avoid a conflict when rename() is called, remove existing
9501 t
->remove(obs
.oi
.soid
);
9503 _write_copy_chunk(cop
, t
);
9504 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
9506 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
9509 dout(20) << __func__
<< " success; committing" << dendl
;
9512 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9513 CopyCallbackResults
results(r
, &cop
->results
);
9514 cop
->cb
->complete(results
);
9516 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9519 if (r
< 0 && cop
->results
.started_temp_obj
) {
9520 dout(10) << __func__
<< " deleting partial temp object "
9521 << cop
->results
.temp_oid
<< dendl
;
9522 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9523 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9524 ctx
->op_t
->remove(cop
->results
.temp_oid
);
9525 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
9526 simple_opc_submit(std::move(ctx
));
9529 // cancel and requeue proxy ops on this object
9531 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9534 kick_object_context_blocked(cobc
);
// Completion callback for one chunk read of a manifest (dedup/tiered)
// copy/promote. Records the chunk result; once every outstanding chunk
// op has completed, writes all fetched chunks into the local object in
// a single transaction, then either issues the next batch of chunk
// copies or finishes the overall copy via the registered callback.
// NOTE(review): lossy extract — early returns, error paths, and
// closing braces were dropped by the extraction; the visible
// statements are reproduced verbatim with [elided] markers.
void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  // look up the in-flight copy op for this object
  map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
  if (p == copy_ops.end()) {
    dout(10) << __func__ << " no copy_op found" << dendl;
    // [elided: early return and closing brace]
  CopyOpRef obj_cop = p->second;
  CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
  // stale completion? the op may have been restarted under a new tid
  if (tid != chunk_cop->objecter_tid) {
    dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
	     << " tid " << chunk_cop->objecter_tid << dendl;
    // [elided: early return and closing brace]
  if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
    // [elided: handling of unexpected omap payload]
  chunk_cop->objecter_tid = 0;
  chunk_cop->objecter_tid2 = 0;	 // assume this ordered before us (if it happened)
  ObjectContextRef& cobc = obj_cop->obc;
  OSDOp &chunk_data = chunk_cop->chunk_ops[0];
  // [elided: error guard — presumably `if (r < 0)`; verify upstream]
  obj_cop->failed = true;
  // [elided]
  if (obj_cop->failed) {
    // [elided: skip-processing path]
  if (!chunk_data.outdata.length()) {
    // [elided: zero-length read treated as failure]
  obj_cop->failed = true;
  // [elided]
  obj_cop->num_chunk--;
  /* check all of the copyop are completed */
  if (obj_cop->num_chunk) {
    dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
    // [elided: early return — more chunk ops still in flight]
  OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
  if (!ctx->lock_manager.take_write_lock(
	obj_cop->obc->obs.oi.soid,
	// [elided: obc argument]
    // recovery op can take read lock.
    // so need to wait for recovery completion
    obj_cop->failed = true;
    close_op_ctx(ctx.release());
    // [elided]
  dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
  PGTransaction *t = ctx->op_t.get();
  ObjectState& obs = ctx->new_obs;
  // apply every fetched chunk to the local object in one transaction
  for (auto p : obj_cop->chunk_cops) {
    OSDOp &sub_chunk = p.second->chunk_ops[0];
    t->write(cobc->obs.oi.soid,
	     p.second->cursor.data_offset,
	     sub_chunk.outdata.length(),
	     // [elided: data bufferlist argument]
	     p.second->dest_obj_fadvise_flags);
    dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
	     << " length: " << sub_chunk.outdata.length() << dendl;
    write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
				p.second->cursor.data_offset, sub_chunk.outdata.length());
    // the chunk is now present locally
    obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
    ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
    sub_chunk.outdata.clear();
  // [elided: loop close]
  obs.oi.clear_data_digest();
  ctx->at_version = get_next_version();
  finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
  simple_opc_submit(std::move(ctx));
  auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
  /* check remaining work */
  if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
    if (obj_cop->last_offset >= p->first + p->second.length) {
      for (auto &en : cobc->obs.oi.manifest.chunk_map) {
	if (obj_cop->last_offset < en.first) {
	  // more chunks remain past last_offset — issue the next batch
	  _copy_some_manifest(cobc, obj_cop, en.first);
	  // [elided: return and closing braces]
  dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
  CopyCallbackResults results(r, &obj_cop->results);
  obj_cop->cb->complete(results);
  copy_ops.erase(cobc->obs.oi.soid);
  // cancel and requeue proxy ops on this object
  cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
  kick_object_context_blocked(cobc);
// Cancel every in-flight proxy read and proxy write that targets `oid`,
// collect their objecter tids, cancel those in a single objecter call,
// and wake any client ops blocked behind the proxied ops.
// NOTE(review): lossy extract — the else/advance branches and closing
// braces of both loops were dropped; visible statements are verbatim.
void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
  vector<ceph_tid_t> tids;
  // sweep proxy reads; the post-increment hands the iterator's entry to
  // the cancel call before it is invalidated (presumably the callee
  // erases it from the map — verify against upstream)
  for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
       it != proxyread_ops.end();) {
    if (it->second->soid == oid) {
      cancel_proxy_read((it++)->second, &tids);
      // [elided: else ++it, closing braces]
  // same sweep for proxy writes
  for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
       it != proxywrite_ops.end();) {
    if (it->second->soid == oid) {
      cancel_proxy_write((it++)->second, &tids);
      // [elided: else ++it, closing braces]
  // cancel all collected objecter ops in one shot
  osd->objecter->op_cancel(tids, -ECANCELED);
  kick_proxy_ops_blocked(oid);
// Append the pieces accumulated in `cop` by one copy-get round trip
// (attrs, data, omap header/keys) to transaction `t` against the temp
// object, trimming any unaligned tail for alignment-constrained pools,
// then advance temp_cursor to match cursor.
// NOTE(review): lossy extract — several call sites' leading tokens
// (t->write(, t->omap_setheader(), local declarations) and closing
// braces were dropped; visible statements are reproduced verbatim.
void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
{
  dout(20) << __func__ << " " << cop
	   << " " << cop->attrs.size() << " attrs"
	   << " " << cop->data.length() << " bytes"
	   << " " << cop->omap_header.length() << " omap header bytes"
	   << " " << cop->omap_data.length() << " omap data bytes"
	   // [elided: << dendl terminator]
  if (!cop->temp_cursor.attr_complete) {
    // first chunk: create the temp object before writing into it
    t->create(cop->results.temp_oid);
    // [elided: attr application and closing brace]
  if (!cop->temp_cursor.data_complete) {
    ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
		cop->cursor.data_offset);
    if (pool.info.required_alignment() &&
	!cop->cursor.data_complete) {
      /*
       * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
       * to pick it up on the next pass.
       */
      ceph_assert(cop->temp_cursor.data_offset %
		  pool.info.required_alignment() == 0);
      if (cop->data.length() % pool.info.required_alignment() != 0) {
	// [elided: `uint64_t to_trim =` lead-in]
	  cop->data.length() % pool.info.required_alignment();
	// [elided: local bufferlist declaration]
	bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
	// [elided: swap trimmed buffer back into cop->data]
	cop->cursor.data_offset -= to_trim;
	// [elided: closing brace]
      // re-check the invariant after trimming
      ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
		  cop->cursor.data_offset);
      // [elided: closing brace]
    if (cop->data.length()) {
      // [elided: `t->write(` lead-in]
	       cop->results.temp_oid,
	       cop->temp_cursor.data_offset,
	       // [elided: length and data arguments]
	       cop->dest_obj_fadvise_flags);
      // [elided: clear cop->data, closing braces]
  if (pool.info.supports_omap()) {
    if (!cop->temp_cursor.omap_complete) {
      if (cop->omap_header.length()) {
	// [elided: `t->omap_setheader(` lead-in]
		  cop->results.temp_oid,
		  // [elided: header argument]
	cop->omap_header.clear();
	// [elided: closing brace]
      if (cop->omap_data.length()) {
	map<string,bufferlist> omap;
	bufferlist::const_iterator p = cop->omap_data.begin();
	// [elided: decode omap from p]
	t->omap_setkeys(cop->results.temp_oid, omap);
	cop->omap_data.clear();
	// [elided: closing braces]
  // [elided: else branch guard — pool without omap support; verify]
    // pools without omap support must never have received omap payload
    ceph_assert(cop->omap_header.length() == 0);
    ceph_assert(cop->omap_data.length() == 0);
  // [elided: closing brace]
  // everything staged so far is now reflected in the temp object
  cop->temp_cursor = cop->cursor;
// Apply the results of a completed copy-from to the op context: install
// the final transaction, carry over digests, truncate info, mtime,
// omap/whiteout flags, modified ranges, and per-op stats.
// NOTE(review): lossy extract — some guards (e.g. an `if (obs.exists)`
// around the remove) and else branches were dropped; the visible
// statements are reproduced verbatim with [elided] markers.
void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
{
  OpContext *ctx = cb->ctx;
  dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
  ObjectState& obs = ctx->new_obs;
  // [elided: guard — presumably `if (obs.exists)`; verify upstream]
  dout(20) << __func__ << ": exists, removing" << dendl;
  ctx->op_t->remove(obs.oi.soid);
  // [elided: else branch — new object]
  ctx->delta_stats.num_objects++;
  if (cb->is_temp_obj_used()) {
    ctx->discard_temp_oid = cb->results->temp_oid;
  // [elided: closing brace]
  cb->results->fill_in_final_tx(ctx->op_t.get());
  // CopyFromCallback fills this in for us
  obs.oi.user_version = ctx->user_at_version;
  if (cb->results->is_data_digest()) {
    obs.oi.set_data_digest(cb->results->data_digest);
  // [elided: else]
    obs.oi.clear_data_digest();
  if (cb->results->is_omap_digest()) {
    obs.oi.set_omap_digest(cb->results->omap_digest);
  // [elided: else]
    obs.oi.clear_omap_digest();
  obs.oi.truncate_seq = cb->truncate_seq;
  obs.oi.truncate_size = cb->truncate_size;
  obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
  ctx->mtime = utime_t();
  ctx->extra_reqids = cb->results->reqids;
  ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
  // cache: clear whiteout?
  if (obs.oi.is_whiteout()) {
    dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
    --ctx->delta_stats.num_whiteouts;
  // [elided: closing brace]
  if (cb->results->has_omap) {
    dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
    obs.oi.set_flag(object_info_t::FLAG_OMAP);
    ctx->clean_regions.mark_omap_dirty();
  // [elided: else]
    dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_OMAP);
  // record the whole (old) extent as modified/dirty
  interval_set<uint64_t> ch;
  if (obs.oi.size > 0)
    ch.insert(0, obs.oi.size);
  ctx->modified_ranges.union_of(ch);
  ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
  // adjust byte accounting if the object changed size
  if (cb->get_data_size() != obs.oi.size) {
    ctx->delta_stats.num_bytes -= obs.oi.size;
    obs.oi.size = cb->get_data_size();
    ctx->delta_stats.num_bytes += obs.oi.size;
  ctx->delta_stats.num_wr++;
  ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
  osd->logger->inc(l_osd_copyfrom);
// Completion of a cache-tier promote: `results` holds what was copied
// from the base tier. Installs the data as the local object (or a
// whiteout on ENOENT for a head), repairs clone snap lists, removes
// trimmed clones from the snapset, and updates stats and perf counters.
// NOTE(review): lossy extract — early returns, else branches, some
// arguments and closing braces were dropped; visible statements are
// reproduced verbatim with [elided] markers.
void PrimaryLogPG::finish_promote(int r, CopyResults *results,
				  ObjectContextRef obc)
{
  const hobject_t& soid = obc->obs.oi.soid;
  dout(10) << __func__ << " " << soid << " r=" << r
	   << " uv" << results->user_version << dendl;
  if (r == -ECANCELED) {
    // [elided: early return]
  if (r != -ENOENT && soid.is_snap()) {
    if (results->snaps.empty()) {
      // we must have read "snap" content from the head object in the
      // base pool. use snap_seq to construct what snaps should be
      // for this clone (what it was before we evicted the clean clone
      // from this pool, and what it will be when we flush and the
      // clone eventually happens in the base pool). we want to use
      // snaps in (results->snap_seq,soid.snap]
      SnapSet& snapset = obc->ssc->snapset;
      for (auto p = snapset.clone_snaps.rbegin();
	   p != snapset.clone_snaps.rend();
	   // [elided: ++p) {]
	for (auto snap : p->second) {
	  if (snap > soid.snap) {
	    // [elided: continue — newer than this clone]
	  if (snap <= results->snap_seq) {
	    // [elided: break — outside the wanted interval]
	  results->snaps.push_back(snap);
	  // [elided: closing braces]
    dout(20) << __func__ << " snaps " << results->snaps << dendl;
    filter_snapc(results->snaps);
    dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
    if (results->snaps.empty()) {
      dout(20) << __func__
	       << " snaps are empty, clone is invalid,"
	       << " setting r to ENOENT" << dendl;
      // [elided: r = -ENOENT]
  // abort path: clean up any partially written temp object
  if (r < 0 && results->started_temp_obj) {
    dout(10) << __func__ << " abort; will clean up partial work" << dendl;
    ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
    ceph_assert(tempobc);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    ctx->op_t->remove(results->temp_oid);
    simple_opc_submit(std::move(ctx));
    results->started_temp_obj = false;
  // [elided: closing brace]
  if (r == -ENOENT && soid.is_snap()) {
    dout(10) << __func__
	     << ": enoent while trying to promote clone, " << soid
	     << " must have been trimmed, removing from snapset"
	     // [elided: << dendl terminator]
    hobject_t head(soid.get_head());
    ObjectContextRef obc = get_object_context(head, false);
    // [elided: assertion on obc]
    OpContextUPtr tctx = simple_opc_create(obc);
    tctx->at_version = get_next_version();
    if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
      filter_snapc(tctx->new_snapset.snaps);
    // [elided: else]
      tctx->new_snapset.snaps.clear();
    // rebuild clone lists without the trimmed clone
    vector<snapid_t> new_clones;
    map<snapid_t, vector<snapid_t>> new_clone_snaps;
    for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
	 i != tctx->new_snapset.clones.end();
	 // [elided: ++i) {]
      if (*i != soid.snap) {
	new_clones.push_back(*i);
	auto p = tctx->new_snapset.clone_snaps.find(*i);
	if (p != tctx->new_snapset.clone_snaps.end()) {
	  new_clone_snaps[*i] = p->second;
	  // [elided: closing braces]
    tctx->new_snapset.clones.swap(new_clones);
    tctx->new_snapset.clone_overlap.erase(soid.snap);
    tctx->new_snapset.clone_size.erase(soid.snap);
    tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
    // take RWWRITE lock for duration of our local write. ignore starvation.
    if (!tctx->lock_manager.take_write_lock(
	  // [elided: head/obc arguments]
      ceph_abort_msg("problem!");
    // [elided: closing brace]
    dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
    finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
    simple_opc_submit(std::move(tctx));
    // [elided: return]
  bool whiteout = false;
  // [elided: guard — presumably `if (r == -ENOENT)`; verify upstream]
  ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
  dout(10) << __func__ << " whiteout " << soid << dendl;
  // [elided: whiteout = true and closing brace]
  if (r < 0 && !whiteout) {
    derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
    // pass error to everyone blocked on this object
    // FIXME: this is pretty sloppy, but at this point we got
    // something unexpected and don't have many other options.
    map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
      waiting_for_blocked_object.find(soid);
    if (blocked_iter != waiting_for_blocked_object.end()) {
      while (!blocked_iter->second.empty()) {
	osd->reply_op_error(blocked_iter->second.front(), r);
	blocked_iter->second.pop_front();
      // [elided: closing brace]
      waiting_for_blocked_object.erase(blocked_iter);
    // [elided: return]
  osd->promote_finish(results->object_size);
  OpContextUPtr tctx = simple_opc_create(obc);
  tctx->at_version = get_next_version();
  if (!obc->obs.oi.has_manifest()) {
    ++tctx->delta_stats.num_objects;
  // [elided: closing brace]
  if (soid.snap < CEPH_NOSNAP)
    ++tctx->delta_stats.num_object_clones;
  tctx->new_obs.exists = true;
  tctx->extra_reqids = results->reqids;
  tctx->extra_reqid_return_codes = results->reqid_return_codes;
  // promoting over a redirect manifest: drop the redirect and its ref
  if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
    tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
    tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
    tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
    tctx->new_obs.oi.manifest.redirect_target = hobject_t();
    tctx->delta_stats.num_objects_manifest--;
    if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
      dec_all_refcount_manifest(obc->obs.oi, tctx.get());
  // [elided: `if (whiteout)` branch guard follows; verify upstream]
    // create a whiteout
    tctx->op_t->create(soid);
    tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
    ++tctx->delta_stats.num_whiteouts;
    dout(20) << __func__ << " creating whiteout on " << soid << dendl;
    osd->logger->inc(l_osd_tier_whiteout);
  // [elided: else branch — object promoted with real data]
    if (results->has_omap) {
      dout(10) << __func__ << " setting omap flag on " << soid << dendl;
      tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
      ++tctx->delta_stats.num_objects_omap;
    // [elided: closing brace]
    results->fill_in_final_tx(tctx->op_t.get());
    if (results->started_temp_obj) {
      tctx->discard_temp_oid = results->temp_oid;
    // [elided: closing brace]
    tctx->new_obs.oi.size = results->object_size;
    tctx->new_obs.oi.user_version = results->user_version;
    tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
    tctx->mtime = utime_t();
    if (results->is_data_digest()) {
      tctx->new_obs.oi.set_data_digest(results->data_digest);
    // [elided: else]
      tctx->new_obs.oi.clear_data_digest();
    if (results->object_size)
      tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
    if (results->is_omap_digest()) {
      tctx->new_obs.oi.set_omap_digest(results->omap_digest);
    // [elided: else]
      tctx->new_obs.oi.clear_omap_digest();
    if (results->has_omap)
      tctx->clean_regions.mark_omap_dirty();
    tctx->new_obs.oi.truncate_seq = results->truncate_seq;
    tctx->new_obs.oi.truncate_size = results->truncate_size;
    if (soid.snap != CEPH_NOSNAP) {
      // a promoted clone must agree with the snapset bookkeeping
      ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
      ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
      ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
		  results->object_size);
      ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
      // [elided]
      tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
    // [elided: else]
      tctx->delta_stats.num_bytes += results->object_size;
  if (results->mirror_snapset) {
    ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
    tctx->new_snapset.from_snap_set(
      // [elided: snapset argument]
      get_osdmap()->require_osd_release < ceph_release_t::luminous);
  // [elided: closing brace]
  dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
  // take RWWRITE lock for duration of our local write. ignore starvation.
  if (!tctx->lock_manager.take_write_lock(
	// [elided: obc arguments]
    ceph_abort_msg("problem!");
  // [elided: closing brace]
  dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
  finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
  simple_opc_submit(std::move(tctx));
  osd->logger->inc(l_osd_tier_promote);
  // [elided: guard — presumably `if (agent_state &&`]
      agent_state->is_idle())
    agent_choose_mode();
// Completion of a manifest (dedup-tier) promote: on unexpected error,
// propagate the error to all ops blocked on the object; on success,
// notify the OSD promote throttle and bump perf counters.
// NOTE(review): lossy extract — early returns and some guards were
// dropped; visible statements are reproduced verbatim.
void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
					   ObjectContextRef obc)
{
  const hobject_t& soid = obc->obs.oi.soid;
  dout(10) << __func__ << " " << soid << " r=" << r
	   << " uv" << results->user_version << dendl;
  if (r == -ECANCELED || r == -EAGAIN) {
    // [elided: early return]
  // [elided: error guard — presumably `if (r < 0)`; verify upstream]
  derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
  // pass error to everyone blocked on this object
  // FIXME: this is pretty sloppy, but at this point we got
  // something unexpected and don't have many other options.
  map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
    waiting_for_blocked_object.find(soid);
  if (blocked_iter != waiting_for_blocked_object.end()) {
    while (!blocked_iter->second.empty()) {
      osd->reply_op_error(blocked_iter->second.front(), r);
      blocked_iter->second.pop_front();
    // [elided: closing brace]
    waiting_for_blocked_object.erase(blocked_iter);
  // [elided: closing braces / return]
  osd->promote_finish(results->object_size);
  osd->logger->inc(l_osd_tier_promote);
  // [elided: guard — presumably `if (agent_state &&`]
      agent_state->is_idle())
    agent_choose_mode();
// Abort a single in-flight copy op: collect its objecter tids for
// cancellation by the caller, unblock the object, report -ECANCELED
// (with should_requeue as requested) to the registered callback, and
// drop the obc reference held by the op.
// NOTE(review): lossy extract — some closing braces were dropped;
// visible statements are reproduced verbatim.
void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
			       vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << " " << cop->obc->obs.oi.soid
	   << " from " << cop->src << " " << cop->oloc
	   << " v" << cop->results.user_version << dendl;

  // cancel objecter op, if we can
  if (cop->objecter_tid) {
    tids->push_back(cop->objecter_tid);
    cop->objecter_tid = 0;
    if (cop->objecter_tid2) {
      tids->push_back(cop->objecter_tid2);
      cop->objecter_tid2 = 0;
      // [elided: closing braces]
  copy_ops.erase(cop->obc->obs.oi.soid);
  cop->obc->stop_block();
  kick_object_context_blocked(cop->obc);
  cop->results.should_requeue = requeue;
  CopyCallbackResults result(-ECANCELED, &cop->results);
  cop->cb->complete(result);
  // [elided: cb cleanup]

  // There may still be an objecter callback referencing this copy op.
  // That callback will not need the obc since it's been canceled, and
  // we need the obc reference to go away prior to flush.
  cop->obc = ObjectContextRef();
// Cancel every in-flight copy op on this PG, delegating each one to
// cancel_copy(); the post-increment keeps the iterator valid while the
// callee erases the map entry.
// NOTE(review): lossy extract — the loop/function closing braces were
// dropped; visible statements are reproduced verbatim.
void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << dendl;
  map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
  while (p != copy_ops.end()) {
    // requeue this op? can I queue up all of them?
    cancel_copy((p++)->second, requeue, tids);
10126 // ========================================================================
10129 // Flush a dirty object in the cache tier by writing it back to the
10130 // base tier. The sequence looks like:
10132 // * send a copy-from operation to the base tier to copy the current
10133 // version of the object
10134 // * base tier will pull the object via (perhaps multiple) copy-get(s)
10135 // * on completion, we check if the object has been modified. if so,
10136 // just reply with -EAGAIN.
10137 // * try to take a write lock so we can clear the dirty flag. if this
10138 // fails, wait and retry
10139 // * start a repop that clears the bit.
10141 // If we have to wait, we will retry by coming back through the
10142 // start_flush method. We check if a flush is already in progress
10143 // and, if so, try to finish it by rechecking the version and trying
10144 // to clear the dirty bit.
10146 // In order for the cache-flush (a write op) to not block the copy-get
10147 // from reading the object, the client *must* set the SKIPRWLOCKS
10150 // NOTE: normally writes are strictly ordered for the client, but
10151 // flushes are special in that they can be reordered with respect to
10152 // other writes. In particular, we can't have a flush request block
10153 // an update to the cache pool object!
// Continuation for a cache-tier flush: when the flush completes, call
// back into the PG under its lock (unless canceled or the PG has
// re-peered since dispatch) and record the flush latency.
// NOTE(review): lossy extract — some member declarations (oid, tid,
// start), the ctor body and closing braces were dropped; visible
// statements are reproduced verbatim.
struct C_Flush : public Context {
  PrimaryLogPGRef pg;		// ref-counted handle keeps the PG alive
  // [elided: hobject_t oid member]
  epoch_t last_peering_reset;	// epoch captured at dispatch time
  // [elided: ceph_tid_t tid; utime_t start members]
  C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), start(ceph_clock_now())
  // [elided: ctor body]
  void finish(int r) override {
    if (r == -ECANCELED)
      // [elided: return]
    std::scoped_lock locker{*pg};
    // ignore the completion if the PG re-peered after we were queued
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->finish_flush(oid, tid, r);
      pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
// Kick off dedup of `obc`: CDC-chunk the object, and for each chunk
// that is not already referenced by an adjacent clone, issue a
// CREATE_OR_GET_REF refcount op to the dedup tier. All issued chunk
// ops are tracked by a ManifestOp; returns -EINPROGRESS while they are
// outstanding.
// NOTE(review): lossy extract — early-return bodies, some guards and
// closing braces were dropped; visible statements are verbatim.
int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
{
  const object_info_t& oi = obc->obs.oi;
  const hobject_t& soid = oi.soid;
  ceph_assert(obc->is_blocked());
  if (oi.size == 0) {
    // [elided: empty object — nothing to dedup; early-exit path]
  if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
    dout(0) << " fingerprint algorithm is not set " << dendl;
    // [elided: error return]
  /*
   * The operations to make dedup chunks are tracked by a ManifestOp.
   * This op will be finished if all the operations are completed.
   */
  ManifestOpRef mop(std::make_shared<ManifestOp>(nullptr));
  // chunk_map receives boundaries + fingerprint oids; chunks the raw data
  std::map<uint64_t, bufferlist> chunks;
  int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
  // [elided: r < 0 error check]
  if (!chunks.size()) {
    // [elided: early return]
  // chunks issued here are different with chunk_map newly generated
  // because the same chunks in previous snap will not be issued
  // So, we need two data structures; the first is the issued chunk list to track
  // issued operations, and the second is the new chunk_map to update chunk_map after
  // all operations are finished
  object_ref_delta_t refs;
  ObjectContextRef obc_l, obc_g;
  get_adjacent_clones(obc, obc_l, obc_g);
  // skip if the same content exists in prev snap at same offset
  mop->new_manifest.calc_refs_to_inc_on_set(
    obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
    obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
    // [elided: refs argument]
  for (auto p : chunks) {
    hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
    if (refs.find(target) == refs.end()) {
      // [elided: continue — no new reference needed for this chunk]
    // one refcount op per chunk; completion lands in finish_set_dedup()
    C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
    ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
				       fin, move(chunks[p.first]));
    mop->chunks[target] = make_pair(p.first, p.second.length());
    // [elided: num_chunks accounting]
    mop->tids[p.first] = tid;
    dout(10) << __func__ << " oid: " << soid << " tid: " << tid
	     << " target: " << target << " offset: " << p.first
	     << " length: " << p.second.length() << dendl;
    // [elided: loop close]
  if (mop->tids.size()) {
    manifest_ops[soid] = mop;
    manifest_ops[soid]->op = op;
  // [elided: else branch — nothing issued]
  return -EINPROGRESS;
// Content-defined chunking of the whole object: read it synchronously,
// run the pool's configured CDC algorithm, and fill `chunk_map` (chunk
// boundary -> chunk_info with fingerprint oid) and `chunks` (boundary
// -> raw chunk data). Returns the total chunked length.
// NOTE(review): lossy extract — error guards, local declarations (bl,
// chunk) and closing braces were dropped; visible statements verbatim.
int PrimaryLogPG::do_cdc(const object_info_t& oi,
			 std::map<uint64_t, chunk_info_t>& chunk_map,
			 std::map<uint64_t, bufferlist>& chunks)
{
  string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
  int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
  uint64_t total_length = 0;

  std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
  // [elided: null-cdc guard]
  dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
  // [elided: error return and closing brace]
  /*
   * We disable EC pool as a base tier of distributed dedup.
   * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
   * Therefore, we should change the current implementation totally to make EC pool compatible.
   * As a result, we leave this as a future work.
   */
  // [elided: bufferlist bl declaration]
  int r = pgbackend->objects_read_sync(
    oi.soid, 0, oi.size, 0, &bl);
  // [elided: r < 0 guard]
  dout(0) << __func__ << " read fail " << oi.soid
	  << " len: " << oi.size << " r: " << r << dendl;
  // [elided: error return]
  if (bl.length() != oi.size) {
    dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
	    << oi.size << " during chunking " << dendl;
    // [elided: error return]
  dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
	   << " oi.size: " << oi.size
	   << " chunk_size: " << chunk_size << dendl;

  vector<pair<uint64_t, uint64_t>> cdc_chunks;
  cdc->calc_chunks(bl, &cdc_chunks);
  // each (offset, length) pair becomes one dedup chunk
  for (auto p : cdc_chunks) {
    // [elided: bufferlist chunk declaration]
    chunk.substr_of(bl, p.first, p.second);
    hobject_t target = get_fpoid_from_chunk(oi.soid, chunk);
    chunks[p.first] = move(chunk);
    chunk_map[p.first] = chunk_info_t(0, p.second, target);
    total_length += p.second;
  // [elided: loop close]
  return total_length;
// Map a chunk's content to the fingerprint object (in the dedup tier)
// that will store it: hash the chunk with the pool's fingerprint
// algorithm and build an hobject_t located in the dedup-tier pool.
// NOTE(review): lossy extract — the switch header, default label,
// lambda close, raw_pg declaration and final return were dropped;
// visible statements are reproduced verbatim.
hobject_t
PrimaryLogPG::get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk)
{
  pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
  if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
    return hobject_t();
  // [elided: closing brace]
  // fingerprint oid = hex digest of the chunk content
  object_t fp_oid = [&fp_algo, &chunk]() -> string {
    // [elided: switch (fp_algo) { header]
    case pg_pool_t::TYPE_FINGERPRINT_SHA1:
      return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
    case pg_pool_t::TYPE_FINGERPRINT_SHA256:
      return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
    case pg_pool_t::TYPE_FINGERPRINT_SHA512:
      return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
    // [elided: default label]
    assert(0 == "unrecognized fingerprint type");
    // [elided: lambda close and immediate invocation]
  // place the fingerprint object in the configured dedup tier pool
  object_locator_t oloc(soid);
  oloc.pool = pool.info.get_dedup_tier();
  // [elided: pg_t raw_pg declaration]
  get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
  hobject_t target(fp_oid, oloc.key, snapid_t(),
		   raw_pg.ps(), raw_pg.pool(),
		   // [elided: namespace argument, `);` and return target;]
// Completion callback for one refcount (set-chunk) op issued by
// start_dedup(). Records the per-offset result; once every chunk op
// has completed, either propagates the first failure to the client or
// commits the new chunk_map: drop references to the old chunks on
// commit, clear the dirty flag, and log a CLEAN entry.
// NOTE(review): lossy extract — early returns, some guards/arguments
// and closing braces were dropped; visible statements are verbatim.
int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
  if (p == manifest_ops.end()) {
    dout(10) << __func__ << " no manifest_op found" << dendl;
    // [elided: early return]
  ManifestOpRef mop = p->second;
  mop->results[offset] = r;
  // [elided: failure guard — presumably `if (r < 0)`; verify upstream]
  // if any failure occurs, put a mark on the results to recognize the failure
  mop->results[0] = r;
  // [elided: closing brace]
  if (mop->num_chunks != mop->results.size()) {
    // there are on-going works
    return -EINPROGRESS;
  // [elided: closing brace]
  ObjectContextRef obc = get_object_context(oid, false);
  // [elided: missing-obc guard]
  osd->reply_op_error(mop->op, -EINVAL);
  // [elided]
  ceph_assert(obc->is_blocked());
  // [elided: unblock call]
  kick_object_context_blocked(obc);
  if (mop->results[0] < 0) {
    // check if the previous op returns fail
    ceph_assert(mop->num_chunks == mop->results.size());
    manifest_ops.erase(oid);
    osd->reply_op_error(mop->op, mop->results[0]);
    // [elided: return]
  if (mop->chunks.size()) {
    OpContextUPtr ctx = simple_opc_create(obc);
    // order the manifest update against client writes via the obc lock
    if (ctx->lock_manager.get_lock_type(
	  // [elided: lock type / soid / obc arguments]
      dout(20) << __func__ << " took write lock" << dendl;
    } else if (mop->op) {
      dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
      close_op_ctx(ctx.release());
      // [elided: requeue/return and closing brace]
    ctx->at_version = get_next_version();
    ctx->new_obs = obc->obs;
    ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
    /*
     * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
     * head: [0, 2) aaa <-- tier_flush()
     * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
     *
     * In this case, if the new chunk_map is as follows,
     * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
     * we should drop aaa from head by using calc_refs_to_drop_on_removal().
     * So, the procedure is
     * 1. calc_refs_to_drop_on_removal()
     * 2. register old references to drop after tier_flush() is committed
     * 3. update new chunk_map
     */
    ObjectCleanRegions c_regions = ctx->clean_regions;
    ObjectContextRef cobc = get_prev_clone_obc(obc);
    c_regions.mark_fully_dirty();
    // CDC was done on entire range of manifest object,
    // so the first thing we should do here is to drop the reference to old chunks
    ObjectContextRef obc_l, obc_g;
    get_adjacent_clones(obc, obc_l, obc_g);
    // clear all old references
    object_ref_delta_t refs;
    ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
      obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
      obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
      // [elided: refs argument]
    if (!refs.is_empty()) {
      // old chunk refs are released only after the update commits
      ctx->register_on_commit(
	[oid, this, refs](){
	  dec_refcount(oid, refs);
	  // [elided: lambda close and closing braces]
    // set new references
    ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
    finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
    simple_opc_submit(std::move(ctx));
  // [elided: closing brace / reply guard — presumably `if (mop->op)`]
  osd->reply_op_error(mop->op, r);
  // [elided]
  manifest_ops.erase(oid);
10434 int PrimaryLogPG::start_flush(
10435 OpRequestRef op
, ObjectContextRef obc
,
10436 bool blocking
, hobject_t
*pmissing
,
10437 std::optional
<std::function
<void()>> &&on_flush
)
10439 const object_info_t
& oi
= obc
->obs
.oi
;
10440 const hobject_t
& soid
= oi
.soid
;
10441 dout(10) << __func__
<< " " << soid
10442 << " v" << oi
.version
10443 << " uv" << oi
.user_version
10444 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
10447 bool preoctopus_compat
=
10448 get_osdmap()->require_osd_release
< ceph_release_t::octopus
;
10450 if (preoctopus_compat
) {
10451 // for pre-octopus compatibility, filter SnapSet::snaps. not
10452 // certain we need this, but let's be conservative.
10453 snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
10455 // NOTE: change this to a const ref when we remove this compat code
10456 snapset
= obc
->ssc
->snapset
;
10459 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
10460 // current dedup tier only supports blocking operation
10462 return -EOPNOTSUPP
;
10466 // verify there are no (older) check for dirty clones
10468 dout(20) << " snapset " << snapset
<< dendl
;
10469 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
10470 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
10472 if (p
!= snapset
.clones
.rend()) {
10473 hobject_t next
= soid
;
10475 ceph_assert(next
.snap
< soid
.snap
);
10476 if (recovery_state
.get_pg_log().get_missing().is_missing(next
)) {
10477 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
10482 ObjectContextRef older_obc
= get_object_context(next
, false);
10484 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
10486 if (older_obc
->obs
.oi
.is_dirty()) {
10487 dout(10) << __func__
<< " next oldest clone is dirty: "
10488 << older_obc
->obs
.oi
<< dendl
;
10492 dout(20) << __func__
<< " next oldest clone " << next
10493 << " is not present; implicitly clean" << dendl
;
10496 dout(20) << __func__
<< " no older clones" << dendl
;
10501 obc
->start_block();
10503 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
10504 if (p
!= flush_ops
.end()) {
10505 FlushOpRef fop
= p
->second
;
10506 if (fop
->op
== op
) {
10507 // we couldn't take the write lock on a cache-try-flush before;
10508 // now we are trying again for the lock.
10509 return try_flush_mark_clean(fop
);
10511 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
10512 (fop
->blocking
|| !blocking
)) {
10513 // nonblocking can join anything
10514 // blocking can only join a blocking flush
10515 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
10517 fop
->dup_ops
.push_back(op
);
10518 return -EAGAIN
; // clean up this ctx; op will retry later
10521 // cancel current flush since it will fail anyway, or because we
10522 // are blocking and the existing flush is nonblocking.
10523 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
10525 osd
->reply_op_error(fop
->op
, -EBUSY
);
10526 while (!fop
->dup_ops
.empty()) {
10527 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
10528 fop
->dup_ops
.pop_front();
10530 vector
<ceph_tid_t
> tids
;
10531 cancel_flush(fop
, false, &tids
);
10532 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10535 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
10536 int r
= start_dedup(op
, obc
);
10537 if (r
!= -EINPROGRESS
) {
10545 * In general, we need to send a delete and a copyfrom.
10546 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10547 * where 4 is marked as clean. To flush 10, we have to:
10548 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10549 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10551 * There is a complicating case. Supposed there had been a clone 7
10552 * for snaps [7, 6] which has been trimmed since they no longer exist.
10553 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10554 * the delete, the snap will be promoted to 5, and the head will become
10555 * a whiteout. When the copy-from goes through, we'll end up with
10556 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10558 * Another complication is the case where there is an interval change
10559 * after doing the delete and the flush but before marking the object
10560 * clean. We'll happily delete head and then recreate it at the same
10561 * sequence number, which works out ok.
10564 SnapContext snapc
, dsnapc
;
10565 if (snapset
.seq
!= 0) {
10566 if (soid
.snap
== CEPH_NOSNAP
) {
10567 snapc
= snapset
.get_ssc_as_of(snapset
.seq
);
10569 snapid_t min_included_snap
;
10570 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
10571 ceph_assert(p
!= snapset
.clone_snaps
.end());
10572 min_included_snap
= p
->second
.back();
10573 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
10576 snapid_t prev_snapc
= 0;
10577 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
10578 citer
!= snapset
.clones
.rend();
10580 if (*citer
< soid
.snap
) {
10581 prev_snapc
= *citer
;
10586 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
10589 object_locator_t
base_oloc(soid
);
10590 base_oloc
.pool
= pool
.info
.tier_of
;
10592 if (dsnapc
.seq
< snapc
.seq
) {
10595 osd
->objecter
->mutate(
10600 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10601 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
10602 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
10603 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
10606 FlushOpRef
fop(std::make_shared
<FlushOp
>());
10608 fop
->flushed_version
= oi
.user_version
;
10609 fop
->blocking
= blocking
;
10610 fop
->on_flush
= std::move(on_flush
);
10614 if (oi
.is_whiteout()) {
10615 fop
->removal
= true;
10618 object_locator_t
oloc(soid
);
10619 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
10620 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
10621 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
10622 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
10623 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
10624 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
10626 //mean the base tier don't cache data after this
10627 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
10628 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
10630 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
10632 ceph_tid_t tid
= osd
->objecter
->mutate(
10633 soid
.oid
, base_oloc
, o
, snapc
,
10634 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10635 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
10636 new C_OnFinisher(fin
,
10637 osd
->get_objecter_finisher(get_pg_shard())));
10638 /* we're under the pg lock and fin->finish() is grabbing that */
10640 fop
->objecter_tid
= tid
;
10642 flush_ops
[soid
] = fop
;
10644 recovery_state
.update_stats(
10645 [&oi
](auto &history
, auto &stats
) {
10646 stats
.stats
.sum
.num_flush
++;
10647 stats
.stats
.sum
.num_flush_kb
+= shift_round_up(oi
.size
, 10);
10650 return -EINPROGRESS
;
10653 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
10655 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10656 << " " << cpp_strerror(r
) << dendl
;
10657 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
10658 if (p
== flush_ops
.end()) {
10659 dout(10) << __func__
<< " no flush_op found" << dendl
;
10662 FlushOpRef fop
= p
->second
;
10663 if (tid
!= fop
->objecter_tid
&& !fop
->obc
->obs
.oi
.has_manifest()) {
10664 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
10665 << " tid " << fop
->objecter_tid
<< dendl
;
10668 ObjectContextRef obc
= fop
->obc
;
10669 fop
->objecter_tid
= 0;
10671 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
10673 osd
->reply_op_error(fop
->op
, -EBUSY
);
10674 if (fop
->blocking
) {
10676 kick_object_context_blocked(obc
);
10679 if (!fop
->dup_ops
.empty()) {
10680 dout(20) << __func__
<< " requeueing dups" << dendl
;
10681 requeue_ops(fop
->dup_ops
);
10683 if (fop
->on_flush
) {
10684 (*(fop
->on_flush
))();
10685 fop
->on_flush
= std::nullopt
;
10687 flush_ops
.erase(oid
);
10691 r
= try_flush_mark_clean(fop
);
10692 if (r
== -EBUSY
&& fop
->op
) {
10693 osd
->reply_op_error(fop
->op
, r
);
10697 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
10699 ObjectContextRef obc
= fop
->obc
;
10700 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
10702 if (fop
->blocking
) {
10704 kick_object_context_blocked(obc
);
10707 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
10708 !obc
->obs
.exists
) {
10709 if (obc
->obs
.exists
)
10710 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
10711 << " != current " << obc
->obs
.oi
.user_version
10714 dout(10) << __func__
<< " object no longer exists" << dendl
;
10716 if (!fop
->dup_ops
.empty()) {
10717 dout(20) << __func__
<< " requeueing dups" << dendl
;
10718 requeue_ops(fop
->dup_ops
);
10720 if (fop
->on_flush
) {
10721 (*(fop
->on_flush
))();
10722 fop
->on_flush
= std::nullopt
;
10724 flush_ops
.erase(oid
);
10726 osd
->logger
->inc(l_osd_tier_flush_fail
);
10728 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10732 if (!fop
->blocking
&&
10733 m_scrubber
->write_blocked_by_scrub(oid
)) {
10735 dout(10) << __func__
<< " blocked by scrub" << dendl
;
10736 requeue_op(fop
->op
);
10737 requeue_ops(fop
->dup_ops
);
10738 return -EAGAIN
; // will retry
10740 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10741 vector
<ceph_tid_t
> tids
;
10742 cancel_flush(fop
, false, &tids
);
10743 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10748 // successfully flushed, can we evict this object?
10749 if (!obc
->obs
.oi
.has_manifest() && !fop
->op
&&
10750 agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
10751 agent_maybe_evict(obc
, true)) {
10752 osd
->logger
->inc(l_osd_tier_clean
);
10753 if (fop
->on_flush
) {
10754 (*(fop
->on_flush
))();
10755 fop
->on_flush
= std::nullopt
;
10757 flush_ops
.erase(oid
);
10761 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
10762 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
10764 // successfully flushed; can we clear the dirty bit?
10765 // try to take the lock manually, since we don't
10767 if (ctx
->lock_manager
.get_lock_type(
10772 dout(20) << __func__
<< " took write lock" << dendl
;
10773 } else if (fop
->op
) {
10774 dout(10) << __func__
<< " waiting on write lock " << fop
->op
<< " "
10775 << fop
->dup_ops
<< dendl
;
10776 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10777 for (auto op
: fop
->dup_ops
) {
10778 bool locked
= ctx
->lock_manager
.get_lock_type(
10783 ceph_assert(!locked
);
10785 close_op_ctx(ctx
.release());
10786 return -EAGAIN
; // will retry
10788 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
10789 close_op_ctx(ctx
.release());
10790 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10791 vector
<ceph_tid_t
> tids
;
10792 cancel_flush(fop
, false, &tids
);
10793 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10797 if (fop
->on_flush
) {
10798 ctx
->register_on_finish(*(fop
->on_flush
));
10799 fop
->on_flush
= std::nullopt
;
10802 ctx
->at_version
= get_next_version();
10804 ctx
->new_obs
= obc
->obs
;
10805 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
10806 --ctx
->delta_stats
.num_objects_dirty
;
10807 if (fop
->obc
->obs
.oi
.has_manifest()) {
10808 ceph_assert(obc
->obs
.oi
.manifest
.is_chunked());
10809 PGTransaction
* t
= ctx
->op_t
.get();
10810 uint64_t chunks_size
= 0;
10811 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10812 chunks_size
+= p
.second
.length
;
10814 if (ctx
->new_obs
.oi
.is_omap() && pool
.info
.supports_omap()) {
10815 t
->omap_clear(oid
);
10816 ctx
->new_obs
.oi
.clear_omap_digest();
10817 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
10818 ctx
->clean_regions
.mark_omap_dirty();
10820 if (obc
->obs
.oi
.size
== chunks_size
) {
10821 t
->truncate(oid
, 0);
10822 interval_set
<uint64_t> trim
;
10823 trim
.insert(0, ctx
->new_obs
.oi
.size
);
10824 ctx
->modified_ranges
.union_of(trim
);
10825 truncate_update_size_and_usage(ctx
->delta_stats
,
10828 ctx
->clean_regions
.mark_data_region_dirty(0, ctx
->new_obs
.oi
.size
);
10829 ctx
->new_obs
.oi
.new_object();
10830 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10831 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
10834 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10835 dout(20) << __func__
<< " offset: " << p
.second
.offset
10836 << " length: " << p
.second
.length
<< dendl
;
10837 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
); // CLEAN
10842 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
10844 osd
->logger
->inc(l_osd_tier_clean
);
10846 if (!fop
->dup_ops
.empty() || fop
->op
) {
10847 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
10848 list
<OpRequestRef
> ls
;
10850 ls
.push_back(fop
->op
);
10851 ls
.splice(ls
.end(), fop
->dup_ops
);
10855 simple_opc_submit(std::move(ctx
));
10857 flush_ops
.erase(oid
);
10860 osd
->logger
->inc(l_osd_tier_flush
);
10862 osd
->logger
->inc(l_osd_tier_try_flush
);
10864 return -EINPROGRESS
;
10867 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
,
10868 vector
<ceph_tid_t
> *tids
)
10870 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
10871 << fop
->objecter_tid
<< dendl
;
10872 if (fop
->objecter_tid
) {
10873 tids
->push_back(fop
->objecter_tid
);
10874 fop
->objecter_tid
= 0;
10876 if (fop
->io_tids
.size()) {
10877 for (auto &p
: fop
->io_tids
) {
10878 tids
->push_back(p
.second
);
10882 if (fop
->blocking
&& fop
->obc
->is_blocked()) {
10883 fop
->obc
->stop_block();
10884 kick_object_context_blocked(fop
->obc
);
10888 requeue_op(fop
->op
);
10889 requeue_ops(fop
->dup_ops
);
10891 if (fop
->on_flush
) {
10892 (*(fop
->on_flush
))();
10893 fop
->on_flush
= std::nullopt
;
10895 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
10898 void PrimaryLogPG::cancel_flush_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
10900 dout(10) << __func__
<< dendl
;
10901 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
10902 while (p
!= flush_ops
.end()) {
10903 cancel_flush((p
++)->second
, requeue
, tids
);
10907 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
10909 if (!pool
.info
.allow_incomplete_clones())
10911 if (is_missing_object(coid
))
10913 ObjectContextRef obc
= get_object_context(coid
, false);
10914 return obc
&& obc
->obs
.exists
;
10917 // ========================================================================
10920 class C_OSD_RepopCommit
: public Context
{
10921 PrimaryLogPGRef pg
;
10922 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
10924 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
10925 : pg(pg
), repop(repop
) {}
10926 void finish(int) override
{
10927 pg
->repop_all_committed(repop
.get());
10931 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
10933 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
10935 repop
->all_committed
= true;
10936 if (!repop
->rep_aborted
) {
10937 if (repop
->v
!= eversion_t()) {
10938 recovery_state
.complete_write(repop
->v
, repop
->pg_local_last_complete
);
10944 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
10946 dout(10) << "op_applied version " << applied_version
<< dendl
;
10947 ceph_assert(applied_version
!= eversion_t());
10948 ceph_assert(applied_version
<= info
.last_update
);
10949 recovery_state
.local_write_applied(applied_version
);
10951 if (is_primary() && m_scrubber
->should_requeue_blocked_ops(recovery_state
.get_last_update_applied())) {
10952 osd
->queue_scrub_applied_update(this, is_scrub_blocking_ops());
10956 void PrimaryLogPG::eval_repop(RepGather
*repop
)
10959 if (repop
->op
->osd_parent_span
) {
10960 auto eval_span
= jaeger_tracing::child_span(__func__
, repop
->op
->osd_parent_span
);
10963 dout(10) << "eval_repop " << *repop
10964 << (repop
->op
&& repop
->op
->get_req
<MOSDOp
>() ? "" : " (no op)") << dendl
;
10967 if (repop
->all_committed
) {
10968 dout(10) << " commit: " << *repop
<< dendl
;
10969 for (auto p
= repop
->on_committed
.begin();
10970 p
!= repop
->on_committed
.end();
10971 repop
->on_committed
.erase(p
++)) {
10974 // send dup commits, in order
10975 auto it
= waiting_for_ondisk
.find(repop
->v
);
10976 if (it
!= waiting_for_ondisk
.end()) {
10977 ceph_assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
10978 for (auto& i
: it
->second
) {
10979 int return_code
= repop
->r
;
10980 if (return_code
>= 0) {
10981 return_code
= std::get
<2>(i
);
10983 osd
->reply_op_error(std::get
<0>(i
), return_code
, repop
->v
,
10984 std::get
<1>(i
), std::get
<3>(i
));
10986 waiting_for_ondisk
.erase(it
);
10989 publish_stats_to_osd();
10991 dout(10) << " removing " << *repop
<< dendl
;
10992 ceph_assert(!repop_queue
.empty());
10993 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
10994 if (repop_queue
.front() == repop
) {
10995 RepGather
*to_remove
= nullptr;
10996 while (!repop_queue
.empty() &&
10997 (to_remove
= repop_queue
.front())->all_committed
) {
10998 repop_queue
.pop_front();
10999 for (auto p
= to_remove
->on_success
.begin();
11000 p
!= to_remove
->on_success
.end();
11001 to_remove
->on_success
.erase(p
++)) {
11004 remove_repop(to_remove
);
11010 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
11013 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
11014 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
11018 if (ctx
->op
->osd_parent_span
) {
11019 auto issue_repop_span
= jaeger_tracing::child_span(__func__
, ctx
->op
->osd_parent_span
);
11023 repop
->v
= ctx
->at_version
;
11025 ctx
->op_t
->add_obc(ctx
->obc
);
11026 if (ctx
->clone_obc
) {
11027 ctx
->op_t
->add_obc(ctx
->clone_obc
);
11029 if (ctx
->head_obc
) {
11030 ctx
->op_t
->add_obc(ctx
->head_obc
);
11033 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
11034 if (!(ctx
->log
.empty())) {
11035 ceph_assert(ctx
->at_version
>= projected_last_update
);
11036 projected_last_update
= ctx
->at_version
;
11038 for (auto &&entry
: ctx
->log
) {
11039 projected_log
.add(entry
);
11042 recovery_state
.pre_submit_op(
11046 pgbackend
->submit_transaction(
11050 std::move(ctx
->op_t
),
11051 recovery_state
.get_pg_trim_to(),
11052 recovery_state
.get_min_last_complete_ondisk(),
11053 std::move(ctx
->log
),
11054 ctx
->updated_hset_history
,
11061 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
11062 OpContext
*ctx
, ObjectContextRef obc
,
11063 ceph_tid_t rep_tid
)
11066 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
11068 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
11070 RepGather
*repop
= new RepGather(
11071 ctx
, rep_tid
, info
.last_complete
);
11073 repop
->start
= ceph_clock_now();
11075 repop_queue
.push_back(&repop
->queue_item
);
11078 osd
->logger
->inc(l_osd_op_wip
);
11080 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11084 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
11085 eversion_t version
,
11087 ObcLockManager
&&manager
,
11089 std::optional
<std::function
<void(void)> > &&on_complete
)
11091 RepGather
*repop
= new RepGather(
11092 std::move(manager
),
11094 std::move(on_complete
),
11096 info
.last_complete
,
11098 repop
->v
= version
;
11100 repop
->start
= ceph_clock_now();
11102 repop_queue
.push_back(&repop
->queue_item
);
11104 osd
->logger
->inc(l_osd_op_wip
);
11106 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11107 return boost::intrusive_ptr
<RepGather
>(repop
);
11110 void PrimaryLogPG::remove_repop(RepGather
*repop
)
11112 dout(20) << __func__
<< " " << *repop
<< dendl
;
11114 for (auto p
= repop
->on_finish
.begin();
11115 p
!= repop
->on_finish
.end();
11116 repop
->on_finish
.erase(p
++)) {
11120 release_object_locks(
11121 repop
->lock_manager
);
11124 osd
->logger
->dec(l_osd_op_wip
);
11127 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
11129 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
11130 ceph_tid_t rep_tid
= osd
->get_tid();
11131 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
11132 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
11133 ctx
->op_t
.reset(new PGTransaction());
11134 ctx
->mtime
= ceph_clock_now();
11138 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
11140 RepGather
*repop
= new_repop(ctx
.get(), ctx
->obc
, ctx
->reqid
.tid
);
11141 dout(20) << __func__
<< " " << repop
<< dendl
;
11142 issue_repop(repop
, ctx
.get());
11144 recovery_state
.update_trim_to();
11149 void PrimaryLogPG::submit_log_entries(
11150 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
11151 ObcLockManager
&&manager
,
11152 std::optional
<std::function
<void(void)> > &&_on_complete
,
11156 dout(10) << __func__
<< " " << entries
<< dendl
;
11157 ceph_assert(is_primary());
11159 eversion_t version
;
11160 if (!entries
.empty()) {
11161 ceph_assert(entries
.rbegin()->version
>= projected_last_update
);
11162 version
= projected_last_update
= entries
.rbegin()->version
;
11165 boost::intrusive_ptr
<RepGather
> repop
;
11166 std::optional
<std::function
<void(void)> > on_complete
;
11167 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11171 std::move(manager
),
11173 std::move(_on_complete
));
11175 on_complete
= std::move(_on_complete
);
11178 pgbackend
->call_write_ordered(
11179 [this, entries
, repop
, on_complete
]() {
11180 ObjectStore::Transaction t
;
11181 eversion_t old_last_update
= info
.last_update
;
11182 recovery_state
.merge_new_log_entries(
11183 entries
, t
, recovery_state
.get_pg_trim_to(),
11184 recovery_state
.get_min_last_complete_ondisk());
11186 set
<pg_shard_t
> waiting_on
;
11187 for (set
<pg_shard_t
>::const_iterator i
= get_acting_recovery_backfill().begin();
11188 i
!= get_acting_recovery_backfill().end();
11190 pg_shard_t
peer(*i
);
11191 if (peer
== pg_whoami
) continue;
11192 ceph_assert(recovery_state
.get_peer_missing().count(peer
));
11193 ceph_assert(recovery_state
.has_peer_info(peer
));
11194 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11195 ceph_assert(repop
);
11196 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
11198 spg_t(info
.pgid
.pgid
, i
->shard
),
11200 get_osdmap_epoch(),
11201 get_last_peering_reset(),
11203 recovery_state
.get_pg_trim_to(),
11204 recovery_state
.get_min_last_complete_ondisk());
11205 osd
->send_message_osd_cluster(
11206 peer
.osd
, m
, get_osdmap_epoch());
11207 waiting_on
.insert(peer
);
11209 MOSDPGLog
*m
= new MOSDPGLog(
11210 peer
.shard
, pg_whoami
.shard
,
11211 info
.last_update
.epoch
,
11212 info
, get_last_peering_reset());
11213 m
->log
.log
= entries
;
11214 m
->log
.tail
= old_last_update
;
11215 m
->log
.head
= info
.last_update
;
11216 osd
->send_message_osd_cluster(
11217 peer
.osd
, m
, get_osdmap_epoch());
11220 ceph_tid_t rep_tid
= repop
->rep_tid
;
11221 waiting_on
.insert(pg_whoami
);
11222 log_entry_update_waiting_on
.insert(
11225 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
11227 struct OnComplete
: public Context
{
11228 PrimaryLogPGRef pg
;
11229 ceph_tid_t rep_tid
;
11232 PrimaryLogPGRef pg
,
11233 ceph_tid_t rep_tid
,
11235 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
11236 void finish(int) override
{
11237 std::scoped_lock l
{*pg
};
11238 if (!pg
->pg_has_reset_since(epoch
)) {
11239 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
11240 ceph_assert(it
!= pg
->log_entry_update_waiting_on
.end());
11241 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
11242 ceph_assert(it2
!= it
->second
.waiting_on
.end());
11243 it
->second
.waiting_on
.erase(it2
);
11244 if (it
->second
.waiting_on
.empty()) {
11245 pg
->repop_all_committed(it
->second
.repop
.get());
11246 pg
->log_entry_update_waiting_on
.erase(it
);
11251 t
.register_on_commit(
11252 new OnComplete
{this, rep_tid
, get_osdmap_epoch()});
11253 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
11254 ceph_assert(r
== 0);
11255 op_applied(info
.last_update
);
11258 recovery_state
.update_trim_to();
11261 void PrimaryLogPG::cancel_log_updates()
11263 // get rid of all the LogUpdateCtx so their references to repops are
11265 log_entry_update_waiting_on
.clear();
11268 // -------------------------------------------------------
11270 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> *ls
)
11272 std::scoped_lock l
{*this};
11273 pair
<hobject_t
, ObjectContextRef
> i
;
11274 while (object_contexts
.get_next(i
.first
, &i
)) {
11275 ObjectContextRef
obc(i
.second
);
11276 get_obc_watchers(obc
, *ls
);
11280 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
11282 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11283 obc
->watchers
.begin();
11284 j
!= obc
->watchers
.end();
11286 obj_watch_item_t owi
;
11288 owi
.obj
= obc
->obs
.oi
.soid
;
11289 owi
.wi
.addr
= j
->second
->get_peer_addr();
11290 owi
.wi
.name
= j
->second
->get_entity();
11291 owi
.wi
.cookie
= j
->second
->get_cookie();
11292 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
11294 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
11295 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
11297 pg_watchers
.push_back(owi
);
11301 void PrimaryLogPG::check_blocklisted_watchers()
11303 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl
;
11304 pair
<hobject_t
, ObjectContextRef
> i
;
11305 while (object_contexts
.get_next(i
.first
, &i
))
11306 check_blocklisted_obc_watchers(i
.second
);
11309 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc
)
11311 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
11312 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
11313 obc
->watchers
.begin();
11314 k
!= obc
->watchers
.end();
11316 //Advance iterator now so handle_watch_timeout() can erase element
11317 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
11318 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
11319 entity_addr_t ea
= j
->second
->get_peer_addr();
11320 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
11321 if (get_osdmap()->is_blocklisted(ea
)) {
11322 dout(10) << "watch: Found blocklisted watcher for " << ea
<< dendl
;
11323 ceph_assert(j
->second
->get_pg() == this);
11324 j
->second
->unregister_cb();
11325 handle_watch_timeout(j
->second
);
11330 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
11332 ceph_assert(is_primary() && is_active());
11333 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(obc
->obs
.oi
.soid
);
11334 ceph_assert((recovering
.count(obc
->obs
.oi
.soid
) ||
11335 !is_missing_object(obc
->obs
.oi
.soid
)) ||
11336 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() && // or this is a revert... see recover_primary()
11337 it_objects
->second
->op
==
11338 pg_log_entry_t::LOST_REVERT
&&
11339 it_objects
->second
->reverting_to
==
11340 obc
->obs
.oi
.version
));
11342 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
11343 ceph_assert(obc
->watchers
.empty());
11344 // populate unconnected_watchers
11345 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
11346 obc
->obs
.oi
.watchers
.begin();
11347 p
!= obc
->obs
.oi
.watchers
.end();
11349 utime_t expire
= info
.stats
.last_became_active
;
11350 expire
+= p
->second
.timeout_seconds
;
11351 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
11353 Watch::makeWatchRef(
11354 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
11355 p
->first
.second
, p
->second
.addr
));
11356 watch
->disconnect();
11357 obc
->watchers
.insert(
11359 make_pair(p
->first
.first
, p
->first
.second
),
11362 // Look for watchers from blocklisted clients and drop
11363 check_blocklisted_obc_watchers(obc
);
11366 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
11368 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
11369 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
11371 if (!is_active()) {
11372 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
11375 if (!obc
->obs
.exists
) {
11376 dout(10) << __func__
<< " object " << obc
->obs
.oi
.soid
<< " dne" << dendl
;
11379 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
11380 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
11381 watch
->get_delayed_cb()
11383 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11384 << obc
->obs
.oi
.soid
11389 if (m_scrubber
->write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
11390 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11391 << obc
->obs
.oi
.soid
11393 m_scrubber
->add_callback(
11394 watch
->get_delayed_cb() // This callback!
11399 OpContextUPtr ctx
= simple_opc_create(obc
);
11400 ctx
->at_version
= get_next_version();
11402 object_info_t
& oi
= ctx
->new_obs
.oi
;
11403 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
11404 watch
->get_entity()));
11406 list
<watch_disconnect_t
> watch_disconnects
= {
11407 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
11409 ctx
->register_on_success(
11410 [this, obc
, watch_disconnects
]() {
11411 complete_disconnect_watches(obc
, watch_disconnects
);
11415 PGTransaction
*t
= ctx
->op_t
.get();
11416 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
11420 osd_reqid_t(), ctx
->mtime
, 0));
11422 oi
.prior_version
= obc
->obs
.oi
.version
;
11423 oi
.version
= ctx
->at_version
;
11425 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11426 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
11428 // apply new object state.
11429 ctx
->obc
->obs
= ctx
->new_obs
;
11431 // no ctx->delta_stats
11432 simple_opc_submit(std::move(ctx
));
11435 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
11436 SnapSetContext
*ssc
)
11438 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
11439 ceph_assert(obc
->destructor_callback
== NULL
);
11440 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11442 obc
->obs
.exists
= false;
11445 register_snapset_context(ssc
);
11446 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
11448 populate_obc_watchers(obc
);
11452 ObjectContextRef
PrimaryLogPG::get_object_context(
11453 const hobject_t
& soid
,
11455 const map
<string
, bufferlist
> *attrs
)
11457 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(soid
);
11459 attrs
|| !recovery_state
.get_pg_log().get_missing().is_missing(soid
) ||
11460 // or this is a revert... see recover_primary()
11461 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() &&
11462 it_objects
->second
->op
==
11463 pg_log_entry_t::LOST_REVERT
));
11464 ObjectContextRef obc
= object_contexts
.lookup(soid
);
11465 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
11467 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
11468 dout(10) << __func__
<< ": found obc in cache: " << obc
11471 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
11475 auto it_oi
= attrs
->find(OI_ATTR
);
11476 ceph_assert(it_oi
!= attrs
->end());
11477 bv
= it_oi
->second
;
11479 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
11482 dout(10) << __func__
<< ": no obc for soid "
11483 << soid
<< " and !can_create"
11485 return ObjectContextRef(); // -ENOENT!
11488 dout(10) << __func__
<< ": no obc for soid "
11489 << soid
<< " but can_create"
11492 object_info_t
oi(soid
);
11493 SnapSetContext
*ssc
= get_snapset_context(
11494 soid
, true, 0, false);
11496 obc
= create_object_context(oi
, ssc
);
11497 dout(10) << __func__
<< ": " << obc
<< " " << soid
11498 << " " << obc
->rwstate
11499 << " oi: " << obc
->obs
.oi
11500 << " ssc: " << obc
->ssc
11501 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11508 bufferlist::const_iterator bliter
= bv
.begin();
11509 decode(oi
, bliter
);
11511 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
11512 return ObjectContextRef(); // -ENOENT!
11515 ceph_assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
11517 obc
= object_contexts
.lookup_or_create(oi
.soid
);
11518 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11520 obc
->obs
.exists
= true;
11522 obc
->ssc
= get_snapset_context(
11524 soid
.has_snapset() ? attrs
: 0);
11526 if (is_primary() && is_active())
11527 populate_obc_watchers(obc
);
11529 if (pool
.info
.is_erasure()) {
11531 obc
->attr_cache
= *attrs
;
11533 int r
= pgbackend
->objects_get_attrs(
11536 ceph_assert(r
== 0);
11540 dout(10) << __func__
<< ": creating obc from disk: " << obc
11544 // XXX: Caller doesn't expect this
11545 if (obc
->ssc
== NULL
) {
11546 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
11547 return ObjectContextRef(); // -ENOENT!
11550 dout(10) << __func__
<< ": " << obc
<< " " << soid
11551 << " " << obc
->rwstate
11552 << " oi: " << obc
->obs
.oi
11553 << " exists: " << (int)obc
->obs
.exists
11554 << " ssc: " << obc
->ssc
11555 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11559 void PrimaryLogPG::context_registry_on_change()
11561 pair
<hobject_t
, ObjectContextRef
> i
;
11562 while (object_contexts
.get_next(i
.first
, &i
)) {
11563 ObjectContextRef
obc(i
.second
);
11565 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11566 obc
->watchers
.begin();
11567 j
!= obc
->watchers
.end();
11568 obc
->watchers
.erase(j
++)) {
11569 j
->second
->discard();
11577 * If we return an error, and set *pmissing, then promoting that
11580 * If we return -EAGAIN, we will always set *pmissing to the missing
11581 * object to wait for.
11583 * If we return an error but do not set *pmissing, then we know the
11584 * object does not exist.
11586 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
11587 ObjectContextRef
*pobc
,
11589 bool map_snapid_to_clone
,
11590 hobject_t
*pmissing
)
11593 ceph_assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
11595 if (oid
.snap
== CEPH_NOSNAP
) {
11596 ObjectContextRef obc
= get_object_context(oid
, can_create
);
11602 dout(10) << __func__
<< " " << oid
11603 << " @" << oid
.snap
11604 << " oi=" << obc
->obs
.oi
11613 hobject_t head
= oid
.get_head();
11614 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
11615 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
11616 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
11618 *pmissing
= head
; // start by getting the head
11620 put_snapset_context(ssc
);
11624 if (map_snapid_to_clone
) {
11625 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11626 << " snapset " << ssc
->snapset
11627 << " map_snapid_to_clone=true" << dendl
;
11628 if (oid
.snap
> ssc
->snapset
.seq
) {
11629 // already must be readable
11630 ObjectContextRef obc
= get_object_context(head
, false);
11631 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11632 << " snapset " << ssc
->snapset
11633 << " maps to head" << dendl
;
11635 put_snapset_context(ssc
);
11636 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
11638 vector
<snapid_t
>::const_iterator citer
= std::find(
11639 ssc
->snapset
.clones
.begin(),
11640 ssc
->snapset
.clones
.end(),
11642 if (citer
== ssc
->snapset
.clones
.end()) {
11643 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11644 << " snapset " << ssc
->snapset
11645 << " maps to nothing" << dendl
;
11646 put_snapset_context(ssc
);
11650 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11651 << " snapset " << ssc
->snapset
11652 << " maps to " << oid
<< dendl
;
11654 if (recovery_state
.get_pg_log().get_missing().is_missing(oid
)) {
11655 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11656 << " snapset " << ssc
->snapset
11657 << " " << oid
<< " is missing" << dendl
;
11660 put_snapset_context(ssc
);
11664 ObjectContextRef obc
= get_object_context(oid
, false);
11665 if (!obc
|| !obc
->obs
.exists
) {
11666 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11667 << " snapset " << ssc
->snapset
11668 << " " << oid
<< " is not present" << dendl
;
11671 put_snapset_context(ssc
);
11674 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11675 << " snapset " << ssc
->snapset
11676 << " " << oid
<< " HIT" << dendl
;
11678 put_snapset_context(ssc
);
11681 ceph_abort(); //unreachable
11684 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11685 << " snapset " << ssc
->snapset
<< dendl
;
11688 if (oid
.snap
> ssc
->snapset
.seq
) {
11689 ObjectContextRef obc
= get_object_context(head
, false);
11690 dout(10) << __func__
<< " " << head
11691 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
11692 << " -- HIT " << obc
->obs
11697 ceph_assert(ssc
== obc
->ssc
);
11698 put_snapset_context(ssc
);
11704 // which clone would it be?
11706 while (k
< ssc
->snapset
.clones
.size() &&
11707 ssc
->snapset
.clones
[k
] < oid
.snap
)
11709 if (k
== ssc
->snapset
.clones
.size()) {
11710 dout(10) << __func__
<< " no clones with last >= oid.snap "
11711 << oid
.snap
<< " -- DNE" << dendl
;
11712 put_snapset_context(ssc
);
11715 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
11716 info
.pgid
.pool(), oid
.get_namespace());
11718 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
11719 dout(20) << __func__
<< " " << soid
<< " missing, try again later"
11723 put_snapset_context(ssc
);
11727 ObjectContextRef obc
= get_object_context(soid
, false);
11728 if (!obc
|| !obc
->obs
.exists
) {
11731 put_snapset_context(ssc
);
11732 if (is_primary()) {
11733 if (is_degraded_or_backfilling_object(soid
)) {
11734 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
11736 } else if (is_degraded_on_async_recovery_target(soid
)) {
11737 dout(20) << __func__
<< " clone is recovering " << soid
<< dendl
;
11740 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
11744 dout(20) << __func__
<< " replica missing clone" << soid
<< dendl
;
11752 ceph_assert(obc
->ssc
== ssc
);
11753 put_snapset_context(ssc
);
11758 dout(20) << __func__
<< " " << soid
11759 << " snapset " << obc
->ssc
->snapset
11761 snapid_t first
, last
;
11762 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
11763 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
11764 if (p
->second
.empty()) {
11765 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
11766 ceph_assert(!cct
->_conf
->osd_debug_verify_snaps
);
11769 if (std::find(p
->second
.begin(), p
->second
.end(), oid
.snap
) ==
11771 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
11772 << " does not contain " << oid
.snap
<< " -- DNE" << dendl
;
11775 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), oid
.snap
)) {
11776 dout(20) << __func__
<< " " << soid
<< " snap " << oid
.snap
11777 << " in removed_snaps_queue" << " -- DNE" << dendl
;
11780 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
11781 << " contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
11786 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
11789 put_snapset_context(obc
->ssc
);
11792 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
11794 object_info_t
& oi
= obc
->obs
.oi
;
11796 dout(10) << __func__
<< " " << oi
.soid
<< dendl
;
11797 ceph_assert(!oi
.soid
.is_snapdir());
11799 object_stat_sum_t stat
;
11800 stat
.num_objects
++;
11802 stat
.num_objects_dirty
++;
11803 if (oi
.is_whiteout())
11804 stat
.num_whiteouts
++;
11806 stat
.num_objects_omap
++;
11807 if (oi
.is_cache_pinned())
11808 stat
.num_objects_pinned
++;
11809 if (oi
.has_manifest())
11810 stat
.num_objects_manifest
++;
11812 if (oi
.soid
.is_snap()) {
11813 stat
.num_object_clones
++;
11816 obc
->ssc
= get_snapset_context(oi
.soid
, false);
11817 ceph_assert(obc
->ssc
);
11818 stat
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(oi
.soid
.snap
);
11820 stat
.num_bytes
+= oi
.size
;
11824 pgstat
->stats
.sum
.add(stat
);
11827 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
11829 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
11830 if (obc
->is_blocked()) {
11831 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
11835 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
11836 if (p
!= waiting_for_blocked_object
.end()) {
11837 list
<OpRequestRef
>& ls
= p
->second
;
11838 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
11840 waiting_for_blocked_object
.erase(p
);
11843 map
<hobject_t
, ObjectContextRef
>::iterator i
=
11844 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
11845 if (i
!= objects_blocked_on_snap_promotion
.end()) {
11846 ceph_assert(i
->second
== obc
);
11847 objects_blocked_on_snap_promotion
.erase(i
);
11850 if (obc
->requeue_scrub_on_unblock
) {
11852 obc
->requeue_scrub_on_unblock
= false;
11854 dout(20) << __func__
<< " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl
;
11856 // only requeue if we are still active: we may be unblocking
11857 // because we are resetting for a new peering interval
11859 osd
->queue_scrub_unblocking(this, is_scrub_blocking_ops());
11864 SnapSetContext
*PrimaryLogPG::get_snapset_context(
11865 const hobject_t
& oid
,
11867 const map
<string
, bufferlist
> *attrs
,
11870 std::lock_guard
l(snapset_contexts_lock
);
11871 SnapSetContext
*ssc
;
11872 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
11873 oid
.get_snapdir());
11874 if (p
!= snapset_contexts
.end()) {
11875 if (can_create
|| p
->second
->exists
) {
11884 if (!(oid
.is_head() && !oid_existed
)) {
11885 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
11887 if (r
< 0 && !can_create
)
11890 auto it_ss
= attrs
->find(SS_ATTR
);
11891 ceph_assert(it_ss
!= attrs
->end());
11892 bv
= it_ss
->second
;
11894 ssc
= new SnapSetContext(oid
.get_snapdir());
11895 _register_snapset_context(ssc
);
11897 bufferlist::const_iterator bvp
= bv
.begin();
11899 ssc
->snapset
.decode(bvp
);
11900 } catch (const ceph::buffer::error
& e
) {
11901 dout(0) << __func__
<< " Can't decode snapset: " << e
.what() << dendl
;
11904 ssc
->exists
= true;
11906 ssc
->exists
= false;
11914 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
11916 std::lock_guard
l(snapset_contexts_lock
);
11918 if (ssc
->ref
== 0) {
11919 if (ssc
->registered
)
11920 snapset_contexts
.erase(ssc
->oid
);
11927 * NONE - didn't pull anything
11928 * YES - pulled what the caller wanted
11929 * HEAD - needed to pull head first
11931 enum { PULL_NONE
, PULL_HEAD
, PULL_YES
};
11933 int PrimaryLogPG::recover_missing(
11934 const hobject_t
&soid
, eversion_t v
,
11936 PGBackend::RecoveryHandle
*h
)
11938 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
11939 dout(7) << __func__
<< " " << soid
11941 << " but it is unfound" << dendl
;
11945 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
11946 start_recovery_op(soid
);
11947 ceph_assert(!recovering
.count(soid
));
11948 recovering
.insert(make_pair(soid
, ObjectContextRef()));
11949 epoch_t cur_epoch
= get_osdmap_epoch();
11950 remove_missing_object(soid
, v
, new LambdaContext(
11952 std::scoped_lock locker
{*this};
11953 if (!pg_has_reset_since(cur_epoch
)) {
11954 bool object_missing
= false;
11955 for (const auto& shard
: get_acting_recovery_backfill()) {
11956 if (shard
== pg_whoami
)
11958 if (recovery_state
.get_peer_missing(shard
).is_missing(soid
)) {
11959 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
11960 object_missing
= true;
11964 if (!object_missing
) {
11965 object_stat_sum_t stat_diff
;
11966 stat_diff
.num_objects_recovered
= 1;
11967 if (scrub_after_recovery
)
11968 stat_diff
.num_objects_repaired
= 1;
11969 on_global_recover(soid
, stat_diff
, true);
11971 auto recovery_handle
= pgbackend
->open_recovery_op();
11972 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
11973 pgbackend
->run_recovery_op(recovery_handle
, priority
);
11980 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
11981 ObjectContextRef obc
;
11982 ObjectContextRef head_obc
;
11983 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
11984 // do we have the head?
11985 hobject_t head
= soid
.get_head();
11986 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
11987 if (recovering
.count(head
)) {
11988 dout(10) << " missing but already recovering head " << head
<< dendl
;
11991 int r
= recover_missing(
11992 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
, priority
,
11994 if (r
!= PULL_NONE
)
11999 head_obc
= get_object_context(
12003 ceph_assert(head_obc
);
12005 start_recovery_op(soid
);
12006 ceph_assert(!recovering
.count(soid
));
12007 recovering
.insert(make_pair(soid
, obc
));
12008 int r
= pgbackend
->recover_object(
12014 // This is only a pull which shouldn't return an error
12015 ceph_assert(r
>= 0);
12019 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
12020 eversion_t v
, Context
*on_complete
)
12022 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
12023 ceph_assert(on_complete
!= nullptr);
12025 ObjectStore::Transaction t
;
12026 remove_snap_mapped_object(t
, soid
);
12028 ObjectRecoveryInfo recovery_info
;
12029 recovery_info
.soid
= soid
;
12030 recovery_info
.version
= v
;
12032 epoch_t cur_epoch
= get_osdmap_epoch();
12033 t
.register_on_complete(new LambdaContext(
12035 std::unique_lock locker
{*this};
12036 if (!pg_has_reset_since(cur_epoch
)) {
12037 ObjectStore::Transaction t2
;
12038 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
12039 t2
.register_on_complete(on_complete
);
12040 int r
= osd
->store
->queue_transaction(ch
, std::move(t2
), nullptr);
12041 ceph_assert(r
== 0);
12045 on_complete
->complete(-EAGAIN
);
12048 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
12049 ceph_assert(r
== 0);
12052 void PrimaryLogPG::finish_degraded_object(const hobject_t oid
)
12054 dout(10) << __func__
<< " " << oid
<< dendl
;
12055 if (callbacks_for_degraded_object
.count(oid
)) {
12056 list
<Context
*> contexts
;
12057 contexts
.swap(callbacks_for_degraded_object
[oid
]);
12058 callbacks_for_degraded_object
.erase(oid
);
12059 for (list
<Context
*>::iterator i
= contexts
.begin();
12060 i
!= contexts
.end();
12065 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
12067 if (i
!= objects_blocked_on_degraded_snap
.end() &&
12068 i
->second
== oid
.snap
)
12069 objects_blocked_on_degraded_snap
.erase(i
);
12072 void PrimaryLogPG::_committed_pushed_object(
12073 epoch_t epoch
, eversion_t last_complete
)
12075 std::scoped_lock locker
{*this};
12076 if (!pg_has_reset_since(epoch
)) {
12077 recovery_state
.recovery_committed_to(last_complete
);
12079 dout(10) << __func__
12080 << " pg has changed, not touching last_complete_ondisk" << dendl
;
12084 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
12086 dout(20) << __func__
<< dendl
;
12088 dout(20) << "obc = " << *obc
<< dendl
;
12090 ceph_assert(active_pushes
>= 1);
12093 // requeue an active chunky scrub waiting on recovery ops
12094 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12095 m_scrubber
->is_scrub_active()) {
12097 osd
->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12101 void PrimaryLogPG::_applied_recovered_object_replica()
12103 dout(20) << __func__
<< dendl
;
12104 ceph_assert(active_pushes
>= 1);
12107 // requeue an active scrub waiting on recovery ops
12108 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12109 m_scrubber
->is_scrub_active()) {
12111 osd
->queue_scrub_replica_pushes(this, m_scrubber
->replica_op_priority());
12115 void PrimaryLogPG::on_failed_pull(
12116 const set
<pg_shard_t
> &from
,
12117 const hobject_t
&soid
,
12118 const eversion_t
&v
)
12120 dout(20) << __func__
<< ": " << soid
<< dendl
;
12121 ceph_assert(recovering
.count(soid
));
12122 auto obc
= recovering
[soid
];
12124 list
<OpRequestRef
> blocked_ops
;
12125 obc
->drop_recovery_read(&blocked_ops
);
12126 requeue_ops(blocked_ops
);
12128 recovering
.erase(soid
);
12129 for (auto&& i
: from
) {
12130 if (i
!= pg_whoami
) { // we'll get it below in primary_error
12131 recovery_state
.force_object_missing(i
, soid
, v
);
12135 dout(0) << __func__
<< " " << soid
<< " from shard " << from
12136 << ", reps on " << recovery_state
.get_missing_loc().get_locations(soid
)
12137 << " unfound? " << recovery_state
.get_missing_loc().is_unfound(soid
)
12139 finish_recovery_op(soid
); // close out this attempt,
12140 finish_degraded_object(soid
);
12142 if (from
.count(pg_whoami
)) {
12143 dout(0) << " primary missing oid " << soid
<< " version " << v
<< dendl
;
12144 primary_error(soid
, v
);
12145 backfills_in_flight
.erase(soid
);
12149 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
12152 pg_missing_item pmi
;
12153 bool is_missing
= recovery_state
.get_pg_log().get_missing().is_missing(oid
, &pmi
);
12154 ceph_assert(is_missing
);
12156 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
12158 ceph_assert(!get_acting_recovery_backfill().empty());
12159 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
12160 i
!= get_acting_recovery_backfill().end();
12162 if (*i
== get_primary()) continue;
12163 pg_shard_t peer
= *i
;
12164 if (!recovery_state
.get_peer_missing(peer
).is_missing(oid
)) {
12167 eversion_t h
= recovery_state
.get_peer_missing(peer
).get_items().at(oid
).have
;
12168 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
12173 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
12177 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
12179 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
12181 ceph_assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
12182 ObjectStore::Transaction t
;
12183 std::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
12184 if (m
->pg_trim_to
!= eversion_t())
12185 op_trim_to
= m
->pg_trim_to
;
12186 if (m
->pg_roll_forward_to
!= eversion_t())
12187 op_roll_forward_to
= m
->pg_roll_forward_to
;
12189 dout(20) << __func__
12190 << " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
12192 recovery_state
.append_log_entries_update_missing(
12193 m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
12194 eversion_t new_lcod
= info
.last_complete
;
12196 Context
*complete
= new LambdaContext(
12198 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
12200 std::scoped_lock locker
{*this};
12201 if (!pg_has_reset_since(msg
->get_epoch())) {
12202 update_last_complete_ondisk(new_lcod
);
12203 MOSDPGUpdateLogMissingReply
*reply
=
12204 new MOSDPGUpdateLogMissingReply(
12205 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
12211 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
12212 msg
->get_connection()->send_message(reply
);
12216 if (get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
12217 t
.register_on_commit(complete
);
12219 /* Hack to work around the fact that ReplicatedBackend sends
12220 * ack+commit if commit happens first
12222 * This behavior is no longer necessary, but we preserve it so old
12223 * primaries can keep their repops in order */
12224 if (pool
.info
.is_erasure()) {
12225 t
.register_on_complete(complete
);
12227 t
.register_on_commit(complete
);
12230 int tr
= osd
->store
->queue_transaction(
12234 ceph_assert(tr
== 0);
12235 op_applied(info
.last_update
);
12238 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
12240 const MOSDPGUpdateLogMissingReply
*m
=
12241 static_cast<const MOSDPGUpdateLogMissingReply
*>(
12243 dout(20) << __func__
<< " got reply from "
12244 << m
->get_from() << dendl
;
12246 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
12247 if (it
!= log_entry_update_waiting_on
.end()) {
12248 if (it
->second
.waiting_on
.count(m
->get_from())) {
12249 it
->second
.waiting_on
.erase(m
->get_from());
12250 if (m
->last_complete_ondisk
!= eversion_t()) {
12251 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
12255 << info
.pgid
<< " got reply "
12256 << *m
<< " from shard we are not waiting for "
12260 if (it
->second
.waiting_on
.empty()) {
12261 repop_all_committed(it
->second
.repop
.get());
12262 log_entry_update_waiting_on
.erase(it
);
12266 << info
.pgid
<< " got reply "
12267 << *m
<< " on unknown tid " << m
->get_tid();
12271 /* Mark all unfound objects as lost.
12273 void PrimaryLogPG::mark_all_unfound_lost(
12275 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
12277 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
12278 list
<hobject_t
> oids
;
12280 dout(30) << __func__
<< ": log before:\n";
12281 recovery_state
.get_pg_log().get_log().print(*_dout
);
12284 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
12286 utime_t mtime
= ceph_clock_now();
12287 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
12288 recovery_state
.get_missing_loc().get_needs_recovery().begin();
12289 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
12290 recovery_state
.get_missing_loc().get_needs_recovery().end();
12292 ObcLockManager manager
;
12293 eversion_t v
= get_next_version();
12294 v
.epoch
= get_osdmap_epoch();
12295 uint64_t num_unfound
= recovery_state
.get_missing_loc().num_unfound();
12296 while (m
!= mend
) {
12297 const hobject_t
&oid(m
->first
);
12298 if (!recovery_state
.get_missing_loc().is_unfound(oid
)) {
12299 // We only care about unfound objects
12304 ObjectContextRef obc
;
12308 case pg_log_entry_t::LOST_MARK
:
12309 ceph_abort_msg("actually, not implemented yet!");
12312 case pg_log_entry_t::LOST_REVERT
:
12313 prev
= pick_newest_available(oid
);
12314 if (prev
> eversion_t()) {
12317 pg_log_entry_t::LOST_REVERT
, oid
, v
,
12318 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
12319 e
.reverting_to
= prev
;
12320 e
.mark_unrollbackable();
12321 log_entries
.push_back(e
);
12322 dout(10) << e
<< dendl
;
12324 // we are now missing the new version; recovery code will sort it out.
12330 case pg_log_entry_t::LOST_DELETE
:
12332 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
12333 0, osd_reqid_t(), mtime
, 0);
12334 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
12335 if (pool
.info
.require_rollback()) {
12336 e
.mod_desc
.try_rmobject(v
.version
);
12338 e
.mark_unrollbackable();
12340 } // otherwise, just do what we used to do
12341 dout(10) << e
<< dendl
;
12342 log_entries
.push_back(e
);
12343 oids
.push_back(oid
);
12345 // If context found mark object as deleted in case
12346 // of racing with new creation. This can happen if
12347 // object lost and EIO at primary.
12348 obc
= object_contexts
.lookup(oid
);
12350 obc
->obs
.exists
= false;
12362 recovery_state
.update_stats(
12363 [](auto &history
, auto &stats
) {
12364 stats
.stats_invalid
= true;
12368 submit_log_entries(
12370 std::move(manager
),
12371 std::optional
<std::function
<void(void)> >(
12372 [this, oids
, num_unfound
, on_finish
]() {
12373 if (recovery_state
.perform_deletes_during_peering()) {
12374 for (auto oid
: oids
) {
12375 // clear old locations - merge_new_log_entries will have
12376 // handled rebuilding missing_loc for each of these
12377 // objects if we have the RECOVERY_DELETES flag
12378 recovery_state
.object_recovered(oid
, object_stat_sum_t());
12382 if (is_recovery_unfound()) {
12383 queue_peering_event(
12385 std::make_shared
<PGPeeringEvent
>(
12386 get_osdmap_epoch(),
12387 get_osdmap_epoch(),
12388 PeeringState::DoRecovery())));
12389 } else if (is_backfill_unfound()) {
12390 queue_peering_event(
12392 std::make_shared
<PGPeeringEvent
>(
12393 get_osdmap_epoch(),
12394 get_osdmap_epoch(),
12395 PeeringState::RequestBackfill())));
12401 ss
<< "pg has " << num_unfound
12402 << " objects unfound and apparently lost marking";
12403 string rs
= ss
.str();
12404 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
12405 osd
->clog
->info() << rs
;
12407 on_finish(0, rs
, empty
);
12412 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
12414 ceph_assert(repop_queue
.empty());
12418 * pg status change notification
12421 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
12423 list
<OpRequestRef
> rq
;
12425 // apply all repops
12426 while (!repop_queue
.empty()) {
12427 RepGather
*repop
= repop_queue
.front();
12428 repop_queue
.pop_front();
12429 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
12430 repop
->rep_aborted
= true;
12431 repop
->on_committed
.clear();
12432 repop
->on_success
.clear();
12436 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
12437 rq
.push_back(repop
->op
);
12438 repop
->op
= OpRequestRef();
12441 // also requeue any dups, interleaved into position
12442 auto p
= waiting_for_ondisk
.find(repop
->v
);
12443 if (p
!= waiting_for_ondisk
.end()) {
12444 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
12445 for (auto& i
: p
->second
) {
12446 rq
.push_back(std::get
<0>(i
));
12448 waiting_for_ondisk
.erase(p
);
12452 remove_repop(repop
);
12455 ceph_assert(repop_queue
.empty());
12459 if (!waiting_for_ondisk
.empty()) {
12460 for (auto& i
: waiting_for_ondisk
) {
12461 for (auto& j
: i
.second
) {
12462 derr
<< __func__
<< ": op " << *(std::get
<0>(j
)->get_req())
12463 << " waiting on " << i
.first
<< dendl
;
12466 ceph_assert(waiting_for_ondisk
.empty());
12470 waiting_for_ondisk
.clear();
12473 void PrimaryLogPG::on_flushed()
12475 requeue_ops(waiting_for_flush
);
12476 if (!is_peered() || !is_primary()) {
12477 pair
<hobject_t
, ObjectContextRef
> i
;
12478 while (object_contexts
.get_next(i
.first
, &i
)) {
12479 derr
<< __func__
<< ": object " << i
.first
<< " obc still alive" << dendl
;
12481 ceph_assert(object_contexts
.empty());
12485 void PrimaryLogPG::on_removal(ObjectStore::Transaction
&t
)
12487 dout(10) << __func__
<< dendl
;
12491 t
.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12494 void PrimaryLogPG::clear_async_reads()
12496 dout(10) << __func__
<< dendl
;
12497 for(auto& i
: in_progress_async_reads
) {
12498 dout(10) << "clear ctx: "
12499 << "OpRequestRef " << i
.first
12500 << " OpContext " << i
.second
12502 close_op_ctx(i
.second
);
12506 void PrimaryLogPG::clear_cache()
12508 object_contexts
.clear();
12511 void PrimaryLogPG::on_shutdown()
12513 dout(10) << __func__
<< dendl
;
12515 if (recovery_queued
) {
12516 recovery_queued
= false;
12517 osd
->clear_queued_recovery(this);
12520 m_scrubber
->scrub_clear_state();
12522 m_scrubber
->unreg_next_scrub();
12524 vector
<ceph_tid_t
> tids
;
12525 cancel_copy_ops(false, &tids
);
12526 cancel_flush_ops(false, &tids
);
12527 cancel_proxy_ops(false, &tids
);
12528 cancel_manifest_ops(false, &tids
);
12529 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12531 apply_and_flush_repops(false);
12532 cancel_log_updates();
12533 // we must remove PGRefs, so do this this prior to release_backoffs() callers
12535 // clean up snap trim references
12536 snap_trimmer_machine
.process_event(Reset());
12538 pgbackend
->on_change();
12540 context_registry_on_change();
12541 object_contexts
.clear();
12543 clear_async_reads();
12545 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
12546 osd
->local_reserver
.cancel_reservation(info
.pgid
);
12548 clear_primary_state();
12551 if (is_primary()) {
12552 osd
->clear_ready_to_merge(this);
12556 void PrimaryLogPG::on_activate_complete()
12560 if (!recovery_state
.needs_flush()) {
12561 requeue_ops(waiting_for_peered
);
12562 } else if (!waiting_for_peered
.empty()) {
12563 dout(10) << __func__
<< " flushes in progress, moving "
12564 << waiting_for_peered
.size()
12565 << " items to waiting_for_flush"
12567 ceph_assert(waiting_for_flush
.empty());
12568 waiting_for_flush
.swap(waiting_for_peered
);
12573 if (needs_recovery()) {
12574 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
12575 queue_peering_event(
12577 std::make_shared
<PGPeeringEvent
>(
12578 get_osdmap_epoch(),
12579 get_osdmap_epoch(),
12580 PeeringState::DoRecovery())));
12581 } else if (needs_backfill()) {
12582 dout(10) << "activate queueing backfill" << dendl
;
12583 queue_peering_event(
12585 std::make_shared
<PGPeeringEvent
>(
12586 get_osdmap_epoch(),
12587 get_osdmap_epoch(),
12588 PeeringState::RequestBackfill())));
12590 dout(10) << "activate all replicas clean, no recovery" << dendl
;
12591 queue_peering_event(
12593 std::make_shared
<PGPeeringEvent
>(
12594 get_osdmap_epoch(),
12595 get_osdmap_epoch(),
12596 PeeringState::AllReplicasRecovered())));
12599 publish_stats_to_osd();
12601 if (get_backfill_targets().size()) {
12602 last_backfill_started
= recovery_state
.earliest_backfill();
12603 new_backfill
= true;
12604 ceph_assert(!last_backfill_started
.is_max());
12605 dout(5) << __func__
<< ": bft=" << get_backfill_targets()
12606 << " from " << last_backfill_started
<< dendl
;
12607 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12608 i
!= get_backfill_targets().end();
12610 dout(5) << "target shard " << *i
12611 << " from " << recovery_state
.get_peer_info(*i
).last_backfill
// on_change: reset all per-interval state when the PG's interval changes
// (e.g. on a new OSD map / peering restart).  Requeues every class of
// waiting client op, cancels in-flight copy/flush/proxy/manifest ops,
// clears scrub and snap-trimmer state, and drops cached object contexts.
// NOTE(review): this span was mangled by extraction; some original lines
// (braces/blank lines and possibly code) are missing — verify against the
// upstream file before relying on exact control flow.
// @param t  transaction used by the backend/scrubber cleanup hooks below.
12620 void PrimaryLogPG::on_change(ObjectStore::Transaction
&t
)
12622 dout(10) << __func__
<< dendl
;
// An empty HitSet carries no information across the interval; drop it.
12624 if (hit_set
&& hit_set
->insert_count() == 0) {
12625 dout(20) << " discarding empty hit_set" << dendl
;
// Clear any pending recovery scheduling so the new interval starts clean.
12629 if (recovery_queued
) {
12630 recovery_queued
= false;
12631 osd
->clear_queued_recovery(this);
12634 // requeue everything in the reverse order they should be
12636 requeue_ops(waiting_for_peered
);
12637 requeue_ops(waiting_for_flush
);
12638 requeue_ops(waiting_for_active
);
12639 requeue_ops(waiting_for_readable
);
// Cancel all in-flight tiering/proxy operations; collected tids are then
// cancelled at the Objecter with -ECANCELED.
12641 vector
<ceph_tid_t
> tids
;
12642 cancel_copy_ops(is_primary(), &tids
);
12643 cancel_flush_ops(is_primary(), &tids
);
12644 cancel_proxy_ops(is_primary(), &tids
);
12645 cancel_manifest_ops(is_primary(), &tids
);
12646 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12648 // requeue object waiters
12649 for (auto& p
: waiting_for_unreadable_object
) {
12650 release_backoffs(p
.first
);
// Only a primary requeues the waiters; a replica just drops them.
12652 if (is_primary()) {
12653 requeue_object_waiters(waiting_for_unreadable_object
);
12655 waiting_for_unreadable_object
.clear();
// Drain the degraded-object waiters map, erasing each entry as we go
// (erase(p++) keeps the iterator valid across removal).
12657 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
12658 p
!= waiting_for_degraded_object
.end();
12659 waiting_for_degraded_object
.erase(p
++)) {
12660 release_backoffs(p
->first
);
12662 requeue_ops(p
->second
);
12665 finish_degraded_object(p
->first
);
12668 // requeues waiting_for_scrub
12669 m_scrubber
->scrub_clear_state();
12671 for (auto p
= waiting_for_blocked_object
.begin();
12672 p
!= waiting_for_blocked_object
.end();
12673 waiting_for_blocked_object
.erase(p
++)) {
12675 requeue_ops(p
->second
);
// Fire (and thereby drain) all registered degraded-object callbacks.
12679 for (auto i
= callbacks_for_degraded_object
.begin();
12680 i
!= callbacks_for_degraded_object
.end();
12682 finish_degraded_object((i
++)->first
);
12684 ceph_assert(callbacks_for_degraded_object
.empty());
12686 if (is_primary()) {
12687 requeue_ops(waiting_for_cache_not_full
);
12689 waiting_for_cache_not_full
.clear();
12691 objects_blocked_on_cache_full
.clear();
// Abort async reads still in progress: close their op contexts and put
// the originating ops back on the queue.
12693 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
12694 in_progress_async_reads
.begin();
12695 i
!= in_progress_async_reads
.end();
12696 in_progress_async_reads
.erase(i
++)) {
12697 close_op_ctx(i
->second
);
12699 requeue_op(i
->first
);
12702 // this will requeue ops we were working on but didn't finish, and
12704 apply_and_flush_repops(is_primary());
12705 cancel_log_updates();
12707 // do this *after* apply_and_flush_repops so that we catch any newly
12708 // registered watches.
12709 context_registry_on_change();
// Let the backend and the scrubber persist/clean their state via t.
12711 pgbackend
->on_change_cleanup(&t
);
12712 m_scrubber
->cleanup_store(&t
);
12713 pgbackend
->on_change();
12715 // clear snap_trimmer state
12716 snap_trimmer_machine
.process_event(Reset());
12718 debug_op_order
.clear();
12719 unstable_stats
.clear();
12721 // we don't want to cache object_contexts through the interval change
12722 // NOTE: we actually assert that all currently live references are dead
12723 // by the time the flush for the next interval completes.
12724 object_contexts
.clear();
12726 // should have been cleared above by finishing all of the degraded objects
12727 ceph_assert(objects_blocked_on_degraded_snap
.empty());
// plpg_on_role_change: PG-role transition hook.  If this OSD is no longer
// the primary (role != 0) any in-memory hit set is discarded — hit sets
// are a primary-only concern.  NOTE(review): the tail of this function
// (the actual clear) was dropped by extraction; confirm against upstream.
12730 void PrimaryLogPG::plpg_on_role_change()
12732 dout(10) << __func__
<< dendl
;
12733 if (get_role() != 0 && hit_set
) {
12734 dout(10) << " clearing hit set" << dendl
;
// plpg_on_pool_change: react to pool property changes.  If the pool's
// cache mode moved away from WRITEBACK, ops parked waiting for the cache
// to drain below "full" would otherwise wait forever, so requeue them.
12739 void PrimaryLogPG::plpg_on_pool_change()
12741 dout(10) << __func__
<< dendl
;
12742 // requeue cache full waiters just in case the cache_mode is
12743 // changing away from writeback mode. note that if we are not
12744 // active the normal requeuing machinery is sufficient (and properly
12747 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12748 !waiting_for_cache_not_full
.empty()) {
12749 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
12751 requeue_ops(waiting_for_cache_not_full
);
// Objects blocked on cache-full are no longer blocked in non-writeback modes.
12752 objects_blocked_on_cache_full
.clear();
12758 // clear state. called on recovery completion AND cancellation.
// Resets backfill bookkeeping (last_backfill_started, backfills_in_flight,
// pending_backfill_updates), drops per-object recovery read locks while
// requeueing the ops they blocked, and tells the backend to clear its own
// recovery state.  Must leave recovering/backfills_in_flight empty.
12759 void PrimaryLogPG::_clear_recovery_state()
12761 #ifdef DEBUG_RECOVERY_OIDS
12762 recovering_oids
.clear();
12764 dout(15) << __func__
<< " flags: " << m_planned_scrub
<< dendl
;
12766 last_backfill_started
= hobject_t();
// Drain backfills_in_flight with erase(i++) so iteration stays valid.
12767 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
12768 while (i
!= backfills_in_flight
.end()) {
12769 backfills_in_flight
.erase(i
++);
12772 list
<OpRequestRef
> blocked_ops
;
// For every object still marked recovering: release its recovery read
// lock (collecting any ops it was blocking) and requeue those ops.
12773 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
12774 i
!= recovering
.end();
12775 recovering
.erase(i
++)) {
12777 i
->second
->drop_recovery_read(&blocked_ops
);
12778 requeue_ops(blocked_ops
);
// Post-conditions: nothing in flight, nothing recovering.
12781 ceph_assert(backfills_in_flight
.empty());
12782 pending_backfill_updates
.clear();
12783 ceph_assert(recovering
.empty());
12784 pgbackend
->clear_recovery_state();
// cancel_pull: abort an in-progress pull of `soid`.  Drops the object's
// recovery read lock (requeueing blocked ops), removes it from the
// recovering map, finishes the recovery op, releases backoffs, and kicks
// any degraded/unreadable waiters parked on this object.  If the object
// is still missing locally, last_requested is reset so recover_primary()
// will rescan from the start.
// @param soid  object whose pull is being cancelled; must be in `recovering`.
12787 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
12789 dout(20) << __func__
<< ": " << soid
<< dendl
;
12790 ceph_assert(recovering
.count(soid
));
12791 ObjectContextRef obc
= recovering
[soid
];
12793 list
<OpRequestRef
> blocked_ops
;
12794 obc
->drop_recovery_read(&blocked_ops
);
12795 requeue_ops(blocked_ops
);
12797 recovering
.erase(soid
);
12798 finish_recovery_op(soid
);
12799 release_backoffs(soid
);
// Wake client ops that were waiting for this object to stop being degraded.
12800 if (waiting_for_degraded_object
.count(soid
)) {
12801 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
12802 requeue_ops(waiting_for_degraded_object
[soid
]);
12803 waiting_for_degraded_object
.erase(soid
);
// Same for ops waiting on local readability of this object.
12805 if (waiting_for_unreadable_object
.count(soid
)) {
12806 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
12807 requeue_ops(waiting_for_unreadable_object
[soid
]);
12808 waiting_for_unreadable_object
.erase(soid
);
// Still missing here: force the recovery scan to revisit it from scratch.
12810 if (is_missing_object(soid
))
12811 recovery_state
.set_last_requested(0);
12812 finish_degraded_object(soid
);
12815 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
12817 pgbackend
->check_recovery_sources(osdmap
);
// start_recovery_ops: kick off up to `max` recovery operations for this
// interval.  Recovers the primary's own missing objects first, then the
// replicas', then (when eligible and not deferred by the NOBACKFILL /
// NOREBALANCE osdmap flags) backfill.  On full completion it queues the
// appropriate peering event (RequestBackfill / AllReplicasRecovered /
// Backfilled).  NOTE(review): extraction dropped a number of lines here
// (e.g. part of the NOREBALANCE condition after 12879 and the `max`
// parameter declaration); consult upstream for exact control flow.
// @param handle       thread-pool handle, used to reset the op timeout
// @param ops_started  out: number of recovery ops started this call
// @return per the visible early-return, have_unfound() when raced/queued
//         twice; final return lines are outside this extraction.
12820 bool PrimaryLogPG::start_recovery_ops(
12822 ThreadPool::TPHandle
&handle
,
12823 uint64_t *ops_started
)
12825 uint64_t& started
= *ops_started
;
12827 bool work_in_progress
= false;
12828 bool recovery_started
= false;
12829 ceph_assert(is_primary());
12830 ceph_assert(is_peered());
12831 ceph_assert(!recovery_state
.is_deleting());
12833 ceph_assert(recovery_queued
);
12834 recovery_queued
= false;
// Guard against being queued twice: if neither recovery nor backfill is
// actually in progress any more, bail out.
12836 if (!state_test(PG_STATE_RECOVERING
) &&
12837 !state_test(PG_STATE_BACKFILLING
)) {
12838 /* TODO: I think this case is broken and will make do_recovery()
12839 * unhappy since we're returning false */
12840 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
12841 return have_unfound();
12844 const auto &missing
= recovery_state
.get_pg_log().get_missing();
// Snapshot the unfound count so we can detect progress below.
12846 uint64_t num_unfound
= get_num_unfound();
12848 if (!recovery_state
.have_missing()) {
12849 recovery_state
.local_recovery_complete();
// Primary-complete (or all-unfound): work on replicas; otherwise pull our
// own missing objects first.
12852 if (!missing
.have_missing() || // Primary does not have missing
12853 // or all of the missing objects are unfound.
12854 recovery_state
.all_missing_unfound()) {
12855 // Recover the replicas.
12856 started
= recover_replicas(max
, handle
, &recovery_started
);
12859 // We still have missing objects that we should grab from replicas.
12860 started
+= recover_primary(max
, handle
);
// If nothing started but the unfound count changed, objects became
// locatable — give replica recovery a second chance this round.
12862 if (!started
&& num_unfound
!= get_num_unfound()) {
12863 // second chance to recovery replicas
12864 started
= recover_replicas(max
, handle
, &recovery_started
);
12867 if (started
|| recovery_started
)
12868 work_in_progress
= true;
12870 bool deferred_backfill
= false;
// Backfill only runs once log-based recovery is idle: nothing recovering,
// state is BACKFILLING, we have targets, budget remains, and no scans are
// outstanding.
12871 if (recovering
.empty() &&
12872 state_test(PG_STATE_BACKFILLING
) &&
12873 !get_backfill_targets().empty() && started
< max
&&
12874 missing
.num_missing() == 0 &&
12875 waiting_on_backfill
.empty()) {
12876 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
12877 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
12878 deferred_backfill
= true;
12879 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
12881 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
12882 deferred_backfill
= true;
12883 } else if (!recovery_state
.is_backfill_reserved()) {
12884 /* DNMNOTE I think this branch is dead */
12885 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
12886 if (!backfill_reserving
) {
12887 dout(10) << "queueing RequestBackfill" << dendl
;
12888 backfill_reserving
= true;
12889 queue_peering_event(
12891 std::make_shared
<PGPeeringEvent
>(
12892 get_osdmap_epoch(),
12893 get_osdmap_epoch(),
12894 PeeringState::RequestBackfill())));
12896 deferred_backfill
= true;
// Reservations held and no deferral: spend the remaining budget on backfill.
12898 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
12902 dout(10) << " started " << started
<< dendl
;
12903 osd
->logger
->inc(l_osd_rop
, started
);
// Still busy (or deliberately deferred): report back without declaring
// recovery finished.
12905 if (!recovering
.empty() ||
12906 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
12907 return !work_in_progress
&& have_unfound();
12909 ceph_assert(recovering
.empty());
12910 ceph_assert(recovery_ops_active
== 0);
12912 dout(10) << __func__
<< " needs_recovery: "
12913 << recovery_state
.get_missing_loc().get_needs_recovery()
12915 dout(10) << __func__
<< " missing_loc: "
12916 << recovery_state
.get_missing_loc().get_missing_locs()
12918 int unfound
= get_num_unfound();
12920 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
// Sanity checks: reaching "done" with anything still missing is a bug and
// is surfaced loudly in the cluster log.
12924 if (missing
.num_missing() > 0) {
12925 // this shouldn't happen!
12926 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
12927 << missing
.num_missing() << ": " << missing
.get_items();
12931 if (needs_recovery()) {
12932 // this shouldn't happen!
12933 // We already checked num_missing() so we must have missing replicas
12934 osd
->clog
->error() << info
.pgid
12935 << " Unexpected Error: recovery ending with missing replicas";
// All done: clear state flags and queue the matching peering event.
12939 if (state_test(PG_STATE_RECOVERING
)) {
12940 state_clear(PG_STATE_RECOVERING
);
12941 state_clear(PG_STATE_FORCED_RECOVERY
);
12942 if (needs_backfill()) {
12943 dout(10) << "recovery done, queuing backfill" << dendl
;
12944 queue_peering_event(
12946 std::make_shared
<PGPeeringEvent
>(
12947 get_osdmap_epoch(),
12948 get_osdmap_epoch(),
12949 PeeringState::RequestBackfill())));
12951 dout(10) << "recovery done, no backfill" << dendl
;
12952 state_clear(PG_STATE_FORCED_BACKFILL
);
12953 queue_peering_event(
12955 std::make_shared
<PGPeeringEvent
>(
12956 get_osdmap_epoch(),
12957 get_osdmap_epoch(),
12958 PeeringState::AllReplicasRecovered())));
12960 } else { // backfilling
12961 state_clear(PG_STATE_BACKFILLING
);
12962 state_clear(PG_STATE_FORCED_BACKFILL
);
12963 state_clear(PG_STATE_FORCED_RECOVERY
);
12964 dout(10) << "recovery done, backfill done" << dendl
;
12965 queue_peering_event(
12967 std::make_shared
<PGPeeringEvent
>(
12968 get_osdmap_epoch(),
12969 get_osdmap_epoch(),
12970 PeeringState::Backfilled())));
12977 * do one recovery op.
12978 * return true if done, false if nothing left to do.
// recover_primary: pull objects missing on the primary itself, walking the
// log's reverse-missing index from last_requested.  Handles the
// LOST_REVERT special case: if we already hold the reverting_to version
// the revert is done locally via an ObjectStore transaction; otherwise the
// prior version is pulled from peers that hold it.  NOTE(review): several
// interior lines are missing from this extraction (e.g. the recover_got
// argument list between 13059 and 13067, and the CLONE-case fall-through);
// consult upstream before trusting exact behavior.
// @param max     budget of recovery ops to start
// @param handle  thread-pool handle for timeout resets
// @return number of ops started (declared `started` below)
12980 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
12982 ceph_assert(is_primary());
12984 const auto &missing
= recovery_state
.get_pg_log().get_missing();
12986 dout(10) << __func__
<< " recovering " << recovering
.size()
12988 << " missing " << missing
<< dendl
;
12990 dout(25) << __func__
<< " " << missing
.get_items() << dendl
;
12993 pg_log_entry_t
*latest
= 0;
12994 unsigned started
= 0;
12997 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
// Resume the scan of missing objects (keyed by version) where the
// previous call left off, per last_requested.
12998 map
<version_t
, hobject_t
>::const_iterator p
=
12999 missing
.get_rmissing().lower_bound(recovery_state
.get_pg_log().get_log().last_requested
);
13000 while (p
!= missing
.get_rmissing().end()) {
13001 handle
.reset_tp_timeout();
13003 version_t v
= p
->first
;
// If the log still has an entry for this object, use it to learn the
// authoritative soid and the op type (update/delete/revert).
13005 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(p
->second
);
13006 if (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end()) {
13007 latest
= it_objects
->second
;
13008 ceph_assert(latest
->is_update() || latest
->is_delete());
13009 soid
= latest
->soid
;
13014 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
13017 hobject_t head
= soid
.get_head();
13019 eversion_t need
= item
.need
;
13021 dout(10) << __func__
<< " "
13022 << soid
<< " " << item
.need
13023 << (missing
.is_missing(soid
) ? " (missing)":"")
13024 << (missing
.is_missing(head
) ? " (missing head)":"")
13025 << (recovering
.count(soid
) ? " (recovering)":"")
13026 << (recovering
.count(head
) ? " (recovering head)":"")
13030 switch (latest
->op
) {
13031 case pg_log_entry_t::CLONE
:
13033 * Handling for this special case removed for now, until we
13034 * can correctly construct an accurate SnapSet from the old
13039 case pg_log_entry_t::LOST_REVERT
:
// Case 1: we already have the version being reverted to — finish the
// revert locally by rewriting the object_info version on disk.
13041 if (item
.have
== latest
->reverting_to
) {
13042 ObjectContextRef obc
= get_object_context(soid
, true);
13044 if (obc
->obs
.oi
.version
== latest
->version
) {
13045 // I'm already reverting
13046 dout(10) << " already reverting " << soid
<< dendl
;
13048 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
13049 obc
->obs
.oi
.version
= latest
->version
;
13051 ObjectStore::Transaction t
;
13053 obc
->obs
.oi
.encode(
13055 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
// Rollback-capable (EC) pools must not take this in-place path.
13056 ceph_assert(!pool
.info
.require_rollback());
13057 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
13059 recovery_state
.recover_got(
13067 t
.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
13068 t
.register_on_commit(new C_OSD_CommittedPushedObject(
13070 get_osdmap_epoch(),
13071 info
.last_complete
));
13072 osd
->store
->queue_transaction(ch
, std::move(t
));
13077 * Pull the old version of the object. Update missing_loc here to have the location
13078 * of the version we want.
13080 * This doesn't use the usual missing_loc paths, but that's okay:
13081 * - if we have it locally, we hit the case above, and go from there.
13082 * - if we don't, we always pass through this case during recovery and set up the location
13084 * - this way we don't need to mangle the missing code to be general about needing an old
// Case 2: find peers whose `have` equals reverting_to; they can serve the
// old version we need to pull.
13087 eversion_t alternate_need
= latest
->reverting_to
;
13088 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
13090 set
<pg_shard_t
> good_peers
;
13091 for (auto p
= recovery_state
.get_peer_missing().begin();
13092 p
!= recovery_state
.get_peer_missing().end();
13094 if (p
->second
.is_missing(soid
, need
) &&
13095 p
->second
.get_items().at(soid
).have
== alternate_need
) {
13096 good_peers
.insert(p
->first
);
13099 recovery_state
.set_revert_with_targets(
13102 dout(10) << " will pull " << alternate_need
<< " or " << need
13104 << recovery_state
.get_missing_loc().get_locations(soid
)
// Common path: start the pull unless this object (or its head) is
// already being recovered.
13112 if (!recovering
.count(soid
)) {
13113 if (recovering
.count(head
)) {
13116 int r
= recover_missing(
13117 soid
, need
, get_recovery_op_priority(), h
);
13130 if (started
>= max
)
13135 // only advance last_requested if we haven't skipped anything
13137 recovery_state
.set_last_requested(v
);
13140 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
// primary_error: record that the primary's copy of `soid` at version `v`
// is bad/missing.  Marks the object missing for pg_whoami in recovery
// state, then logs a cluster-log error — flagging "unfound" when no other
// copy is known, otherwise listing the replica locations that will be
// tried.  NOTE(review): braces/return were dropped by extraction; the
// visible `uhoh` flag presumably selects between the two error messages —
// confirm against upstream.
// @param soid  object with the failed primary copy
// @param v     version that was expected
13144 bool PrimaryLogPG::primary_error(
13145 const hobject_t
& soid
, eversion_t v
)
13147 recovery_state
.force_object_missing(pg_whoami
, soid
, v
);
13148 bool uhoh
= recovery_state
.get_missing_loc().is_unfound(soid
);
13150 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13151 << soid
<< ", unfound";
13153 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13155 << ", will try copies on "
13156 << recovery_state
.get_missing_loc().get_locations(soid
);
// prep_object_replica_deletes: queue deletion of `soid` on replicas that
// logged it as deleted.  Takes the object's recovery read lock first; if
// the lock is unavailable the op is deferred (work_started set so the
// caller knows progress is possible later).  The object is registered in
// `recovering` (with or without an obc — both inserts are visible here,
// presumably on different branches; extraction dropped the conditional)
// before handing the delete to the backend.
// @param soid          object to delete on replicas
// @param v             version at which the delete applies
// @param h             backend recovery handle to batch the op into
// @param work_started  out: set when lock contention deferred the op
13160 int PrimaryLogPG::prep_object_replica_deletes(
13161 const hobject_t
& soid
, eversion_t v
,
13162 PGBackend::RecoveryHandle
*h
,
13163 bool *work_started
)
13165 ceph_assert(is_primary());
13166 dout(10) << __func__
<< ": on " << soid
<< dendl
;
13168 ObjectContextRef obc
= get_object_context(soid
, false);
// Cannot touch the object while client ops hold the rw manager lock.
13170 if (!obc
->get_recovery_read()) {
13171 dout(20) << "replica delete delayed on " << soid
13172 << "; could not get rw_manager lock" << dendl
;
13173 *work_started
= true;
13176 dout(20) << "replica delete got recovery read lock on " << soid
13181 start_recovery_op(soid
);
13182 ceph_assert(!recovering
.count(soid
));
13184 recovering
.insert(make_pair(soid
, ObjectContextRef()));
13186 recovering
.insert(make_pair(soid
, obc
));
13188 pgbackend
->recover_delete_object(soid
, v
, h
);
// prep_object_replica_pushes: prepare a push of `soid` at version `v` to
// replicas that are missing it.  For a clone, the head must be recovered
// first (recursing via recover_missing when the head is itself missing).
// Acquires the recovery read lock, registers the object in `recovering`,
// and asks the backend to recover it; a backend error is reported via
// on_failed_pull.  NOTE(review): extraction dropped interior lines (early
// returns, part of the recover_object argument list) — verify upstream.
// @param soid          object to push
// @param v             needed version
// @param h             backend recovery handle to batch into
// @param work_started  out: set when lock contention deferred the op
13192 int PrimaryLogPG::prep_object_replica_pushes(
13193 const hobject_t
& soid
, eversion_t v
,
13194 PGBackend::RecoveryHandle
*h
,
13195 bool *work_started
)
13197 ceph_assert(is_primary());
13198 dout(10) << __func__
<< ": on " << soid
<< dendl
;
// Clones (snap set, below CEPH_NOSNAP) need their head object available
// before they can be pushed.
13200 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
13201 // do we have the head and/or snapdir?
13202 hobject_t head
= soid
.get_head();
13203 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
13204 if (recovering
.count(head
)) {
13205 dout(10) << " missing but already recovering head " << head
<< dendl
;
13208 int r
= recover_missing(
13209 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
,
13210 get_recovery_op_priority(), h
);
13211 if (r
!= PULL_NONE
)
13218 // NOTE: we know we will get a valid oloc off of disk here.
13219 ObjectContextRef obc
= get_object_context(soid
, false);
// No usable object context: the primary's copy is bad — record the error.
13221 primary_error(soid
, v
);
13225 if (!obc
->get_recovery_read()) {
13226 dout(20) << "recovery delayed on " << soid
13227 << "; could not get rw_manager lock" << dendl
;
13228 *work_started
= true;
13231 dout(20) << "recovery got recovery read lock on " << soid
13235 start_recovery_op(soid
);
13236 ceph_assert(!recovering
.count(soid
));
13237 recovering
.insert(make_pair(soid
, obc
));
13239 int r
= pgbackend
->recover_object(
13242 ObjectContextRef(),
13243 obc
, // has snapset context
13246 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
13247 on_failed_pull({ pg_whoami
}, soid
, v
);
// recover_replicas: push objects to replicas/async-recovery targets that
// are missing them.  Targets are ordered by ascending missing-count
// (acting replicas first, async targets after) so the least-behind peer
// returns to normal ASAP.  For each target's missing object this skips
// unfound, already-recovering, backfill-range, and still-missing-on-
// primary objects; deletes and pushes are prepped via the two helpers
// above, then the batched handle is run.
// @param max           budget of ops to start
// @param handle        thread-pool handle for timeout resets
// @param work_started  out: set when contention deferred work
// @return number of ops started
13253 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
,
13254 bool *work_started
)
13256 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
13257 uint64_t started
= 0;
13259 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13261 // this is FAR from an optimal recovery order. pretty lame, really.
13262 ceph_assert(!get_acting_recovery_backfill().empty());
13263 // choose replicas to recover, replica has the shortest missing list first
13264 // so we can bring it back to normal ASAP
13265 std::vector
<std::pair
<unsigned int, pg_shard_t
>> replicas_by_num_missing
,
13266 async_by_num_missing
;
13267 replicas_by_num_missing
.reserve(get_acting_recovery_backfill().size() - 1);
// Bucket each non-primary shard by whether it is an async recovery target.
13268 for (auto &p
: get_acting_recovery_backfill()) {
13269 if (p
== get_primary()) {
13272 auto pm
= recovery_state
.get_peer_missing().find(p
);
13273 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13274 auto nm
= pm
->second
.num_missing();
13276 if (is_async_recovery_target(p
)) {
13277 async_by_num_missing
.push_back(make_pair(nm
, p
));
13279 replicas_by_num_missing
.push_back(make_pair(nm
, p
));
13283 // sort by number of missing objects, in ascending order.
13284 auto func
= [](const std::pair
<unsigned int, pg_shard_t
> &lhs
,
13285 const std::pair
<unsigned int, pg_shard_t
> &rhs
) {
13286 return lhs
.first
< rhs
.first
;
13288 // acting goes first
13289 std::sort(replicas_by_num_missing
.begin(), replicas_by_num_missing
.end(), func
);
13290 // then async_recovery_targets
13291 std::sort(async_by_num_missing
.begin(), async_by_num_missing
.end(), func
);
// Concatenate: sorted acting replicas, then sorted async targets.
13292 replicas_by_num_missing
.insert(replicas_by_num_missing
.end(),
13293 async_by_num_missing
.begin(), async_by_num_missing
.end());
13294 for (auto &replica
: replicas_by_num_missing
) {
13295 pg_shard_t
&peer
= replica
.second
;
13296 ceph_assert(peer
!= get_primary());
13297 auto pm
= recovery_state
.get_peer_missing().find(peer
);
13298 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13299 size_t m_sz
= pm
->second
.num_missing();
13301 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
13302 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
// Walk this peer's missing objects in version order until the budget runs out.
13305 const pg_missing_t
&m(pm
->second
);
13306 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
13307 p
!= m
.get_rmissing().end() && started
< max
;
13309 handle
.reset_tp_timeout();
13310 const hobject_t
soid(p
->second
);
// Skip: no known source anywhere.
13312 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
13313 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
// Objects beyond the peer's last_backfill belong to backfill, not log
// recovery; finding one here that is not already recovering is an error.
13317 const pg_info_t
&pi
= recovery_state
.get_peer_info(peer
);
13318 if (soid
> pi
.last_backfill
) {
13319 if (!recovering
.count(soid
)) {
13320 derr
<< __func__
<< ": object " << soid
<< " last_backfill "
13321 << pi
.last_backfill
<< dendl
;
13322 derr
<< __func__
<< ": object added to missing set for backfill, but "
13323 << "is not in recovering, error!" << dendl
;
13329 if (recovering
.count(soid
)) {
13330 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
// Deleted objects are propagated as cheap replica deletes.
13334 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
13335 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
13336 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13337 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
, work_started
);
// A clone cannot be pushed while its head is still missing on the primary.
13341 if (soid
.is_snap() &&
13342 recovery_state
.get_pg_log().get_missing().is_missing(
13343 soid
.get_head())) {
13344 dout(10) << __func__
<< ": " << soid
.get_head()
13345 << " still missing on primary" << dendl
;
13349 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
13350 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
13354 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
13355 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13356 started
+= prep_object_replica_pushes(soid
, r
->second
.need
, h
, work_started
);
13360 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
// earliest_peer_backfill: return the minimum `begin` over every backfill
// target's BackfillInterval — i.e. the earliest object any peer still
// needs — starting from hobject_t::get_max() as the identity.  Every
// target must have an entry in peer_backfill_info.  NOTE(review): the
// closing `return e;` was dropped by extraction.
13364 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
13366 hobject_t e
= hobject_t::get_max();
13367 for (const pg_shard_t
& peer
: get_backfill_targets()) {
13368 const auto iter
= peer_backfill_info
.find(peer
);
13369 ceph_assert(iter
!= peer_backfill_info
.end());
13370 e
= std::min(e
, iter
->second
.begin
);
// all_peer_done: true when every backfill target's interval both extends
// to the end of the namespace and holds no remaining objects.  Requires
// the primary's own backfill_info to be empty already (asserted).
// NOTE(review): the early-return body and final `return true;` were
// dropped by extraction.
13375 bool PrimaryLogPG::all_peer_done() const
13377 // Primary hasn't got any more objects
13378 ceph_assert(backfill_info
.empty());
13380 for (const pg_shard_t
& bt
: get_backfill_targets()) {
13381 const auto piter
= peer_backfill_info
.find(bt
);
13382 ceph_assert(piter
!= peer_backfill_info
.end());
13383 const BackfillInterval
& pbi
= piter
->second
;
13384 // See if peer has more to process
// Peer still has work if its interval stops short of the end or has
// objects left in it.
13385 if (!pbi
.extends_to_end() || !pbi
.empty())
13396 * backfilled: fully pushed to replica or present in replica's missing set (both
13397 * our copy and theirs).
13399 * All objects on a backfill_target in
13400 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13401 * objects have been actually deleted and all logically-valid objects are replicated.
13402 * There may be PG objects in this interval yet to be backfilled.
13404 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13405 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13407 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13408 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13409 * interval remain on the backfill target.
13411 * For a backfill target, all objects <= peer_info[target].last_backfill
13412 * have been backfilled to target
13414 * There *MAY* be missing/outdated objects between last_backfill_started and
13415 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13416 * io created objects since the last scan. For this reason, we call
13417 * update_range() again before continuing backfill.
13419 uint64_t PrimaryLogPG::recover_backfill(
13421 ThreadPool::TPHandle
&handle
, bool *work_started
)
13423 dout(10) << __func__
<< " (" << max
<< ")"
13424 << " bft=" << get_backfill_targets()
13425 << " last_backfill_started " << last_backfill_started
13426 << (new_backfill
? " new_backfill":"")
13428 ceph_assert(!get_backfill_targets().empty());
13430 // Initialize from prior backfill state
13431 if (new_backfill
) {
13432 // on_activate() was called prior to getting here
13433 ceph_assert(last_backfill_started
== recovery_state
.earliest_backfill());
13434 new_backfill
= false;
13436 // initialize BackfillIntervals
13437 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13438 i
!= get_backfill_targets().end();
13440 peer_backfill_info
[*i
].reset(
13441 recovery_state
.get_peer_info(*i
).last_backfill
);
13443 backfill_info
.reset(last_backfill_started
);
13445 backfills_in_flight
.clear();
13446 pending_backfill_updates
.clear();
13449 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13450 i
!= get_backfill_targets().end();
13452 dout(10) << "peer osd." << *i
13453 << " info " << recovery_state
.get_peer_info(*i
)
13454 << " interval " << peer_backfill_info
[*i
].begin
13455 << "-" << peer_backfill_info
[*i
].end
13456 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
13460 // update our local interval to cope with recent changes
13461 backfill_info
.begin
= last_backfill_started
;
13462 update_range(&backfill_info
, handle
);
13465 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
13466 set
<hobject_t
> add_to_stat
;
13468 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13469 i
!= get_backfill_targets().end();
13471 peer_backfill_info
[*i
].trim_to(
13473 recovery_state
.get_peer_info(*i
).last_backfill
,
13474 last_backfill_started
));
13476 backfill_info
.trim_to(last_backfill_started
);
13478 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13479 while (ops
< max
) {
13480 if (backfill_info
.begin
<= earliest_peer_backfill() &&
13481 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
13482 hobject_t next
= backfill_info
.end
;
13483 backfill_info
.reset(next
);
13484 backfill_info
.end
= hobject_t::get_max();
13485 update_range(&backfill_info
, handle
);
13486 backfill_info
.trim();
13489 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
13491 bool sent_scan
= false;
13492 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13493 i
!= get_backfill_targets().end();
13495 pg_shard_t bt
= *i
;
13496 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13498 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
13499 if (pbi
.begin
<= backfill_info
.begin
&&
13500 !pbi
.extends_to_end() && pbi
.empty()) {
13501 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
13502 epoch_t e
= get_osdmap_epoch();
13503 MOSDPGScan
*m
= new MOSDPGScan(
13504 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, get_last_peering_reset(),
13505 spg_t(info
.pgid
.pgid
, bt
.shard
),
13506 pbi
.end
, hobject_t());
13507 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13508 ceph_assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
13509 waiting_on_backfill
.insert(bt
);
13514 // Count simultaneous scans as a single op and let those complete
13517 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13521 if (backfill_info
.empty() && all_peer_done()) {
13522 dout(10) << " reached end for both local and all peers" << dendl
;
13526 // Get object within set of peers to operate on and
13527 // the set of targets for which that object applies.
13528 hobject_t check
= earliest_peer_backfill();
13530 if (check
< backfill_info
.begin
) {
13532 set
<pg_shard_t
> check_targets
;
13533 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13534 i
!= get_backfill_targets().end();
13536 pg_shard_t bt
= *i
;
13537 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13538 if (pbi
.begin
== check
)
13539 check_targets
.insert(bt
);
13541 ceph_assert(!check_targets
.empty());
13543 dout(20) << " BACKFILL removing " << check
13544 << " from peers " << check_targets
<< dendl
;
13545 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
13546 i
!= check_targets
.end();
13548 pg_shard_t bt
= *i
;
13549 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13550 ceph_assert(pbi
.begin
== check
);
13552 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
13556 last_backfill_started
= check
;
13558 // Don't increment ops here because deletions
13559 // are cheap and not replied to unlike real recovery_ops,
13560 // and we can't increment ops without requeueing ourself
13563 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
13565 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
13566 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13567 i
!= get_backfill_targets().end();
13569 pg_shard_t bt
= *i
;
13570 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13571 // Find all check peers that have the wrong version
13572 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
13573 if (pbi
.objects
.begin()->second
!= obj_v
) {
13574 need_ver_targs
.push_back(bt
);
13576 keep_ver_targs
.push_back(bt
);
13579 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13581 // Only include peers that we've caught up to their backfill line
13582 // otherwise, they only appear to be missing this object
13583 // because their pbi.begin > backfill_info.begin.
13584 if (backfill_info
.begin
> pinfo
.last_backfill
)
13585 missing_targs
.push_back(bt
);
13587 skip_targs
.push_back(bt
);
13591 if (!keep_ver_targs
.empty()) {
13592 // These peers have version obj_v
13593 dout(20) << " BACKFILL keeping " << check
13594 << " with ver " << obj_v
13595 << " on peers " << keep_ver_targs
<< dendl
;
13596 //assert(!waiting_for_degraded_object.count(check));
13598 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
13599 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
13601 if (obc
->get_recovery_read()) {
13602 if (!need_ver_targs
.empty()) {
13603 dout(20) << " BACKFILL replacing " << check
13604 << " with ver " << obj_v
13605 << " to peers " << need_ver_targs
<< dendl
;
13607 if (!missing_targs
.empty()) {
13608 dout(20) << " BACKFILL pushing " << backfill_info
.begin
13609 << " with ver " << obj_v
13610 << " to peers " << missing_targs
<< dendl
;
13612 vector
<pg_shard_t
> all_push
= need_ver_targs
;
13613 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
13615 handle
.reset_tp_timeout();
13616 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
13618 *work_started
= true;
13619 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
13624 *work_started
= true;
13625 dout(20) << "backfill blocking on " << backfill_info
.begin
13626 << "; could not get rw_manager lock" << dendl
;
13630 dout(20) << "need_ver_targs=" << need_ver_targs
13631 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
13632 dout(20) << "backfill_targets=" << get_backfill_targets()
13633 << " missing_targs=" << missing_targs
13634 << " skip_targs=" << skip_targs
<< dendl
;
13636 last_backfill_started
= backfill_info
.begin
;
13637 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
13638 backfill_info
.pop_front();
13639 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
13640 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
13641 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
13642 i
!= check_targets
.end();
13644 pg_shard_t bt
= *i
;
13645 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13651 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
13652 i
!= add_to_stat
.end();
13654 ObjectContextRef obc
= get_object_context(*i
, false);
13657 add_object_context_to_pg_stat(obc
, &stat
);
13658 pending_backfill_updates
[*i
] = stat
;
13660 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
13661 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
13662 handle
.reset_tp_timeout();
13663 const hobject_t
& oid
= to_remove
[i
].get
<0>();
13664 eversion_t v
= to_remove
[i
].get
<1>();
13665 pg_shard_t peer
= to_remove
[i
].get
<2>();
13666 MOSDPGBackfillRemove
*m
;
13667 auto it
= reqs
.find(peer
);
13668 if (it
!= reqs
.end()) {
13671 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
13672 spg_t(info
.pgid
.pgid
, peer
.shard
),
13673 get_osdmap_epoch());
13675 m
->ls
.push_back(make_pair(oid
, v
));
13677 if (oid
<= last_backfill_started
)
13678 pending_backfill_updates
[oid
]; // add empty stat!
13680 for (auto p
: reqs
) {
13681 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
13682 get_osdmap_epoch());
13685 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
13687 hobject_t backfill_pos
=
13688 std::min(backfill_info
.begin
, earliest_peer_backfill());
13689 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
13690 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
13691 i
!= backfills_in_flight
.end();
13693 dout(20) << *i
<< " is still in flight" << dendl
;
13696 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
13697 backfill_pos
: *(backfills_in_flight
.begin());
13698 hobject_t new_last_backfill
= recovery_state
.earliest_backfill();
13699 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
13700 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
13701 pending_backfill_updates
.begin();
13702 i
!= pending_backfill_updates
.end() &&
13703 i
->first
< next_backfill_to_complete
;
13704 pending_backfill_updates
.erase(i
++)) {
13705 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
13706 ceph_assert(i
->first
> new_last_backfill
);
13707 // carried from a previous round – if we are here, then we had to
13708 // be requeued (by e.g. on_global_recover()) and those operations
13710 recovery_state
.update_complete_backfill_object_stats(
13713 new_last_backfill
= i
->first
;
13715 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
13717 ceph_assert(!pending_backfill_updates
.empty() ||
13718 new_last_backfill
== last_backfill_started
);
13719 if (pending_backfill_updates
.empty() &&
13720 backfill_pos
.is_max()) {
13721 ceph_assert(backfills_in_flight
.empty());
13722 new_last_backfill
= backfill_pos
;
13723 last_backfill_started
= backfill_pos
;
13725 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
13727 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13728 // all the backfill targets. Otherwise, we will move last_backfill up on
13729 // those targets need it and send OP_BACKFILL_PROGRESS to them.
13730 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13731 i
!= get_backfill_targets().end();
13733 pg_shard_t bt
= *i
;
13734 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13736 if (new_last_backfill
> pinfo
.last_backfill
) {
13737 recovery_state
.update_peer_last_backfill(bt
, new_last_backfill
);
13738 epoch_t e
= get_osdmap_epoch();
13739 MOSDPGBackfill
*m
= NULL
;
13740 if (pinfo
.last_backfill
.is_max()) {
13741 m
= new MOSDPGBackfill(
13742 MOSDPGBackfill::OP_BACKFILL_FINISH
,
13744 get_last_peering_reset(),
13745 spg_t(info
.pgid
.pgid
, bt
.shard
));
13746 // Use default priority here, must match sub_op priority
13747 start_recovery_op(hobject_t::get_max());
13749 m
= new MOSDPGBackfill(
13750 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
13752 get_last_peering_reset(),
13753 spg_t(info
.pgid
.pgid
, bt
.shard
));
13754 // Use default priority here, must match sub_op priority
13756 m
->last_backfill
= pinfo
.last_backfill
;
13757 m
->stats
= pinfo
.stats
;
13758 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13759 dout(10) << " peer " << bt
13760 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
13761 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
13766 *work_started
= true;
// Queue a push of one backfill object (oid at version v) to the given
// backfill peers: record it as in-flight/recovering, then hand it to the
// backend via recovery handle h.
// NOTE(review): fragment — gaps in embedded line numbers (13778, 13788-13793,
// 13796+) show elided statements, including the recover_object() arguments
// and the function's return.
13770 int PrimaryLogPG::prep_backfill_object_push(
13771 hobject_t oid
, eversion_t v
,
13772 ObjectContextRef obc
,
13773 vector
<pg_shard_t
> peers
,
13774 PGBackend::RecoveryHandle
*h
)
13776 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
// A push with no destination peers is a caller bug.
13777 ceph_assert(!peers
.empty());
13779 backfills_in_flight
.insert(oid
);
13780 recovery_state
.prepare_backfill_for_missing(oid
, v
, peers
);
// The object must not already be tracked as recovering.
13782 ceph_assert(!recovering
.count(oid
));
13784 start_recovery_op(oid
);
13785 recovering
.insert(make_pair(oid
, obc
));
13787 int r
= pgbackend
->recover_object(
13790 ObjectContextRef(),
// On backend error: log it and record the failed pull against ourselves.
13794 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
13795 on_failed_pull({ pg_whoami
}, oid
, v
);
// Bring a BackfillInterval (*bi) up to date with respect to local writes:
// if bi predates the log tail we rescan the disk range; otherwise we replay
// pg-log and projected-log entries newer than bi->version onto bi->objects.
// NOTE(review): fragment — embedded line-number gaps indicate elided
// statements (e.g. the loop/branch closers and parts of the insert at 13843).
13800 void PrimaryLogPG::update_range(
13801 BackfillInterval
*bi
,
13802 ThreadPool::TPHandle
&handle
)
13804 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
13805 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
// bi predates our log window: the log can no longer patch it, so rescan.
13807 if (bi
->version
< info
.log_tail
) {
13808 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
13810 bi
->version
= info
.last_update
;
13811 scan_range(local_min
, local_max
, bi
, handle
);
13814 if (bi
->version
>= projected_last_update
) {
13815 dout(10) << __func__
<< ": bi is current " << dendl
;
13816 ceph_assert(bi
->version
== projected_last_update
);
13817 } else if (bi
->version
>= info
.log_tail
) {
13818 if (recovery_state
.get_pg_log().get_log().empty() && projected_log
.empty()) {
13819 /* Because we don't move log_tail on split, the log might be
13820 * empty even if log_tail != last_update. However, the only
13821 * way to get here with an empty log is if log_tail is actually
13822 * eversion_t(), because otherwise the entry which changed
13823 * last_update since the last scan would have to be present.
13825 ceph_assert(bi
->version
== eversion_t());
13829 dout(10) << __func__
<< ": bi is old, (" << bi
->version
13830 << ") can be updated with log to projected_last_update "
13831 << projected_last_update
<< dendl
;
// Replay one log entry onto bi: updates refresh the recorded version,
// deletes drop the object from the interval.
13833 auto func
= [&](const pg_log_entry_t
&e
) {
13834 dout(10) << __func__
<< ": updating from version " << e
.version
13836 const hobject_t
&soid
= e
.soid
;
// Only entries inside the interval's range matter.
13837 if (soid
>= bi
->begin
&&
13839 if (e
.is_update()) {
13840 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
13841 << e
.version
<< dendl
;
13842 bi
->objects
.erase(e
.soid
);
13843 bi
->objects
.insert(
13847 } else if (e
.is_delete()) {
13848 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
13849 bi
->objects
.erase(e
.soid
);
// Apply the durable pg log first, then the in-flight projected log.
13853 dout(10) << "scanning pg log first" << dendl
;
13854 recovery_state
.get_pg_log().get_log().scan_log_after(bi
->version
, func
);
13855 dout(10) << "scanning projected log" << dendl
;
13856 projected_log
.scan_log_after(bi
->version
, func
);
13857 bi
->version
= projected_last_update
;
// Unreachable if the rescan above did its job.
13859 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
// Rebuild *bi by listing between min and max local objects starting at
// bi->begin, recording each object's version (from cached obc or the
// on-disk OI attr) into bi->objects and setting bi->end to the next cursor.
// NOTE(review): fragment — elided lines include branch closers and the
// handling when the OI attr read fails.
13863 void PrimaryLogPG::scan_range(
13864 int min
, int max
, BackfillInterval
*bi
,
13865 ThreadPool::TPHandle
&handle
)
13867 ceph_assert(is_locked());
13868 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
13869 bi
->clear_objects();
13871 vector
<hobject_t
> ls
;
13873 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
13874 ceph_assert(r
>= 0);
13875 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
13876 dout(20) << ls
<< dendl
;
13878 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
13879 handle
.reset_tp_timeout();
13880 ObjectContextRef obc
;
// Prefer the in-memory object context; fall back to reading OI_ATTR below.
13882 obc
= object_contexts
.lookup(*p
);
13884 if (!obc
->obs
.exists
) {
13885 /* If the object does not exist here, it must have been removed
13886 * between the collection_list_partial and here. This can happen
13887 * for the first item in the range, which is usually last_backfill.
13891 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
13892 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
// No cached context: read the object_info attribute from the backend.
13895 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
13896 /* If the object does not exist here, it must have been removed
13897 * between the collection_list_partial and here. This can happen
13898 * for the first item in the range, which is usually last_backfill.
13903 ceph_assert(r
>= 0);
13904 object_info_t
oi(bl
);
13905 bi
->objects
[*p
] = oi
.version
;
13906 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
13914 * verifies that stray objects have been deleted
// Debug check (gated by osd_debug_verify_stray_on_activate) that objects
// the pg log says were deleted are really absent from the local store;
// aborts the OSD if a stray object is found.
// NOTE(review): fragment — the assert wrapper around 13921-13922 and the
// early-return after the config check are elided.
13916 void PrimaryLogPG::check_local()
13918 dout(10) << __func__
<< dendl
;
13921 info
.last_update
>=
13922 recovery_state
.get_pg_log().get_tail()); // otherwise we need some help!
13924 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
13927 // just scan the log.
13928 set
<hobject_t
> did
;
13929 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= recovery_state
.get_pg_log().get_log().log
.rbegin();
13930 p
!= recovery_state
.get_pg_log().get_log().log
.rend();
// Newest-first scan: only the most recent entry per object counts.
13932 if (did
.count(p
->soid
))
13934 did
.insert(p
->soid
);
13936 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
13937 dout(10) << " checking " << p
->soid
13938 << " at " << p
->version
<< dendl
;
13940 int r
= osd
->store
->stat(
13942 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
// A deleted object that still stats successfully is fatal corruption.
13944 if (r
!= -ENOENT
) {
13945 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
13946 << "deleted" << dendl
;
13947 ceph_abort_msg("erroneously present object");
13950 // ignore old(+missing) objects
13957 // ===========================
// Build the hobject_t name for this PG's *current* (unarchived) hit-set
// object, "hit_set_<pgid>_current_<stamp>", in the configured hit-set
// namespace. (Return statement elided in this fragment.)
13960 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
13963 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
13964 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
13965 info
.pgid
.ps(), info
.pgid
.pool(),
13966 cct
->_conf
->osd_hit_set_namespace
);
13967 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Build the hobject_t name for an *archived* hit set covering [start, end),
// formatting the timestamps in GMT or localtime (legacy pre-octopus form)
// depending on a flag elided from this fragment (presumably using_gmt —
// TODO confirm against full source). Return statement also elided.
13971 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
13976 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
13978 start
.gmtime(ss
, true /* legacy pre-octopus form */) << "_";
13979 end
.gmtime(ss
, true /* legacy pre-octopus form */);
13981 start
.localtime(ss
, true /* legacy pre-octopus form */) << "_";
13982 end
.localtime(ss
, true /* legacy pre-octopus form */);
13984 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
13985 info
.pgid
.ps(), info
.pgid
.pool(),
13986 cct
->_conf
->osd_hit_set_namespace
);
13987 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Drop in-memory hit-set state; resets the start stamp (line 13994,
// elided here, presumably clears the hit_set itself — TODO confirm).
13991 void PrimaryLogPG::hit_set_clear()
13993 dout(20) << __func__
<< dendl
;
13995 hit_set_start_stamp
= utime_t();
// (Re)initialize hit-set tracking after activation: bail or tear down when
// the pool has hit sets disabled, otherwise start fresh and replay recent
// pg-log writes into the new set.
// NOTE(review): fragment — the early-return bodies and the hit_set_create()
// call implied by the comment at 14017 are elided.
13998 void PrimaryLogPG::hit_set_setup()
14000 if (!is_active() ||
// Pool no longer wants hit sets: the primary removes existing archives.
14006 if (is_active() && is_primary() &&
14007 (!pool
.info
.hit_set_count
||
14008 !pool
.info
.hit_set_period
||
14009 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
14012 // only primary is allowed to remove all the hit set objects
14013 hit_set_remove_all();
14017 // FIXME: discard any previous data for now
14020 // include any writes we know about from the pg log. this doesn't
14021 // capture reads, but it is better than nothing!
14022 hit_set_apply_log();
// Remove every archived hit-set object for this PG: verify none are
// degraded/blocked, then submit a transaction that trims the history to
// zero entries, clear the recorded history, and drop agent state's copies.
// NOTE(review): fragment — loop increments, early returns, and the guard
// around the obc use are elided.
14025 void PrimaryLogPG::hit_set_remove_all()
14027 // If any archives are degraded we skip this
14028 for (auto p
= info
.hit_set
.history
.begin();
14029 p
!= info
.hit_set
.history
.end();
14031 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14033 // Once we hit a degraded object just skip
14034 if (is_degraded_or_backfilling_object(aoid
))
14036 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14040 if (!info
.hit_set
.history
.empty()) {
// Anchor the trim transaction on the newest archive object.
14041 auto p
= info
.hit_set
.history
.rbegin();
14042 ceph_assert(p
!= info
.hit_set
.history
.rend());
14043 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14044 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14045 ObjectContextRef obc
= get_object_context(oid
, false);
14048 OpContextUPtr ctx
= simple_opc_create(obc
);
14049 ctx
->at_version
= get_next_version();
14050 ctx
->updated_hset_history
= info
.hit_set
;
14051 utime_t now
= ceph_clock_now();
// max == 0 trims the entire history.
14053 hit_set_trim(ctx
, 0);
14054 simple_opc_submit(std::move(ctx
));
14057 recovery_state
.update_hset(pg_hit_set_history_t());
14059 agent_state
->discard_hit_sets();
// Allocate a fresh in-memory HitSet from the pool's parameters. For bloom
// hit sets, scale the false-positive rate across the full set count and
// estimate target_size from the previous set's insert rate, clamped to
// [osd_hit_set_min_size, osd_hit_set_max_size].
// NOTE(review): fragment — parts of the target_size expression (14087-14088)
// and the condition head at 14094 are elided.
14063 void PrimaryLogPG::hit_set_create()
14065 utime_t now
= ceph_clock_now();
14066 // make a copy of the params to modify
14067 HitSet::Params
params(pool
.info
.hit_set_params
);
14069 dout(20) << __func__
<< " " << params
<< dendl
;
14070 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
14071 BloomHitSet::Params
*p
=
14072 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
14074 // convert false positive rate so it holds up across the full period
14075 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
14076 if (p
->get_fpp() <= 0.0)
14077 p
->set_fpp(.01); // fpp cannot be zero!
14079 // if we don't have specified size, estimate target size based on the
// Estimate from the previous set: unique inserts scaled to a full period.
14081 if (p
->target_size
== 0 && hit_set
) {
14082 utime_t dur
= now
- hit_set_start_stamp
;
14083 unsigned unique
= hit_set
->approx_unique_insert_count();
14084 dout(20) << __func__
<< " previous set had approx " << unique
14085 << " unique items over " << dur
<< " seconds" << dendl
;
14086 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
// Clamp the estimate to the configured min/max bounds.
14089 if (p
->target_size
<
14090 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
14091 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
14094 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
14095 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
// Re-seed the bloom filter each period.
14097 p
->seed
= now
.sec();
14099 dout(10) << __func__
<< " target_size " << p
->target_size
14100 << " fpp " << p
->get_fpp() << dendl
;
14102 hit_set
.reset(new HitSet(params
));
14103 hit_set_start_stamp
= now
;
// Replay pg-log write entries in (from, to] into the current hit set —
// used after peering to capture writes from a potentially lost interval
// (reads cannot be recovered this way).
// NOTE(review): fragment — the no-update early return, loop advances, and
// the function's return value are elided.
14107 * apply log entries to set
14109 * this would only happen after peering, to at least capture writes
14110 * during an interval that was potentially lost.
14112 bool PrimaryLogPG::hit_set_apply_log()
14117 eversion_t to
= info
.last_update
;
14118 eversion_t from
= info
.hit_set
.current_last_update
;
14120 dout(20) << __func__
<< " no update" << dendl
;
14124 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
// Walk the log newest-first: skip entries newer than `to`, then insert
// every soid whose version is still newer than `from`.
14125 list
<pg_log_entry_t
>::const_reverse_iterator p
=
14126 recovery_state
.get_pg_log().get_log().log
.rbegin();
14127 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> to
)
14129 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> from
) {
14130 hit_set
->insert(p
->soid
);
// Persist the current in-memory hit set as a new archive object: encode it,
// append a pg_hit_set_info_t to the history, write the object with
// fabricated OI/SnapSet attrs via a self-generated op context, trim old
// archives down to the pool's hit_set_count, and submit.
// Bails early if any archive (or the new object) is degraded, scrub-blocked,
// or overlaps the backfill boundary of a backfill target.
// NOTE(review): fragment — loop increments, early returns, bufferlist
// declarations (bl/bss), parts of the archive-object and log-entry argument
// lists are elided.
14137 void PrimaryLogPG::hit_set_persist()
14139 dout(10) << __func__
<< dendl
;
14141 unsigned max
= pool
.info
.hit_set_count
;
14143 utime_t now
= ceph_clock_now();
14146 // If any archives are degraded we skip this persist request
14147 // account for the additional entry being added below
14148 for (auto p
= info
.hit_set
.history
.begin();
14149 p
!= info
.hit_set
.history
.end();
14151 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14153 // Once we hit a degraded object just skip further trim
14154 if (is_degraded_or_backfilling_object(aoid
))
14156 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14160 // If backfill is in progress and we could possibly overlap with the
14161 // hit_set_* objects, back off. Since these all have
14162 // hobject_t::hash set to pgid.ps(), and those sort first, we can
14163 // look just at that. This is necessary because our transactions
14164 // may include a modify of the new hit_set *and* a delete of the
14165 // old one, and this may span the backfill boundary.
14166 for (set
<pg_shard_t
>::const_iterator p
= get_backfill_targets().begin();
14167 p
!= get_backfill_targets().end();
14169 const pg_info_t
& pi
= recovery_state
.get_peer_info(*p
);
14170 if (pi
.last_backfill
== hobject_t() ||
14171 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
14172 dout(10) << __func__
<< " backfill target osd." << *p
14173 << " last_backfill has not progressed past pgid ps"
// Record the interval covered by the set being archived.
14180 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
14181 new_hset
.begin
= hit_set_start_stamp
;
14182 new_hset
.end
= now
;
14183 oid
= get_hit_set_archive_object(
14186 new_hset
.using_gmt
);
14188 // If the current object is degraded we skip this persist request
14189 if (m_scrubber
->write_blocked_by_scrub(oid
))
14193 encode(*hit_set
, bl
);
14194 dout(20) << __func__
<< " archive " << oid
<< dendl
;
// Keep the tier agent's in-memory copy, bounded to hit_set_count - 1
// (the new set occupies one slot).
14197 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
14198 uint32_t size
= agent_state
->hit_set_map
.size();
14199 if (size
>= pool
.info
.hit_set_count
) {
14200 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
14202 hit_set_in_memory_trim(size
);
14205 ObjectContextRef obc
= get_object_context(oid
, true);
14206 OpContextUPtr ctx
= simple_opc_create(obc
);
14208 ctx
->at_version
= get_next_version();
14209 ctx
->updated_hset_history
= info
.hit_set
;
14210 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
14212 updated_hit_set_hist
.current_last_update
= info
.last_update
;
14213 new_hset
.version
= ctx
->at_version
;
14215 updated_hit_set_hist
.history
.push_back(new_hset
);
14218 // fabricate an object_info_t and SnapSet
14219 obc
->obs
.oi
.version
= ctx
->at_version
;
14220 obc
->obs
.oi
.mtime
= now
;
14221 obc
->obs
.oi
.size
= bl
.length();
14222 obc
->obs
.exists
= true;
14223 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
14225 ctx
->new_obs
= obc
->obs
;
14227 ctx
->new_snapset
= obc
->ssc
->snapset
;
// Account the archive object in the PG stats.
14229 ctx
->delta_stats
.num_objects
++;
14230 ctx
->delta_stats
.num_objects_hit_set_archive
++;
14232 ctx
->delta_stats
.num_bytes
+= bl
.length();
14233 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
14236 encode(ctx
->new_snapset
, bss
);
14237 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
14238 encode(ctx
->new_obs
.oi
, boi
,
14239 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
14241 ctx
->op_t
->create(oid
);
14243 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
14244 write_update_size_and_usage(ctx
->delta_stats
, obc
->obs
.oi
, ctx
->modified_ranges
,
14246 ctx
->clean_regions
.mark_data_region_dirty(0, bl
.length());
14248 map
<string
, bufferlist
> attrs
;
14249 attrs
[OI_ATTR
] = std::move(boi
);
14250 attrs
[SS_ATTR
] = std::move(bss
);
14251 setattrs_maybe_cache(ctx
->obc
, ctx
->op_t
.get(), attrs
);
14252 ctx
->log
.push_back(
14254 pg_log_entry_t::MODIFY
,
14263 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
// Trim old archives in the same transaction, then submit.
14265 hit_set_trim(ctx
, max
);
14267 simple_opc_submit(std::move(ctx
));
// Trim the hit-set archive history down to at most `max` entries, deleting
// the oldest archive objects inside the supplied op context's transaction
// and reversing their contribution to the PG stats.
// NOTE(review): fragment — the pg_log_entry_t constructor arguments
// (14286-14293) and a guard around the obc use are elided.
14270 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
14272 ceph_assert(ctx
->updated_hset_history
);
14273 pg_hit_set_history_t
&updated_hit_set_hist
=
14274 *(ctx
->updated_hset_history
);
// Remove oldest-first until the history fits in `max` slots.
14275 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
14276 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
14277 ceph_assert(p
!= updated_hit_set_hist
.history
.end());
14278 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14280 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14282 dout(20) << __func__
<< " removing " << oid
<< dendl
;
// Each delete consumes its own version within the context.
14283 ++ctx
->at_version
.version
;
14284 ctx
->log
.push_back(
14285 pg_log_entry_t(pg_log_entry_t::DELETE
,
14294 ctx
->op_t
->remove(oid
);
14295 updated_hit_set_hist
.history
.pop_front();
// Undo the stats the archive contributed when it was persisted.
14297 ObjectContextRef obc
= get_object_context(oid
, false);
14299 --ctx
->delta_stats
.num_objects
;
14300 --ctx
->delta_stats
.num_objects_hit_set_archive
;
14301 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
14302 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
// Evict the oldest in-memory hit sets held by the tier agent until at most
// max_in_memory remain.
14306 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
14308 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
14309 agent_state
->remove_oldest_hit_set();
14314 // =======================================
// Initialize (or keep) the tier-agent state for a cache-tier PG: skipped
// unless the PG is active with a valid cache mode and base tier. New state
// starts scanning from a random position in the hash space. Warns (and,
// per the surrounding flow, presumably returns early — TODO confirm) when
// post-split stats are invalid, since the agent needs trustworthy stats.
// NOTE(review): fragment — early-return bodies and some closers are elided.
14317 void PrimaryLogPG::agent_setup()
14319 ceph_assert(is_locked());
14320 if (!is_active() ||
14322 state_test(PG_STATE_PREMERGE
) ||
14323 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
14324 pool
.info
.tier_of
< 0 ||
14325 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
14329 if (!agent_state
) {
14330 agent_state
.reset(new TierAgentState
);
14332 // choose random starting position
14333 agent_state
->position
= hobject_t();
14334 agent_state
->position
.pool
= info
.pgid
.pool();
14335 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
14338 agent_state
->start
= agent_state
->position
;
14340 dout(10) << __func__
<< " allocated new state, position "
14341 << agent_state
->position
<< dendl
;
14343 dout(10) << __func__
<< " keeping existing state" << dendl
;
14346 if (info
.stats
.stats_invalid
) {
14347 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
14350 agent_choose_mode();
// Tear down tier-agent state for this PG.
14353 void PrimaryLogPG::agent_clear()
14356 agent_state
.reset(NULL
);
// One scheduling quantum of tier-agent work: list up to ls_max objects from
// the current scan position and, for each eligible one, attempt an evict
// (when evict mode is active) or a flush (when flush mode is active and
// quota remains). Objects are skipped — with l_osd_agent_skip accounting —
// when they are hit-set objects, degraded, missing a head, lacking an obc,
// nonexistent, under scrub, blocked, request-pending, or omap over an EC
// base pool. Afterwards the scan position advances (wrapping detection
// included) and histograms decay periodically.
// NOTE(review): fragment — many closers, `continue`s, the listing's output
// arguments, the `started`/`next` declarations, and the return paths are
// elided; per the header comment the return is false only when a full pass
// touched no objects.
14359 // Return false if no objects operated on since start of object hash space
14360 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
14362 std::scoped_lock locker
{*this};
14363 if (!agent_state
) {
14364 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
14368 ceph_assert(!recovery_state
.is_deleting());
14370 if (agent_state
->is_idle()) {
14371 dout(10) << __func__
<< " idle, stopping" << dendl
;
14375 osd
->logger
->inc(l_osd_agent_wake
);
14377 dout(10) << __func__
14378 << " max " << start_max
14379 << ", flush " << agent_state
->get_flush_mode_name()
14380 << ", evict " << agent_state
->get_evict_mode_name()
14381 << ", pos " << agent_state
->position
14383 ceph_assert(is_primary());
14384 ceph_assert(is_active());
14386 agent_load_hit_sets();
14388 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14389 ceph_assert(base_pool
);
14392 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
14394 // list some objects. this conveniently lists clones (oldest to
14395 // newest) before heads... the same order we want to flush in.
14397 // NOTE: do not flush the Sequencer. we will assume that the
14398 // listing we get back is imprecise.
14399 vector
<hobject_t
> ls
;
14401 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
14403 ceph_assert(r
>= 0);
14404 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
// Per-object eligibility filter; each rejected object bumps the skip
// counter and moves on.
14406 for (vector
<hobject_t
>::iterator p
= ls
.begin();
14409 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
14410 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
14411 osd
->logger
->inc(l_osd_agent_skip
);
14414 if (is_degraded_or_backfilling_object(*p
)) {
14415 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
14416 osd
->logger
->inc(l_osd_agent_skip
);
14419 if (is_missing_object(p
->get_head())) {
14420 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
14421 osd
->logger
->inc(l_osd_agent_skip
);
14424 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
14426 // we didn't flush; we may miss something here.
14427 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
14428 osd
->logger
->inc(l_osd_agent_skip
);
14431 if (!obc
->obs
.exists
) {
14432 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
14433 osd
->logger
->inc(l_osd_agent_skip
);
14436 if (m_scrubber
->range_intersects_scrub(obc
->obs
.oi
.soid
,
14437 obc
->obs
.oi
.soid
.get_head())) {
14438 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14439 osd
->logger
->inc(l_osd_agent_skip
);
14442 if (obc
->is_blocked()) {
14443 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14444 osd
->logger
->inc(l_osd_agent_skip
);
14447 if (obc
->is_request_pending()) {
14448 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
14449 osd
->logger
->inc(l_osd_agent_skip
);
14453 // be careful flushing omap to an EC pool.
14454 if (!base_pool
->supports_omap() &&
14455 obc
->obs
.oi
.is_omap()) {
14456 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
14457 osd
->logger
->inc(l_osd_agent_skip
);
// Evict takes priority over flush; flush consumes quota.
14461 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
14462 agent_maybe_evict(obc
, false))
14464 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
14465 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
14467 --agent_flush_quota
;
14469 if (started
>= start_max
) {
14470 // If finishing early, set "next" to the next object
14471 if (++p
!= ls
.end())
// Periodically decay temperature history so recent access dominates.
14477 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
14478 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
14479 agent_state
->hist_age
= 0;
14480 agent_state
->temp_hist
.decay();
14483 // Total objects operated on so far
14484 int total_started
= agent_state
->started
+ started
;
14485 bool need_delay
= false;
14487 dout(20) << __func__
<< " start pos " << agent_state
->position
14488 << " next start pos " << next
14489 << " started " << total_started
<< dendl
;
14491 // See if we've made a full pass over the object hash space
14492 // This might check at most ls_max objects a second time to notice that
14493 // we've checked every objects at least once.
14494 if (agent_state
->position
< agent_state
->start
&&
14495 next
>= agent_state
->start
) {
14496 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
14497 if (total_started
== 0)
14501 agent_state
->start
= next
;
14503 agent_state
->started
= total_started
;
14505 // See if we are starting from beginning
14507 agent_state
->position
= hobject_t();
14509 agent_state
->position
= next
;
14511 // Discard old in memory HitSets
14512 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
14515 ceph_assert(agent_state
->delaying
== false);
14519 agent_choose_mode();
// Ensure the tier agent has every archived hit set in memory: for each
// history entry missing from agent_state->hit_set_map, read the archive
// object from the local store, decode it, and register it keyed by its
// begin timestamp. Unreadable objects and non-replicated (EC) pools abort
// the attempt.
// NOTE(review): fragment — early returns/breaks, the bufferlist `bl`
// declaration, the decode step between 14560 and 14562, and closers are
// elided.
14523 void PrimaryLogPG::agent_load_hit_sets()
14525 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
14529 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
14530 dout(10) << __func__
<< dendl
;
14531 for (auto p
= info
.hit_set
.history
.begin();
14532 p
!= info
.hit_set
.history
.end(); ++p
) {
14533 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
14534 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
14535 << p
->end
<< dendl
;
14536 if (!pool
.info
.is_replicated()) {
14537 // FIXME: EC not supported here yet
14538 derr
<< __func__
<< " on non-replicated pool" << dendl
;
14542 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14543 if (is_unreadable_object(oid
)) {
14544 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
14548 ObjectContextRef obc
= get_object_context(oid
, false);
14550 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
// Read the full archive object (len 0 == whole object).
14556 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
14557 ceph_assert(r
>= 0);
14559 HitSetRef
hs(new HitSet
);
14560 bufferlist::const_iterator pbl
= bl
.begin();
14562 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
// Consider flushing one cache-tier object to the base tier. Skips clean,
// cache-pinned, too-young (unless evict mode is FULL), or already-flushing
// objects; otherwise starts an async flush via start_flush() with an
// on-flush callback that closes the agent op. Returns are elided in this
// fragment, but per call sites it reports whether a flush was started.
14568 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
14570 if (!obc
->obs
.oi
.is_dirty()) {
14571 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
14572 osd
->logger
->inc(l_osd_agent_skip
);
14575 if (obc
->obs
.oi
.is_cache_pinned()) {
14576 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14577 osd
->logger
->inc(l_osd_agent_skip
);
// Age check uses local_mtime when set, falling back to client mtime.
14581 utime_t now
= ceph_clock_now();
14582 utime_t ob_local_mtime
;
14583 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14584 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14586 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14588 bool evict_mode_full
=
14589 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
// When not in FULL evict mode, respect cache_min_flush_age for heads.
14590 if (!evict_mode_full
&&
14591 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
14592 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
14593 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14594 osd
->logger
->inc(l_osd_agent_skip
);
14598 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
14599 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
14600 osd
->logger
->inc(l_osd_agent_skip
);
14604 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
14606 // FIXME: flush anything dirty, regardless of what distribution of
14609 hobject_t oid
= obc
->obs
.oi
.soid
;
14610 osd
->agent_start_op(oid
);
14611 // no need to capture a pg ref, can't outlive fop or ctx
14612 std::function
<void()> on_flush
= [this, oid
]() {
14613 osd
->agent_finish_op(oid
);
14616 int result
= start_flush(
14617 OpRequestRef(), obc
, false, NULL
,
// Anything other than EINPROGRESS means the flush did not start.
14619 if (result
!= -EINPROGRESS
) {
14621 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
14622 << " with " << result
<< dendl
;
14623 osd
->logger
->inc(l_osd_agent_skip
);
14627 osd
->logger
->inc(l_osd_agent_flush
);
// Consider evicting one clean cache-tier object. Skips dirty (unless just
// flushed), scrub-overlapping, watched, blocked, cache-pinned objects and
// heads that still have clones. Outside FULL evict mode it also applies the
// cache_min_evict_age check and a temperature test: the object is evicted
// only when its atime/temp percentile position clears the current
// evict_effort threshold. Eviction is a self-generated delete op context
// submitted through the normal op path, with tier/agent eviction counters
// updated.
// NOTE(review): fragment — returns/continues, lock-manager arguments
// (14709-14712), the on-finish lambda body framing, and various closers are
// elided.
14631 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
14633 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
14634 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
14635 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
14638 // This is already checked by agent_work() which passes after_flush = false
14639 if (after_flush
&& m_scrubber
->range_intersects_scrub(soid
, soid
.get_head())) {
14640 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14643 if (!obc
->obs
.oi
.watchers
.empty()) {
14644 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
14647 if (obc
->is_blocked()) {
14648 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14651 if (obc
->obs
.oi
.is_cache_pinned()) {
14652 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
// A head object cannot be evicted while clones still reference it.
14656 if (soid
.snap
== CEPH_NOSNAP
) {
14657 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
14659 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
14664 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
14665 // is this object old than cache_min_evict_age?
14666 utime_t now
= ceph_clock_now();
14667 utime_t ob_local_mtime
;
14668 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14669 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14671 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14673 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
14674 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14675 osd
->logger
->inc(l_osd_agent_skip
);
14678 // is this object old and/or cold enough?
// Temperature percentile vs. evict_effort decides whether this object
// is cold enough to evict in the current mode.
14680 uint64_t temp_upper
= 0, temp_lower
= 0;
14682 agent_estimate_temp(soid
, &temp
);
14683 agent_state
->temp_hist
.add(temp
);
14684 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
14686 dout(20) << __func__
14687 << " temp " << temp
14688 << " pos " << temp_lower
<< "-" << temp_upper
14689 << ", evict_effort " << agent_state
->evict_effort
14691 dout(30) << "agent_state:\n";
14692 Formatter
*f
= Formatter::create("");
14693 f
->open_object_section("agent_state");
14694 agent_state
->dump(f
);
14695 f
->close_section();
14700 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
14704 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
14705 OpContextUPtr ctx
= simple_opc_create(obc
);
14707 auto null_op_req
= OpRequestRef();
// If we can't take the object lock, give up on this object for now.
14708 if (!ctx
->lock_manager
.get_lock_type(
14713 close_op_ctx(ctx
.release());
14714 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
14718 osd
->agent_start_evict_op();
14719 ctx
->register_on_finish(
14721 osd
->agent_finish_evict_op();
14724 ctx
->at_version
= get_next_version();
14725 ceph_assert(ctx
->new_obs
.exists
);
14726 int r
= _delete_oid(ctx
.get(), true, false);
// Adjust stats for the eviction before submitting the delete.
14727 if (obc
->obs
.oi
.is_omap())
14728 ctx
->delta_stats
.num_objects_omap
--;
14729 ctx
->delta_stats
.num_evict
++;
14730 ctx
->delta_stats
.num_evict_kb
+= shift_round_up(obc
->obs
.oi
.size
, 10);
14731 if (obc
->obs
.oi
.is_dirty())
14732 --ctx
->delta_stats
.num_objects_dirty
;
14733 ceph_assert(r
== 0);
14734 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
);
14735 simple_opc_submit(std::move(ctx
));
14736 osd
->logger
->inc(l_osd_tier_evict
);
14737 osd
->logger
->inc(l_osd_agent_evict
);
// Stop the tier agent for this PG: force both flush and evict modes to
// IDLE and deregister from the OSD-wide agent queue.
14741 void PrimaryLogPG::agent_stop()
14743 dout(20) << __func__
<< dendl
;
14744 if (agent_state
&& !agent_state
->is_idle()) {
14745 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
14746 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
14747 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
// Temporarily pause the tier agent: mark it delaying (must not already be)
// and deregister from the OSD-wide agent queue until
// agent_choose_mode_restart() clears the delay.
14751 void PrimaryLogPG::agent_delay()
14753 dout(20) << __func__
<< dendl
;
14754 if (agent_state
&& !agent_state
->is_idle()) {
14755 ceph_assert(agent_state
->delaying
== false);
14756 agent_state
->delaying
= true;
14757 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
14761 void PrimaryLogPG::agent_choose_mode_restart()
14763 dout(20) << __func__
<< dendl
;
14764 std::scoped_lock locker
{*this};
14765 if (agent_state
&& agent_state
->delaying
) {
14766 agent_state
->delaying
= false;
14767 agent_choose_mode(true);
14771 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
14773 bool requeued
= false;
14774 // Let delay play out
14775 if (agent_state
->delaying
) {
14776 dout(20) << __func__
<< " " << this << " delaying, ignored" << dendl
;
14780 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
14781 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
14782 unsigned evict_effort
= 0;
14784 if (info
.stats
.stats_invalid
) {
14785 // idle; stats can't be trusted until we scrub.
14786 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
14791 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
14792 ceph_assert(divisor
> 0);
14794 // adjust (effective) user objects down based on the number
14795 // of HitSet objects, which should not count toward our total since
14796 // they cannot be flushed.
14797 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
14799 // also exclude omap objects if ec backing pool
14800 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14801 ceph_assert(base_pool
);
14802 if (!base_pool
->supports_omap())
14803 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
14805 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
14806 if (num_user_objects
> unflushable
)
14807 num_user_objects
-= unflushable
;
14809 num_user_objects
= 0;
14811 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
14812 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
14813 num_user_bytes
-= unflushable_bytes
;
14814 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
14815 num_user_bytes
+= num_overhead_bytes
;
14817 // also reduce the num_dirty by num_objects_omap
14818 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
14819 if (!base_pool
->supports_omap()) {
14820 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
14821 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
14826 dout(10) << __func__
14828 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
14830 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
14831 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
14832 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
14833 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
14834 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
14835 << " num_dirty: " << num_dirty
14836 << " num_user_objects: " << num_user_objects
14837 << " num_user_bytes: " << num_user_bytes
14838 << " num_overhead_bytes: " << num_overhead_bytes
14839 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
14840 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
14843 // get dirty, full ratios
14844 uint64_t dirty_micro
= 0;
14845 uint64_t full_micro
= 0;
14846 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
14847 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
14849 num_dirty
* avg_size
* 1000000 /
14850 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
14852 num_user_objects
* avg_size
* 1000000 /
14853 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
14855 if (pool
.info
.target_max_objects
> 0) {
14856 uint64_t dirty_objects_micro
=
14857 num_dirty
* 1000000 /
14858 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
14859 if (dirty_objects_micro
> dirty_micro
)
14860 dirty_micro
= dirty_objects_micro
;
14861 uint64_t full_objects_micro
=
14862 num_user_objects
* 1000000 /
14863 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
14864 if (full_objects_micro
> full_micro
)
14865 full_micro
= full_objects_micro
;
14867 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
14868 << " full " << ((float)full_micro
/ 1000000.0)
14872 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
14873 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
14874 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
14875 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
14876 flush_target
+= flush_slop
;
14877 flush_high_target
+= flush_slop
;
14879 flush_target
-= std::min(flush_target
, flush_slop
);
14880 flush_high_target
-= std::min(flush_high_target
, flush_slop
);
14883 if (dirty_micro
> flush_high_target
) {
14884 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
14885 } else if (dirty_micro
> flush_target
|| (!flush_target
&& num_dirty
> 0)) {
14886 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
14890 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
14891 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
14892 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
14893 evict_target
+= evict_slop
;
14895 evict_target
-= std::min(evict_target
, evict_slop
);
14897 if (full_micro
> 1000000) {
14898 // evict anything clean
14899 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
14900 evict_effort
= 1000000;
14901 } else if (full_micro
> evict_target
) {
14902 // set effort in [0..1] range based on where we are between
14903 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
14904 uint64_t over
= full_micro
- evict_target
;
14905 uint64_t span
= 1000000 - evict_target
;
14906 evict_effort
= std::max(over
* 1000000 / span
,
14907 uint64_t(1000000.0 *
14908 cct
->_conf
->osd_agent_min_evict_effort
));
14910 // quantize effort to avoid too much reordering in the agent_queue.
14911 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
14912 ceph_assert(inc
> 0);
14913 uint64_t was
= evict_effort
;
14914 evict_effort
-= evict_effort
% inc
;
14915 if (evict_effort
< inc
)
14916 evict_effort
= inc
;
14917 ceph_assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
14918 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
14923 bool old_idle
= agent_state
->is_idle();
14924 if (flush_mode
!= agent_state
->flush_mode
) {
14925 dout(5) << __func__
<< " flush_mode "
14926 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
14928 << TierAgentState::get_flush_mode_name(flush_mode
)
14930 recovery_state
.update_stats(
14931 [=](auto &history
, auto &stats
) {
14932 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
14933 osd
->agent_inc_high_count();
14934 stats
.stats
.sum
.num_flush_mode_high
= 1;
14935 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
14936 stats
.stats
.sum
.num_flush_mode_low
= 1;
14938 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
14939 osd
->agent_dec_high_count();
14940 stats
.stats
.sum
.num_flush_mode_high
= 0;
14941 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
14942 stats
.stats
.sum
.num_flush_mode_low
= 0;
14946 agent_state
->flush_mode
= flush_mode
;
14948 if (evict_mode
!= agent_state
->evict_mode
) {
14949 dout(5) << __func__
<< " evict_mode "
14950 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
14952 << TierAgentState::get_evict_mode_name(evict_mode
)
14954 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
14958 requeue_ops(waiting_for_flush
);
14959 requeue_ops(waiting_for_active
);
14960 requeue_ops(waiting_for_readable
);
14961 requeue_ops(waiting_for_scrub
);
14962 requeue_ops(waiting_for_cache_not_full
);
14963 objects_blocked_on_cache_full
.clear();
14966 recovery_state
.update_stats(
14967 [=](auto &history
, auto &stats
) {
14968 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
14969 stats
.stats
.sum
.num_evict_mode_some
= 1;
14970 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
14971 stats
.stats
.sum
.num_evict_mode_full
= 1;
14973 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
14974 stats
.stats
.sum
.num_evict_mode_some
= 0;
14975 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
14976 stats
.stats
.sum
.num_evict_mode_full
= 0;
14980 agent_state
->evict_mode
= evict_mode
;
14982 uint64_t old_effort
= agent_state
->evict_effort
;
14983 if (evict_effort
!= agent_state
->evict_effort
) {
14984 dout(5) << __func__
<< " evict_effort "
14985 << ((float)agent_state
->evict_effort
/ 1000000.0)
14987 << ((float)evict_effort
/ 1000000.0)
14989 agent_state
->evict_effort
= evict_effort
;
14992 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14993 // (including flush). This is probably fine (they should be
14994 // correlated) but it is not precisely correct.
14995 if (agent_state
->is_idle()) {
14996 if (!restart
&& !old_idle
) {
14997 osd
->agent_disable_pg(this, old_effort
);
15000 if (restart
|| old_idle
) {
15001 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
15002 } else if (old_effort
!= agent_state
->evict_effort
) {
15003 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
15009 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
15011 ceph_assert(hit_set
);
15014 if (hit_set
->contains(oid
))
15017 int last_n
= pool
.info
.hit_set_search_last_n
;
15018 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
15019 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
15020 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
15021 if (p
->second
->contains(oid
)) {
15022 *temp
+= pool
.info
.get_grade(i
);
15028 // Dup op detection
15030 bool PrimaryLogPG::already_complete(eversion_t v
)
15032 dout(20) << __func__
<< ": " << v
<< dendl
;
15033 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
15036 dout(20) << __func__
<< ": " << **i
<< dendl
;
15037 // skip copy from temp object ops
15038 if ((*i
)->v
== eversion_t()) {
15039 dout(20) << __func__
<< ": " << **i
15040 << " version is empty" << dendl
;
15044 dout(20) << __func__
<< ": " << **i
15045 << " (*i)->v past v" << dendl
;
15048 if (!(*i
)->all_committed
) {
15049 dout(20) << __func__
<< ": " << **i
15050 << " not committed, returning false"
15055 dout(20) << __func__
<< ": returning true" << dendl
;
15060 // ==========================================================================================
15063 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op
)
15065 dout(15) << __func__
<< " is scrub active? " << m_scrubber
->is_scrub_active() << dendl
;
15066 op
->mark_started();
15068 if (!m_scrubber
->is_scrub_active()) {
15069 dout(10) << __func__
<< " scrub isn't active" << dendl
;
15072 m_scrubber
->map_from_replica(op
);
15075 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t
& begin
,
15076 const hobject_t
& end
)
15078 pair
<hobject_t
, ObjectContextRef
> next
;
15079 next
.second
= object_contexts
.lookup(begin
);
15080 next
.first
= begin
;
15082 while (more
&& next
.first
< end
) {
15083 if (next
.second
&& next
.second
->is_blocked()) {
15084 next
.second
->requeue_scrub_on_unblock
= true;
15085 dout(10) << __func__
<< ": scrub delayed, "
15086 << next
.first
<< " is blocked"
15090 more
= object_contexts
.get_next(next
.first
, &next
);
15096 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpContext
*ctx
)
15098 OpRequestRef op
= ctx
->op
;
15099 // Only supports replicated pools
15100 ceph_assert(!pool
.info
.is_erasure());
15101 ceph_assert(is_primary());
15103 dout(10) << __func__
<< " " << soid
15104 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl
;
15107 block_for_clean(soid
, op
);
15111 ceph_assert(!recovery_state
.get_pg_log().get_missing().is_missing(soid
));
15112 auto& oi
= ctx
->new_obs
.oi
;
15113 eversion_t v
= oi
.version
;
15115 if (primary_error(soid
, v
)) {
15116 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
15117 // XXX: If we knew that there is no down osd which could include this
15118 // object, it would be nice if we could return EIO here.
15119 // If a "never fail" flag was available, that could be used
15120 // for rbd to NOT return EIO until object marked lost.
15122 // Drop through to save this op in case an osd comes up with the object.
15125 // Restart the op after object becomes readable again
15126 waiting_for_unreadable_object
[soid
].push_back(op
);
15127 op
->mark_delayed("waiting for missing object");
15129 ceph_assert(is_clean());
15130 state_set(PG_STATE_REPAIR
);
15131 state_clear(PG_STATE_CLEAN
);
15132 queue_peering_event(
15134 std::make_shared
<PGPeeringEvent
>(
15135 get_osdmap_epoch(),
15136 get_osdmap_epoch(),
15137 PeeringState::DoRecovery())));
15142 /*---SnapTrimmer Logging---*/
15144 #define dout_prefix pg->gen_prefix(*_dout)
15146 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
15148 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
15151 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
15153 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
15156 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15159 !pg
->m_scrubber
->is_scrub_active() &&
15160 !pg
->snap_trimq
.empty();
15163 /*---SnapTrimmer states---*/
15165 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15166 << "SnapTrimmer state<" << get_state_name() << ">: ")
15169 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
15171 NamedState(nullptr, "NotTrimming")
15173 context
< SnapTrimmer
>().log_enter(state_name
);
15176 void PrimaryLogPG::NotTrimming::exit()
15178 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
15181 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
15183 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15184 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
15186 if (!(pg
->is_primary() && pg
->is_active())) {
15187 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
15188 return discard_event();
15190 if (!pg
->is_clean() ||
15191 pg
->snap_trimq
.empty()) {
15192 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
15193 return discard_event();
15195 if (pg
->m_scrubber
->is_scrub_active()) {
15196 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
15197 return transit
< WaitScrub
>();
15199 return transit
< Trimming
>();
15203 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
15205 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15206 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
15209 if (!context
< SnapTrimmer
>().can_trim()) {
15210 post_event(KickTrim());
15211 return transit
< NotTrimming
>();
15214 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
15215 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
15216 << pg
->snap_trimq
.range_start()
15218 return transit
< AwaitAsyncWork
>();
15221 /* AwaitAsyncWork */
15222 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
15224 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15226 auto *pg
= context
< SnapTrimmer
>().pg
;
15227 context
< SnapTrimmer
>().log_enter(state_name
);
15228 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
15229 pg
->state_set(PG_STATE_SNAPTRIM
);
15230 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
15231 pg
->publish_stats_to_osd();
15234 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
15236 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
15237 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
15238 auto &in_flight
= context
<Trimming
>().in_flight
;
15239 ceph_assert(in_flight
.empty());
15241 ceph_assert(pg
->is_primary() && pg
->is_active());
15242 if (!context
< SnapTrimmer
>().can_trim()) {
15243 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
15244 post_event(KickTrim());
15245 return transit
< NotTrimming
>();
15248 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
15250 vector
<hobject_t
> to_trim
;
15251 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
15252 to_trim
.reserve(max
);
15253 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
15257 if (r
!= 0 && r
!= -ENOENT
) {
15258 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
15259 << cpp_strerror(r
) << dendl
;
15260 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15261 } else if (r
== -ENOENT
) {
15263 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
15265 pg
->snap_trimq
.erase(snap_to_trim
);
15267 if (pg
->snap_trimq_repeat
.count(snap_to_trim
)) {
15268 ldout(pg
->cct
, 10) << " removing from snap_trimq_repeat" << dendl
;
15269 pg
->snap_trimq_repeat
.erase(snap_to_trim
);
15271 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
15272 << " to purged_snaps"
15274 ObjectStore::Transaction t
;
15275 pg
->recovery_state
.adjust_purged_snaps(
15276 [snap_to_trim
](auto &purged_snaps
) {
15277 purged_snaps
.insert(snap_to_trim
);
15279 pg
->write_if_dirty(t
);
15281 ldout(pg
->cct
, 10) << "purged_snaps now "
15282 << pg
->info
.purged_snaps
<< ", snap_trimq now "
15283 << pg
->snap_trimq
<< dendl
;
15285 int tr
= pg
->osd
->store
->queue_transaction(pg
->ch
, std::move(t
), NULL
);
15286 ceph_assert(tr
== 0);
15288 pg
->recovery_state
.share_pg_info();
15290 post_event(KickTrim());
15291 return transit
< NotTrimming
>();
15293 ceph_assert(!to_trim
.empty());
15295 for (auto &&object
: to_trim
) {
15297 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
15299 int error
= pg
->trim_object(in_flight
.empty(), object
, snap_to_trim
, &ctx
);
15301 if (error
== -ENOLCK
) {
15302 ldout(pg
->cct
, 10) << "could not get write lock on obj "
15303 << object
<< dendl
;
15305 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
15306 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
15308 if (!in_flight
.empty()) {
15309 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
15310 return transit
< WaitRepops
>();
15312 if (error
== -ENOLCK
) {
15313 ldout(pg
->cct
, 10) << "waiting for it to clear"
15315 return transit
< WaitRWLock
>();
15317 return transit
< NotTrimming
>();
15321 in_flight
.insert(object
);
15322 ctx
->register_on_success(
15323 [pg
, object
, &in_flight
]() {
15324 ceph_assert(in_flight
.find(object
) != in_flight
.end());
15325 in_flight
.erase(object
);
15326 if (in_flight
.empty()) {
15327 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
15328 pg
->snap_trimmer_machine
.process_event(Reset());
15330 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
15335 pg
->simple_opc_submit(std::move(ctx
));
15338 return transit
< WaitRepops
>();
15341 void PrimaryLogPG::setattr_maybe_cache(
15342 ObjectContextRef obc
,
15347 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
15350 void PrimaryLogPG::setattrs_maybe_cache(
15351 ObjectContextRef obc
,
15353 map
<string
, bufferlist
> &attrs
)
15355 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
15358 void PrimaryLogPG::rmattr_maybe_cache(
15359 ObjectContextRef obc
,
15363 t
->rmattr(obc
->obs
.oi
.soid
, key
);
15366 int PrimaryLogPG::getattr_maybe_cache(
15367 ObjectContextRef obc
,
15371 if (pool
.info
.is_erasure()) {
15372 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
15373 if (i
!= obc
->attr_cache
.end()) {
15381 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
15384 int PrimaryLogPG::getattrs_maybe_cache(
15385 ObjectContextRef obc
,
15386 map
<string
, bufferlist
> *out
)
15390 if (pool
.info
.is_erasure()) {
15391 *out
= obc
->attr_cache
;
15393 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
15395 map
<string
, bufferlist
> tmp
;
15396 for (map
<string
, bufferlist
>::iterator i
= out
->begin();
15399 if (i
->first
.size() > 1 && i
->first
[0] == '_')
15400 tmp
[i
->first
.substr(1, i
->first
.size())] = std::move(i
->second
);
15406 bool PrimaryLogPG::check_failsafe_full() {
15407 return osd
->check_failsafe_full(get_dpp());
15410 bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t
& oid
)
15412 return m_scrubber
->write_blocked_by_scrub(oid
);
15415 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
15416 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
// Debug-build intrusive-ptr helpers that track individual reference
// ids (PG_DEBUG_REFS builds only).
// NOTE(review): the closing #endif was missing from the mangled
// source and is restored here.
#ifdef PG_DEBUG_REFS
uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif
15423 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
15424 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }