1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
18 #include "boost/tuple/tuple.hpp"
19 #include "boost/intrusive_ptr.hpp"
21 #include "PrimaryLogPG.h"
23 #include "OpRequest.h"
24 #include "ScrubStore.h"
26 #include "objclass/objclass.h"
28 #include "common/ceph_crypto.h"
29 #include "common/errno.h"
30 #include "common/scrub_types.h"
31 #include "common/perf_counters.h"
33 #include "messages/MOSDOp.h"
34 #include "messages/MOSDBackoff.h"
35 #include "messages/MOSDPGTrim.h"
36 #include "messages/MOSDPGScan.h"
37 #include "messages/MOSDRepScrub.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MOSDPGUpdateLogMissing.h"
41 #include "messages/MOSDPGUpdateLogMissingReply.h"
42 #include "messages/MCommandReply.h"
43 #include "messages/MOSDScrubReserve.h"
44 #include "common/EventTrace.h"
46 #include "common/config.h"
47 #include "include/compat.h"
48 #include "mon/MonClient.h"
49 #include "osdc/Objecter.h"
50 #include "json_spirit/json_spirit_value.h"
51 #include "json_spirit/json_spirit_reader.h"
52 #include "include/ceph_assert.h" // json_spirit clobbers it
53 #include "include/rados/rados_types.hpp"
56 #include "tracing/osd.h"
58 #define tracepoint(...)
61 #define dout_context cct
62 #define dout_subsys ceph_subsys_osd
63 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
65 #define dout_prefix _prefix(_dout, this)
66 using TOPNSPC::common::cmd_getval
;
69 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
70 return pg
->gen_prefix(*_dout
);
79 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
81 using namespace ceph::osd::scheduler
;
84 * The CopyCallback class defines an interface for completions to the
85 * copy_start code. Users of the copy infrastructure must implement
86 * one and give an instance of the class to start_copy.
88 * The implementer is responsible for making sure that the CopyCallback
89 * can associate itself with the correct copy operation.
91 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
95 * results.get<0>() is the return code: 0 for success; -ECANCELED if
96 * the operation was cancelled by the local OSD; -errno for other issues.
97 * results.get<1>() is a pointer to a CopyResults object, which you are
98 * responsible for deleting.
100 void finish(CopyCallbackResults results_
) override
= 0;
103 /// Provide the final size of the copied object to the CopyCallback
104 ~CopyCallback() override
{}
107 template <typename T
>
108 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
110 unique_ptr
<GenContext
<T
>> c
;
113 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
114 : pg(pg
), c(c
), e(e
) {}
115 void finish(T t
) override
{
116 std::scoped_lock locker
{*pg
};
117 if (pg
->pg_has_reset_since(e
))
120 c
.release()->complete(t
);
122 bool sync_finish(T t
) {
123 // we assume here all blessed/wrapped Contexts can complete synchronously.
124 c
.release()->complete(t
);
129 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
130 GenContext
<ThreadPool::TPHandle
&> *c
) {
131 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
132 this, c
, get_osdmap_epoch());
135 template <typename T
>
136 class PrimaryLogPG::UnlockedBlessedGenContext
: public GenContext
<T
> {
138 unique_ptr
<GenContext
<T
>> c
;
141 UnlockedBlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
142 : pg(pg
), c(c
), e(e
) {}
143 void finish(T t
) override
{
144 if (pg
->pg_has_reset_since(e
))
147 c
.release()->complete(t
);
149 bool sync_finish(T t
) {
150 // we assume here all blessed/wrapped Contexts can complete synchronously.
151 c
.release()->complete(t
);
156 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_unlocked_gencontext(
157 GenContext
<ThreadPool::TPHandle
&> *c
) {
158 return new UnlockedBlessedGenContext
<ThreadPool::TPHandle
&>(
159 this, c
, get_osdmap_epoch());
162 class PrimaryLogPG::BlessedContext
: public Context
{
164 unique_ptr
<Context
> c
;
167 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
168 : pg(pg
), c(c
), e(e
) {}
169 void finish(int r
) override
{
170 std::scoped_lock locker
{*pg
};
171 if (pg
->pg_has_reset_since(e
))
174 c
.release()->complete(r
);
176 bool sync_finish(int r
) {
177 // we assume here all blessed/wrapped Contexts can complete synchronously.
178 c
.release()->complete(r
);
183 Context
*PrimaryLogPG::bless_context(Context
*c
) {
184 return new BlessedContext(this, c
, get_osdmap_epoch());
187 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
191 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
193 void finish(int r
) override
{
194 pg
->object_context_destructor_callback(obc
);
198 struct OnReadComplete
: public Context
{
200 PrimaryLogPG::OpContext
*opcontext
;
203 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
204 void finish(int r
) override
{
205 opcontext
->finish_read(pg
);
207 ~OnReadComplete() override
{}
210 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
212 ObjectContextRef obc
;
214 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
216 bool sync_finish(int r
) override
{
217 pg
->_applied_recovered_object(obc
);
220 void finish(int r
) override
{
221 std::scoped_lock locker
{*pg
};
222 pg
->_applied_recovered_object(obc
);
226 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
229 eversion_t last_complete
;
231 C_OSD_CommittedPushedObject(
232 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
233 pg(p
), epoch(epoch
), last_complete(lc
) {
235 void finish(int r
) override
{
236 pg
->_committed_pushed_object(epoch
, last_complete
);
240 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
243 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
245 bool sync_finish(int r
) override
{
246 pg
->_applied_recovered_object_replica();
249 void finish(int r
) override
{
250 std::scoped_lock locker
{*pg
};
251 pg
->_applied_recovered_object_replica();
256 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
259 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
260 pair
<bufferlist
*, Context
*> > > in
;
261 in
.swap(pending_async_reads
);
262 pg
->pgbackend
->objects_read_async(
265 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
267 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
269 ceph_assert(inflightreads
> 0);
271 if (async_reads_complete()) {
272 ceph_assert(pg
->in_progress_async_reads
.size());
273 ceph_assert(pg
->in_progress_async_reads
.front().second
== this);
274 pg
->in_progress_async_reads
.pop_front();
276 // Restart the op context now that all reads have been
277 // completed. Read failures will be handled by the op finisher
278 pg
->execute_ctx(this);
282 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
284 PrimaryLogPG::CopyResults
*results
= nullptr;
285 PrimaryLogPG::OpContext
*ctx
;
287 uint32_t truncate_seq
;
288 uint64_t truncate_size
;
289 bool have_truncate
= false;
291 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
292 : ctx(ctx
), osd_op(osd_op
) {
294 ~CopyFromCallback() override
{}
296 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
297 results
= results_
.get
<1>();
298 int r
= results_
.get
<0>();
300 // Only use truncate_{seq,size} from the original object if the client
301 // did not sent us these parameters
302 if (!have_truncate
) {
303 truncate_seq
= results
->truncate_seq
;
304 truncate_size
= results
->truncate_size
;
307 // for finish_copyfrom
308 ctx
->user_at_version
= results
->user_version
;
311 ctx
->pg
->execute_ctx(ctx
);
313 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
315 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
316 } else if (results
->should_requeue
) {
318 ctx
->pg
->requeue_op(ctx
->op
);
320 ctx
->pg
->close_op_ctx(ctx
);
324 bool is_temp_obj_used() {
325 return results
->started_temp_obj
;
327 uint64_t get_data_size() {
328 return results
->object_size
;
330 void set_truncate(uint32_t seq
, uint64_t size
) {
332 truncate_size
= size
;
333 have_truncate
= true;
337 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
338 CopyFromCallback
*copy_from_callback
;
340 explicit CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
341 : copy_from_callback(copy_from_callback
) {
344 int execute() override
{
345 // instance will be destructed after this method completes
346 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
351 // ======================
352 // PGBackend::Listener
354 void PrimaryLogPG::on_local_recover(
355 const hobject_t
&hoid
,
356 const ObjectRecoveryInfo
&_recovery_info
,
357 ObjectContextRef obc
,
359 ObjectStore::Transaction
*t
362 dout(10) << __func__
<< ": " << hoid
<< dendl
;
364 ObjectRecoveryInfo
recovery_info(_recovery_info
);
365 clear_object_snap_mapping(t
, hoid
);
366 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
367 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
369 dout(20) << " snapset " << recovery_info
.ss
<< dendl
;
370 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
371 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
372 snaps
.insert(p
->second
.begin(), p
->second
.end());
373 dout(20) << " snaps " << snaps
<< dendl
;
379 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
382 if (!is_delete
&& recovery_state
.get_pg_log().get_missing().is_missing(recovery_info
.soid
) &&
383 recovery_state
.get_pg_log().get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
384 ceph_assert(is_primary());
385 const pg_log_entry_t
*latest
= recovery_state
.get_pg_log().get_log().objects
.find(recovery_info
.soid
)->second
;
386 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
387 latest
->reverting_to
== recovery_info
.version
) {
388 dout(10) << " got old revert version " << recovery_info
.version
389 << " for " << *latest
<< dendl
;
390 recovery_info
.version
= latest
->version
;
391 // update the attr to the revert event version
392 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
393 recovery_info
.oi
.version
= latest
->version
;
395 encode(recovery_info
.oi
, bl
,
396 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
397 ceph_assert(!pool
.info
.is_erasure());
398 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
400 obc
->attr_cache
[OI_ATTR
] = bl
;
404 // keep track of active pushes for scrub
407 recovery_state
.recover_got(
409 recovery_info
.version
,
415 obc
->obs
.exists
= true;
417 bool got
= obc
->get_recovery_read();
420 ceph_assert(recovering
.count(obc
->obs
.oi
.soid
));
421 recovering
[obc
->obs
.oi
.soid
] = obc
;
422 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
425 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
427 publish_stats_to_osd();
428 release_backoffs(hoid
);
429 if (!is_unreadable_object(hoid
)) {
430 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
431 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
432 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
433 requeue_ops(unreadable_object_entry
->second
);
434 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
438 t
->register_on_applied(
439 new C_OSD_AppliedRecoveredObjectReplica(this));
443 t
->register_on_commit(
444 new C_OSD_CommittedPushedObject(
447 info
.last_complete
));
450 void PrimaryLogPG::on_global_recover(
451 const hobject_t
&soid
,
452 const object_stat_sum_t
&stat_diff
,
455 recovery_state
.object_recovered(soid
, stat_diff
);
456 publish_stats_to_osd();
457 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
458 map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.find(soid
);
459 ceph_assert(i
!= recovering
.end());
461 if (i
->second
&& i
->second
->rwstate
.recovery_read_marker
) {
462 // recover missing won't have had an obc, but it gets filled in
463 // during on_local_recover
464 ceph_assert(i
->second
);
465 list
<OpRequestRef
> requeue_list
;
466 i
->second
->drop_recovery_read(&requeue_list
);
467 requeue_ops(requeue_list
);
470 backfills_in_flight
.erase(soid
);
473 finish_recovery_op(soid
);
474 release_backoffs(soid
);
475 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
476 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
477 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
478 requeue_ops(degraded_object_entry
->second
);
479 waiting_for_degraded_object
.erase(degraded_object_entry
);
481 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
482 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
483 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
484 requeue_ops(unreadable_object_entry
->second
);
485 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
487 finish_degraded_object(soid
);
490 void PrimaryLogPG::schedule_recovery_work(
491 GenContext
<ThreadPool::TPHandle
&> *c
)
493 osd
->queue_recovery_context(this, c
);
496 void PrimaryLogPG::replica_clear_repop_obc(
497 const vector
<pg_log_entry_t
> &logv
,
498 ObjectStore::Transaction
&t
)
500 for (auto &&e
: logv
) {
501 /* Have to blast all clones, they share a snapset */
502 object_contexts
.clear_range(
503 e
.soid
.get_object_boundary(), e
.soid
.get_head());
505 snapset_contexts
.find(e
.soid
.get_head()) ==
506 snapset_contexts
.end());
510 bool PrimaryLogPG::should_send_op(
512 const hobject_t
&hoid
) {
513 if (peer
== get_primary())
515 ceph_assert(recovery_state
.has_peer_info(peer
));
517 hoid
.pool
!= (int64_t)info
.pgid
.pool() ||
518 hoid
<= last_backfill_started
||
519 hoid
<= recovery_state
.get_peer_info(peer
).last_backfill
;
521 ceph_assert(is_backfill_target(peer
));
522 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
523 << ", object " << hoid
524 << " beyond std::max(last_backfill_started "
525 << ", peer_info[peer].last_backfill "
526 << recovery_state
.get_peer_info(peer
).last_backfill
530 if (is_async_recovery_target(peer
) &&
531 recovery_state
.get_peer_missing(peer
).is_missing(hoid
)) {
533 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
534 << ", object " << hoid
535 << " which is pending recovery in async_recovery_targets" << dendl
;
541 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
542 int peer
, epoch_t from_epoch
)
544 return osd
->get_con_osd_cluster(peer
, from_epoch
);
547 PerfCounters
*PrimaryLogPG::get_logger()
553 // ====================
556 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
558 return recovery_state
.get_pg_log().get_missing().get_items().count(soid
);
561 void PrimaryLogPG::maybe_kick_recovery(
562 const hobject_t
&soid
)
565 bool work_started
= false;
566 if (!recovery_state
.get_missing_loc().needs_recovery(soid
, &v
))
569 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
570 if (p
!= recovering
.end()) {
571 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
572 } else if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
573 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
575 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
576 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
577 if (is_missing_object(soid
)) {
578 recover_missing(soid
, v
, CEPH_MSG_PRIO_HIGH
, h
);
579 } else if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
580 prep_object_replica_deletes(soid
, v
, h
, &work_started
);
582 prep_object_replica_pushes(soid
, v
, h
, &work_started
);
584 pgbackend
->run_recovery_op(h
, CEPH_MSG_PRIO_HIGH
);
588 void PrimaryLogPG::wait_for_unreadable_object(
589 const hobject_t
& soid
, OpRequestRef op
)
591 ceph_assert(is_unreadable_object(soid
));
592 maybe_kick_recovery(soid
);
593 waiting_for_unreadable_object
[soid
].push_back(op
);
594 op
->mark_delayed("waiting for missing object");
597 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
599 /* The conditions below may clear (on_local_recover, before we queue
600 * the transaction) before we actually requeue the degraded waiters
601 * in on_global_recover after the transaction completes.
603 if (waiting_for_degraded_object
.count(soid
))
605 if (recovery_state
.get_pg_log().get_missing().get_items().count(soid
))
607 ceph_assert(!get_acting_recovery_backfill().empty());
608 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
609 i
!= get_acting_recovery_backfill().end();
611 if (*i
== get_primary()) continue;
612 pg_shard_t peer
= *i
;
613 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(peer
);
614 // If an object is missing on an async_recovery_target, return false.
615 // This will not block the op and the object is async recovered later.
616 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
617 peer_missing_entry
->second
.get_items().count(soid
)) {
618 if (is_async_recovery_target(peer
))
623 // Object is degraded if after last_backfill AND
624 // we are backfilling it
625 if (is_backfill_target(peer
) &&
626 recovery_state
.get_peer_info(peer
).last_backfill
<= soid
&&
627 last_backfill_started
>= soid
&&
628 backfills_in_flight
.count(soid
))
634 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t
& soid
)
636 for (auto &i
: get_async_recovery_targets()) {
637 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(i
);
638 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
639 peer_missing_entry
->second
.get_items().count(soid
)) {
640 dout(30) << __func__
<< " " << soid
<< dendl
;
647 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
649 ceph_assert(is_degraded_or_backfilling_object(soid
) || is_degraded_on_async_recovery_target(soid
));
651 maybe_kick_recovery(soid
);
652 waiting_for_degraded_object
[soid
].push_back(op
);
653 op
->mark_delayed("waiting for degraded object");
656 void PrimaryLogPG::block_write_on_full_cache(
657 const hobject_t
& _oid
, OpRequestRef op
)
659 const hobject_t oid
= _oid
.get_head();
660 dout(20) << __func__
<< ": blocking object " << oid
661 << " on full cache" << dendl
;
662 objects_blocked_on_cache_full
.insert(oid
);
663 waiting_for_cache_not_full
.push_back(op
);
664 op
->mark_delayed("waiting for cache not full");
667 void PrimaryLogPG::block_for_clean(
668 const hobject_t
& oid
, OpRequestRef op
)
670 dout(20) << __func__
<< ": blocking object " << oid
671 << " on primary repair" << dendl
;
672 waiting_for_clean_to_primary_repair
.push_back(op
);
673 op
->mark_delayed("waiting for clean to repair");
676 void PrimaryLogPG::block_write_on_snap_rollback(
677 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
679 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
680 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
681 // otherwise, we'd have blocked in do_op
682 ceph_assert(oid
.is_head());
683 ceph_assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
684 objects_blocked_on_snap_promotion
[oid
] = obc
;
685 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
688 void PrimaryLogPG::block_write_on_degraded_snap(
689 const hobject_t
& snap
, OpRequestRef op
)
691 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
692 << " on degraded snap " << snap
<< dendl
;
693 // otherwise, we'd have blocked in do_op
694 ceph_assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
695 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
696 wait_for_degraded_object(snap
, op
);
699 bool PrimaryLogPG::maybe_await_blocked_head(
700 const hobject_t
&hoid
,
703 ObjectContextRef obc
;
704 obc
= object_contexts
.lookup(hoid
.get_head());
706 if (obc
->is_blocked()) {
707 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
716 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
718 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
719 waiting_for_blocked_object
[soid
].push_back(op
);
720 op
->mark_delayed("waiting for blocked object");
723 void PrimaryLogPG::maybe_force_recovery()
725 // no force if not in degraded/recovery/backfill states
726 if (!is_degraded() &&
727 !state_test(PG_STATE_RECOVERING
|
728 PG_STATE_RECOVERY_WAIT
|
729 PG_STATE_BACKFILLING
|
730 PG_STATE_BACKFILL_WAIT
|
731 PG_STATE_BACKFILL_TOOFULL
))
734 if (recovery_state
.get_pg_log().get_log().approx_size() <
735 cct
->_conf
->osd_max_pg_log_entries
*
736 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
739 // find the oldest missing object
740 version_t min_version
= recovery_state
.get_pg_log().get_log().head
.version
;
742 if (!recovery_state
.get_pg_log().get_missing().get_rmissing().empty()) {
743 min_version
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->first
;
744 soid
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->second
;
746 ceph_assert(!get_acting_recovery_backfill().empty());
747 for (set
<pg_shard_t
>::iterator it
= get_acting_recovery_backfill().begin();
748 it
!= get_acting_recovery_backfill().end();
750 if (*it
== get_primary()) continue;
751 pg_shard_t peer
= *it
;
752 auto it_missing
= recovery_state
.get_peer_missing().find(peer
);
753 if (it_missing
!= recovery_state
.get_peer_missing().end() &&
754 !it_missing
->second
.get_rmissing().empty()) {
755 const auto& min_obj
= recovery_state
.get_peer_missing(peer
).get_rmissing().begin();
756 dout(20) << __func__
<< " peer " << peer
<< " min_version " << min_obj
->first
757 << " oid " << min_obj
->second
<< dendl
;
758 if (min_version
> min_obj
->first
) {
759 min_version
= min_obj
->first
;
760 soid
= min_obj
->second
;
766 if (soid
!= hobject_t())
767 maybe_kick_recovery(soid
);
770 bool PrimaryLogPG::check_laggy(OpRequestRef
& op
)
772 if (!HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
774 dout(20) << __func__
<< " not all upacting has SERVER_OCTOPUS" << dendl
;
777 if (state_test(PG_STATE_WAIT
)) {
778 dout(10) << __func__
<< " PG is WAIT state" << dendl
;
779 } else if (!state_test(PG_STATE_LAGGY
)) {
780 auto mnow
= osd
->get_mnow();
781 auto ru
= recovery_state
.get_readable_until();
788 << " > readable_until " << ru
<< dendl
;
791 osd
->reply_op_error(op
, -EAGAIN
);
796 state_set(PG_STATE_LAGGY
);
797 publish_stats_to_osd();
799 dout(10) << __func__
<< " not readable" << dendl
;
800 waiting_for_readable
.push_back(op
);
801 op
->mark_delayed("waiting for readable");
805 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef
& op
)
807 if (!HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
811 if (!state_test(PG_STATE_WAIT
) && !state_test(PG_STATE_LAGGY
)) {
812 return true; // not laggy
814 dout(10) << __func__
<< " not readable" << dendl
;
815 waiting_for_readable
.push_front(op
);
816 op
->mark_delayed("waiting for readable");
820 void PrimaryLogPG::recheck_readable()
822 if (!is_wait() && !is_laggy()) {
823 dout(20) << __func__
<< " wasn't wait or laggy" << dendl
;
826 auto mnow
= osd
->get_mnow();
829 auto prior_readable_until_ub
= recovery_state
.get_prior_readable_until_ub();
830 if (mnow
< prior_readable_until_ub
) {
831 dout(10) << __func__
<< " still wait (mnow " << mnow
832 << " < prior_readable_until_ub " << prior_readable_until_ub
835 dout(10) << __func__
<< " no longer wait (mnow " << mnow
836 << " >= prior_readable_until_ub " << prior_readable_until_ub
838 state_clear(PG_STATE_WAIT
);
839 recovery_state
.clear_prior_readable_until_ub();
844 auto ru
= recovery_state
.get_readable_until();
845 if (ru
== ceph::signedspan::zero()) {
846 dout(10) << __func__
<< " still laggy (mnow " << mnow
847 << ", readable_until zero)" << dendl
;
848 } else if (mnow
>= ru
) {
849 dout(10) << __func__
<< " still laggy (mnow " << mnow
850 << " >= readable_until " << ru
<< ")" << dendl
;
852 dout(10) << __func__
<< " no longer laggy (mnow " << mnow
853 << " < readable_until " << ru
<< ")" << dendl
;
854 state_clear(PG_STATE_LAGGY
);
859 publish_stats_to_osd();
861 if (!is_laggy() && !is_wait()) {
862 requeue_ops(waiting_for_readable
);
866 bool PrimaryLogPG::pgls_filter(const PGLSFilter
& filter
, const hobject_t
& sobj
)
870 // If filter has expressed an interest in an xattr, load it.
871 if (!filter
.get_xattr().empty()) {
872 int ret
= pgbackend
->objects_get_attr(
876 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
.get_xattr() << ") returned " << ret
<< dendl
;
878 if (ret
!= -ENODATA
|| filter
.reject_empty_xattr()) {
884 return filter
.filter(sobj
, bl
);
887 std::pair
<int, std::unique_ptr
<const PGLSFilter
>>
888 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator
& iter
)
891 // storing non-const PGLSFilter for the sake of ::init()
892 std::unique_ptr
<PGLSFilter
> filter
;
897 catch (buffer::error
& e
) {
898 return { -EINVAL
, nullptr };
901 if (type
.compare("plain") == 0) {
902 filter
= std::make_unique
<PGLSPlainFilter
>();
904 std::size_t dot
= type
.find(".");
905 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
906 return { -EINVAL
, nullptr };
909 const std::string class_name
= type
.substr(0, dot
);
910 const std::string filter_name
= type
.substr(dot
+ 1);
911 ClassHandler::ClassData
*cls
= NULL
;
912 int r
= ClassHandler::get_instance().open_class(class_name
, &cls
);
914 derr
<< "Error opening class '" << class_name
<< "': "
915 << cpp_strerror(r
) << dendl
;
916 if (r
!= -EPERM
) // propogate permission error
918 return { r
, nullptr };
923 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
924 if (class_filter
== NULL
) {
925 derr
<< "Error finding filter '" << filter_name
<< "' in class "
926 << class_name
<< dendl
;
927 return { -EINVAL
, nullptr };
929 filter
.reset(class_filter
->fn());
931 // Object classes are obliged to return us something, but let's
932 // give an error rather than asserting out.
933 derr
<< "Buggy class " << class_name
<< " failed to construct "
934 "filter " << filter_name
<< dendl
;
935 return { -EINVAL
, nullptr };
940 int r
= filter
->init(iter
);
942 derr
<< "Error initializing filter " << type
<< ": "
943 << cpp_strerror(r
) << dendl
;
944 return { -EINVAL
, nullptr };
946 // Successfully constructed and initialized, return it.
947 return std::make_pair(0, std::move(filter
));
952 // ==========================================================
954 void PrimaryLogPG::do_command(
955 const string_view
& orig_prefix
,
956 const cmdmap_t
& cmdmap
,
957 const bufferlist
& idata
,
958 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
961 cmd_getval(cmdmap
, "format", format
);
962 std::unique_ptr
<Formatter
> f(Formatter::create(
963 format
, "json-pretty", "json-pretty"));
965 stringstream ss
; // stderr error message stream
966 bufferlist outbl
; // if empty at end, we'll dump formatter as output
969 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
970 // - ceph tell <pgid> foo -> prefix=foo
971 string
prefix(orig_prefix
);
973 cmd_getval(cmdmap
, "cmd", command
);
974 if (command
.size()) {
978 if (prefix
== "query") {
979 f
->open_object_section("pg");
980 f
->dump_stream("snap_trimq") << snap_trimq
;
981 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
982 recovery_state
.dump_peering_state(f
.get());
985 f
->open_array_section("recovery_state");
986 handle_query_state(f
.get());
989 f
->open_object_section("agent_state");
991 agent_state
->dump(f
.get());
997 else if (prefix
== "mark_unfound_lost") {
999 cmd_getval(cmdmap
, "mulcmd", mulcmd
);
1001 if (mulcmd
== "revert") {
1002 if (pool
.info
.is_erasure()) {
1003 ss
<< "mode must be 'delete' for ec pool";
1007 mode
= pg_log_entry_t::LOST_REVERT
;
1008 } else if (mulcmd
== "delete") {
1009 mode
= pg_log_entry_t::LOST_DELETE
;
1011 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1015 ceph_assert(mode
== pg_log_entry_t::LOST_REVERT
||
1016 mode
== pg_log_entry_t::LOST_DELETE
);
1018 if (!is_primary()) {
1019 ss
<< "not primary";
1024 uint64_t unfound
= recovery_state
.get_missing_loc().num_unfound();
1026 ss
<< "pg has no unfound objects";
1027 goto out
; // make command idempotent
1030 if (!recovery_state
.all_unfound_are_queried_or_lost(get_osdmap())) {
1031 ss
<< "pg has " << unfound
1032 << " unfound objects but we haven't probed all sources, not marking lost";
1037 mark_all_unfound_lost(mode
, on_finish
);
1041 else if (prefix
== "list_unfound") {
1044 bool show_offset
= false;
1045 if (cmd_getval(cmdmap
, "offset", offset_json
)) {
1046 json_spirit::Value v
;
1048 if (!json_spirit::read(offset_json
, v
))
1049 throw std::runtime_error("bad json");
1051 } catch (std::runtime_error
& e
) {
1052 ss
<< "error parsing offset: " << e
.what();
1058 f
->open_object_section("missing");
1060 f
->open_object_section("offset");
1061 offset
.dump(f
.get());
1064 auto &needs_recovery_map
= recovery_state
.get_missing_loc()
1065 .get_needs_recovery();
1066 f
->dump_int("num_missing", needs_recovery_map
.size());
1067 f
->dump_int("num_unfound", get_num_unfound());
1068 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1069 needs_recovery_map
.upper_bound(offset
);
1071 f
->open_array_section("objects");
1073 for (; p
!= needs_recovery_map
.end() &&
1074 num
< cct
->_conf
->osd_command_max_records
;
1076 if (recovery_state
.get_missing_loc().is_unfound(p
->first
)) {
1077 f
->open_object_section("object");
1079 f
->open_object_section("oid");
1080 p
->first
.dump(f
.get());
1083 p
->second
.dump(f
.get()); // have, need keys
1085 f
->open_array_section("locations");
1086 for (auto &&r
: recovery_state
.get_missing_loc().get_locations(
1088 f
->dump_stream("shard") << r
;
1098 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1102 else if (prefix
== "scrub" ||
1103 prefix
== "deep_scrub") {
1104 bool deep
= (prefix
== "deep_scrub");
1106 cmd_getval(cmdmap
, "time", time
, (int64_t)0);
1109 const pg_pool_t
*p
= &pool
.info
;
1110 double pool_scrub_max_interval
= 0;
1111 double scrub_max_interval
;
1113 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
1114 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1115 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
1117 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
1118 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1119 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
1121 // Instead of marking must_scrub force a schedule scrub
1122 utime_t stamp
= ceph_clock_now();
1124 stamp
-= scrub_max_interval
;
1126 stamp
-= (float)time
;
1127 stamp
-= 100.0; // push back last scrub more for good measure
1129 set_last_deep_scrub_stamp(stamp
);
1131 set_last_scrub_stamp(stamp
);
1133 f
->open_object_section("result");
1134 f
->dump_bool("deep", deep
);
1135 f
->dump_stream("stamp") << stamp
;
1138 ss
<< "Not primary";
1141 outbl
.append(ss
.str());
1146 ss
<< "prefix '" << prefix
<< "' not implemented";
1150 if (ret
>= 0 && outbl
.length() == 0) {
1153 on_finish(ret
, ss
.str(), outbl
);
1157 // ==========================================================
1159 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1161 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1162 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1163 dout(10) << "do_pg_op " << *m
<< dendl
;
1168 string cname
, mname
;
1170 snapid_t snapid
= m
->get_snapid();
1172 vector
<OSDOp
> ops
= m
->ops
;
1174 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1175 std::unique_ptr
<const PGLSFilter
> filter
;
1177 auto bp
= p
->indata
.cbegin();
1179 case CEPH_OSD_OP_PGNLS_FILTER
:
1184 catch (const buffer::error
& e
) {
1185 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1189 std::tie(result
, filter
) = get_pgls_filter(bp
);
1193 ceph_assert(filter
);
1197 case CEPH_OSD_OP_PGNLS
:
1198 if (snapid
!= CEPH_NOSNAP
) {
1202 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1203 dout(10) << " pgnls pg=" << m
->get_pg()
1204 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1205 << " != " << info
.pgid
<< dendl
;
1208 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1211 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
1213 // read into a buffer
1214 vector
<hobject_t
> sentries
;
1215 pg_nls_response_t response
;
1217 decode(response
.handle
, bp
);
1219 catch (const buffer::error
& e
) {
1220 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1226 hobject_t lower_bound
= response
.handle
;
1227 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1228 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1229 dout(10) << " pgnls lower_bound " << lower_bound
1230 << " pg_end " << pg_end
<< dendl
;
1231 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1232 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1233 // this should only happen with a buggy client.
1234 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1240 hobject_t current
= lower_bound
;
1241 int r
= pgbackend
->objects_list_partial(
1252 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1253 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1254 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1255 hobject_t _max
= hobject_t::get_max();
1257 const hobject_t
&mcand
=
1258 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1260 missing_iter
->first
;
1261 const hobject_t
&lcand
=
1262 ls_iter
== sentries
.end() ?
1266 hobject_t candidate
;
1267 if (mcand
== lcand
) {
1269 if (!mcand
.is_max()) {
1273 } else if (mcand
< lcand
) {
1275 ceph_assert(!mcand
.is_max());
1279 ceph_assert(!lcand
.is_max());
1283 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1284 << " vs lower bound 0x" << lower_bound
.get_hash()
1285 << std::dec
<< dendl
;
1287 if (candidate
>= next
) {
1291 if (response
.entries
.size() == list_size
) {
1296 if (candidate
.snap
!= CEPH_NOSNAP
)
1299 // skip internal namespace
1300 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1303 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1306 // skip wrong namespace
1307 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1308 candidate
.get_namespace() != m
->get_hobj().nspace
)
1311 if (filter
&& !pgls_filter(*filter
, candidate
))
1314 dout(20) << "pgnls item 0x" << std::hex
1315 << candidate
.get_hash()
1316 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1318 << candidate
.oid
.name
<< dendl
;
1320 librados::ListObjectImpl item
;
1321 item
.nspace
= candidate
.get_namespace();
1322 item
.oid
= candidate
.oid
.name
;
1323 item
.locator
= candidate
.get_key();
1324 response
.entries
.push_back(item
);
1327 if (next
.is_max() &&
1328 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1329 ls_iter
== sentries
.end()) {
1332 // Set response.handle to the start of the next PG according
1333 // to the object sort order.
1334 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1336 response
.handle
= next
;
1338 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1339 encode(response
, osd_op
.outdata
);
1340 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1341 << osd_op
.outdata
.length() << dendl
;
1345 case CEPH_OSD_OP_PGLS_FILTER
:
1350 catch (const buffer::error
& e
) {
1351 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1355 std::tie(result
, filter
) = get_pgls_filter(bp
);
1359 ceph_assert(filter
);
1363 case CEPH_OSD_OP_PGLS
:
1364 if (snapid
!= CEPH_NOSNAP
) {
1368 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1369 dout(10) << " pgls pg=" << m
->get_pg()
1370 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1371 << " != " << info
.pgid
<< dendl
;
1374 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1377 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1378 // read into a buffer
1379 vector
<hobject_t
> sentries
;
1380 pg_ls_response_t response
;
1382 decode(response
.handle
, bp
);
1384 catch (const buffer::error
& e
) {
1385 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1391 hobject_t current
= response
.handle
;
1392 int r
= pgbackend
->objects_list_partial(
1403 ceph_assert(snapid
== CEPH_NOSNAP
|| recovery_state
.get_pg_log().get_missing().get_items().empty());
1405 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1406 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1407 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1408 hobject_t _max
= hobject_t::get_max();
1410 const hobject_t
&mcand
=
1411 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1413 missing_iter
->first
;
1414 const hobject_t
&lcand
=
1415 ls_iter
== sentries
.end() ?
1419 hobject_t candidate
;
1420 if (mcand
== lcand
) {
1422 if (!mcand
.is_max()) {
1426 } else if (mcand
< lcand
) {
1428 ceph_assert(!mcand
.is_max());
1432 ceph_assert(!lcand
.is_max());
1436 if (candidate
>= next
) {
1440 if (response
.entries
.size() == list_size
) {
1445 if (candidate
.snap
!= CEPH_NOSNAP
)
1448 // skip wrong namespace
1449 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1452 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1455 if (filter
&& !pgls_filter(*filter
, candidate
))
1458 response
.entries
.push_back(make_pair(candidate
.oid
,
1459 candidate
.get_key()));
1461 if (next
.is_max() &&
1462 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1463 ls_iter
== sentries
.end()) {
1466 response
.handle
= next
;
1467 encode(response
, osd_op
.outdata
);
1468 dout(10) << " pgls result=" << result
<< " outdata.length()="
1469 << osd_op
.outdata
.length() << dendl
;
1473 case CEPH_OSD_OP_PG_HITSET_LS
:
1475 list
< pair
<utime_t
,utime_t
> > ls
;
1476 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1477 p
!= info
.hit_set
.history
.end();
1479 ls
.push_back(make_pair(p
->begin
, p
->end
));
1481 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1482 encode(ls
, osd_op
.outdata
);
1486 case CEPH_OSD_OP_PG_HITSET_GET
:
1488 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1489 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1490 // read the current in-memory HitSet, not the version we've
1496 encode(*hit_set
, osd_op
.outdata
);
1497 result
= osd_op
.outdata
.length();
1499 // read an archived HitSet.
1501 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1502 p
!= info
.hit_set
.history
.end();
1504 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1505 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1509 if (oid
== hobject_t()) {
1513 if (!pool
.info
.is_replicated()) {
1514 // FIXME: EC not supported yet
1515 result
= -EOPNOTSUPP
;
1518 if (is_unreadable_object(oid
)) {
1519 wait_for_unreadable_object(oid
, op
);
1522 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1527 case CEPH_OSD_OP_SCRUBLS
:
1528 result
= do_scrub_ls(m
, &osd_op
);
1541 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(),
1542 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1544 reply
->claim_op_out_data(ops
);
1545 reply
->set_result(result
);
1546 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1547 osd
->send_message_osd_client(reply
, m
->get_connection());
1550 int PrimaryLogPG::do_scrub_ls(const MOSDOp
*m
, OSDOp
*osd_op
)
1552 if (m
->get_pg() != info
.pgid
.pgid
) {
1553 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1554 return -EINVAL
; // hmm?
1556 auto bp
= osd_op
->indata
.cbegin();
1560 } catch (buffer::error
&) {
1561 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1565 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1566 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1568 } else if (!scrubber
.store
) {
1570 } else if (arg
.get_snapsets
) {
1571 result
.vals
= scrubber
.store
->get_snap_errors(osd
->store
,
1576 result
.vals
= scrubber
.store
->get_object_errors(osd
->store
,
1581 encode(result
, osd_op
->outdata
);
1585 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1586 const PGPool
&_pool
,
1587 const map
<string
,string
>& ec_profile
, spg_t p
) :
1588 PG(o
, curmap
, _pool
, p
),
1590 PGBackend::build_pg_backend(
1591 _pool
.info
, ec_profile
, this, coll_t(p
), ch
, o
->store
, cct
)),
1592 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1593 new_backfill(false),
1595 snap_trimmer_machine(this)
1597 recovery_state
.set_backend_predicates(
1598 pgbackend
->get_is_readable_predicate(),
1599 pgbackend
->get_is_recoverable_predicate());
1600 snap_trimmer_machine
.initiate();
1603 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1606 if (oloc
.key
.empty())
1607 src_oloc
.key
= oid
.name
;
1610 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1612 auto m
= op
->get_req
<MOSDBackoff
>();
1613 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1616 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1617 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1618 if (begin
< m
->begin
) {
1624 dout(10) << __func__
<< " backoff ack id " << m
->id
1625 << " [" << begin
<< "," << end
<< ")" << dendl
;
1626 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1629 void PrimaryLogPG::do_request(
1631 ThreadPool::TPHandle
&handle
)
1633 if (op
->osd_trace
) {
1634 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1635 op
->pg_trace
.event("do request");
1637 // make sure we have a new enough map
1638 auto p
= waiting_for_map
.find(op
->get_source());
1639 if (p
!= waiting_for_map
.end()) {
1640 // preserve ordering
1641 dout(20) << __func__
<< " waiting_for_map "
1642 << p
->first
<< " not empty, queueing" << dendl
;
1643 p
->second
.push_back(op
);
1644 op
->mark_delayed("waiting_for_map not empty");
1647 if (!have_same_or_newer_map(op
->min_epoch
)) {
1648 dout(20) << __func__
<< " min " << op
->min_epoch
1649 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1650 waiting_for_map
[op
->get_source()].push_back(op
);
1651 op
->mark_delayed("op must wait for map");
1652 osd
->request_osdmap_update(op
->min_epoch
);
1656 if (can_discard_request(op
)) {
1661 const Message
*m
= op
->get_req();
1662 int msg_type
= m
->get_type();
1663 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1664 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1668 if (msg_type
== CEPH_MSG_OSD_OP
) {
1669 if (session
->check_backoff(cct
, info
.pgid
,
1670 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1677 (!is_active() && is_peered());
1678 if (g_conf()->osd_backoff_on_peering
&& !backoff
) {
1684 add_pg_backoff(session
);
1688 // pg backoff acks at pg-level
1689 if (msg_type
== CEPH_MSG_OSD_BACKOFF
) {
1690 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1691 if (ba
->begin
!= ba
->end
) {
1699 // Delay unless PGBackend says it's ok
1700 if (pgbackend
->can_handle_while_inactive(op
)) {
1701 bool handled
= pgbackend
->handle_message(op
);
1702 ceph_assert(handled
);
1705 waiting_for_peered
.push_back(op
);
1706 op
->mark_delayed("waiting for peered");
1711 if (recovery_state
.needs_flush()) {
1712 dout(20) << "waiting for flush on " << op
<< dendl
;
1713 waiting_for_flush
.push_back(op
);
1714 op
->mark_delayed("waiting for flush");
1718 ceph_assert(is_peered() && !recovery_state
.needs_flush());
1719 if (pgbackend
->handle_message(op
))
1723 case CEPH_MSG_OSD_OP
:
1724 case CEPH_MSG_OSD_BACKOFF
:
1726 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1727 waiting_for_active
.push_back(op
);
1728 op
->mark_delayed("waiting for active");
1732 case CEPH_MSG_OSD_OP
:
1733 // verify client features
1734 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1735 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1736 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1741 case CEPH_MSG_OSD_BACKOFF
:
1742 // object-level backoff acks handled in osdop context
1748 case MSG_OSD_PG_SCAN
:
1749 do_scan(op
, handle
);
1752 case MSG_OSD_PG_BACKFILL
:
1756 case MSG_OSD_PG_BACKFILL_REMOVE
:
1757 do_backfill_remove(op
);
1760 case MSG_OSD_SCRUB_RESERVE
:
1762 auto m
= op
->get_req
<MOSDScrubReserve
>();
1764 case MOSDScrubReserve::REQUEST
:
1765 handle_scrub_reserve_request(op
);
1767 case MOSDScrubReserve::GRANT
:
1768 handle_scrub_reserve_grant(op
, m
->from
);
1770 case MOSDScrubReserve::REJECT
:
1771 handle_scrub_reserve_reject(op
, m
->from
);
1773 case MOSDScrubReserve::RELEASE
:
1774 handle_scrub_reserve_release(op
);
1780 case MSG_OSD_REP_SCRUB
:
1781 replica_scrub(op
, handle
);
1784 case MSG_OSD_REP_SCRUBMAP
:
1785 do_replica_scrub_map(op
);
1788 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1789 do_update_log_missing(op
);
1792 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1793 do_update_log_missing_reply(op
);
1797 ceph_abort_msg("bad message type in do_request");
1801 hobject_t
PrimaryLogPG::earliest_backfill() const
1803 hobject_t e
= hobject_t::get_max();
1804 for (const pg_shard_t
& bt
: get_backfill_targets()) {
1805 const pg_info_t
&pi
= recovery_state
.get_peer_info(bt
);
1806 e
= std::min(pi
.last_backfill
, e
);
1811 /** do_op - do an op
1812 * pg lock will be held (if multithreaded)
1813 * osd_lock NOT held.
1815 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1818 // NOTE: take a non-const pointer here; we must be careful not to
1819 // change anything that will break other reads on m (operator<<).
1820 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1821 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1822 if (m
->finish_decode()) {
1823 op
->reset_desc(); // for TrackedOp
1827 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1829 const hobject_t head
= m
->get_hobj().get_head();
1831 if (!info
.pgid
.pgid
.contains(
1832 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1833 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1834 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1835 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1836 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1838 ceph_assert(!cct
->_conf
->osd_debug_misdirected_ops
);
1843 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
1844 ceph::ref_t
<Session
> session
;
1846 session
= static_cast<Session
*>(m
->get_connection()->get_priv().get());
1847 if (!session
.get()) {
1848 dout(10) << __func__
<< " no session" << dendl
;
1852 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
1857 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
1859 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
1860 osd
->reply_op_error(op
, -EINVAL
);
1865 int r
= op
->maybe_init_op_info(*get_osdmap());
1867 osd
->reply_op_error(op
, r
);
1872 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
1873 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
1875 !(op
->may_write() || op
->may_cache())) {
1876 // balanced reads; any replica will do
1877 if (!(is_primary() || is_nonprimary())) {
1878 osd
->handle_misdirected_op(this, op
);
1882 // normal case; must be primary
1883 if (!is_primary()) {
1884 osd
->handle_misdirected_op(this, op
);
1889 if (!check_laggy(op
)) {
1893 if (!op_has_sufficient_caps(op
)) {
1894 osd
->reply_op_error(op
, -EPERM
);
1898 if (op
->includes_pg_op()) {
1899 return do_pg_op(op
);
1902 // object name too long?
1903 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
1904 dout(4) << "do_op name is longer than "
1905 << cct
->_conf
->osd_max_object_name_len
1906 << " bytes" << dendl
;
1907 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1910 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
1911 dout(4) << "do_op locator is longer than "
1912 << cct
->_conf
->osd_max_object_name_len
1913 << " bytes" << dendl
;
1914 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1917 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
1918 dout(4) << "do_op namespace is longer than "
1919 << cct
->_conf
->osd_max_object_namespace_len
1920 << " bytes" << dendl
;
1921 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1924 if (m
->get_hobj().oid
.name
.empty()) {
1925 dout(4) << "do_op empty oid name is not allowed" << dendl
;
1926 osd
->reply_op_error(op
, -EINVAL
);
1930 if (int r
= osd
->store
->validate_hobject_key(head
)) {
1931 dout(4) << "do_op object " << head
<< " invalid for backing store: "
1933 osd
->reply_op_error(op
, r
);
1938 if (get_osdmap()->is_blacklisted(m
->get_source_addr())) {
1939 dout(10) << "do_op " << m
->get_source_addr() << " is blacklisted" << dendl
;
1940 osd
->reply_op_error(op
, -EBLACKLISTED
);
1944 // order this op as a write?
1945 bool write_ordered
= op
->rwordered();
1947 // discard due to cluster full transition? (we discard any op that
1948 // originates before the cluster or pool is marked full; the client
1949 // will resend after the full flag is removed or if they expect the
1950 // op to succeed despite being full). The except is FULL_FORCE and
1951 // FULL_TRY ops, which there is no reason to discard because they
1952 // bypass all full checks anyway. If this op isn't write or
1953 // read-ordered, we skip.
1954 // FIXME: we exclude mds writes for now.
1955 if (write_ordered
&& !(m
->get_source().is_mds() ||
1956 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
1957 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
1958 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
1959 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
1963 // mds should have stopped writing before this point.
1964 // We can't allow OSD to become non-startable even if mds
1965 // could be writing as part of file removals.
1966 if (write_ordered
&& osd
->check_failsafe_full(get_dpp()) &&
1967 !m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
1968 dout(10) << __func__
<< " fail-safe full check failed, dropping request." << dendl
;
1971 int64_t poolid
= get_pgid().pool();
1972 if (op
->may_write()) {
1974 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
1980 if (m
->get_snapid() != CEPH_NOSNAP
) {
1981 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
1982 osd
->reply_op_error(op
, -EINVAL
);
1987 if (cct
->_conf
->osd_max_write_size
&&
1988 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
1989 // journal can't hold commit!
1990 derr
<< "do_op msg data len " << m
->get_data_len()
1991 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
1992 << " on " << *m
<< dendl
;
1993 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
1998 dout(10) << "do_op " << *m
1999 << (op
->may_write() ? " may_write" : "")
2000 << (op
->may_read() ? " may_read" : "")
2001 << (op
->may_cache() ? " may_cache" : "")
2002 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2003 << " flags " << ceph_osd_flag_string(m
->get_flags())
2007 if (is_unreadable_object(head
)) {
2008 if (!is_primary()) {
2009 osd
->reply_op_error(op
, -EAGAIN
);
2013 (g_conf()->osd_backoff_on_degraded
||
2014 (g_conf()->osd_backoff_on_unfound
&&
2015 recovery_state
.get_missing_loc().is_unfound(head
)))) {
2016 add_backoff(session
, head
, head
);
2017 maybe_kick_recovery(head
);
2019 wait_for_unreadable_object(head
, op
);
2024 if (write_ordered
) {
2026 if (is_degraded_or_backfilling_object(head
)) {
2027 if (can_backoff
&& g_conf()->osd_backoff_on_degraded
) {
2028 add_backoff(session
, head
, head
);
2029 maybe_kick_recovery(head
);
2031 wait_for_degraded_object(head
, op
);
2036 if (scrubber
.is_chunky_scrub_active() && write_blocked_by_scrub(head
)) {
2037 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2038 waiting_for_scrub
.push_back(op
);
2039 op
->mark_delayed("waiting for scrub");
2042 if (!check_laggy_requeue(op
)) {
2047 if (auto blocked_iter
= objects_blocked_on_degraded_snap
.find(head
);
2048 blocked_iter
!= std::end(objects_blocked_on_degraded_snap
)) {
2049 hobject_t
to_wait_on(head
);
2050 to_wait_on
.snap
= blocked_iter
->second
;
2051 wait_for_degraded_object(to_wait_on
, op
);
2054 if (auto blocked_snap_promote_iter
= objects_blocked_on_snap_promotion
.find(head
);
2055 blocked_snap_promote_iter
!= std::end(objects_blocked_on_snap_promotion
)) {
2056 wait_for_blocked_object(blocked_snap_promote_iter
->second
->obs
.oi
.soid
, op
);
2059 if (objects_blocked_on_cache_full
.count(head
)) {
2060 block_write_on_full_cache(head
, op
);
2066 if (op
->may_write() || op
->may_cache()) {
2067 // warning: we will get back *a* request for this reqid, but not
2068 // necessarily the most recent. this happens with flush and
2069 // promote ops, but we can't possible have both in our log where
2070 // the original request is still not stable on disk, so for our
2071 // purposes here it doesn't matter which one we get.
2073 version_t user_version
;
2074 int return_code
= 0;
2075 vector
<pg_log_op_return_item_t
> op_returns
;
2076 bool got
= check_in_progress_op(
2077 m
->get_reqid(), &version
, &user_version
, &return_code
, &op_returns
);
2079 dout(3) << __func__
<< " dup " << m
->get_reqid()
2080 << " version " << version
<< dendl
;
2081 if (already_complete(version
)) {
2082 osd
->reply_op_error(op
, return_code
, version
, user_version
, op_returns
);
2084 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2085 // always queue ondisk waiters, so that we can requeue if needed
2086 waiting_for_ondisk
[version
].emplace_back(op
, user_version
, return_code
,
2088 op
->mark_delayed("waiting for ondisk");
2094 ObjectContextRef obc
;
2095 bool can_create
= op
->may_write();
2096 hobject_t missing_oid
;
2098 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2099 const hobject_t
& oid
=
2100 m
->get_snapid() == CEPH_SNAPDIR
? head
: m
->get_hobj();
2102 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2103 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2106 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
) {
2107 if (m
->get_snapid() != CEPH_SNAPDIR
) {
2108 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2109 osd
->reply_op_error(op
, -EINVAL
);
2113 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2114 dout(10) << "non-LIST_SNAPS on snapdir" << dendl
;
2115 osd
->reply_op_error(op
, -EINVAL
);
2121 // io blocked on obc?
2122 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2123 maybe_await_blocked_head(oid
, op
)) {
2127 if (!is_primary()) {
2128 if (!recovery_state
.can_serve_replica_read(oid
)) {
2129 dout(20) << __func__
<< ": oid " << oid
2130 << " unstable write on replica, bouncing to primary."
2132 osd
->reply_op_error(op
, -EAGAIN
);
2135 dout(20) << __func__
<< ": serving replica read on oid" << oid
2140 int r
= find_object_context(
2141 oid
, &obc
, can_create
,
2142 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2145 // LIST_SNAPS needs the ssc too
2147 m
->get_snapid() == CEPH_SNAPDIR
&&
2149 obc
->ssc
= get_snapset_context(oid
, true);
2153 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2154 // we have to wait for the object.
2156 // missing the specific snap we need; requeue and wait.
2157 ceph_assert(!op
->may_write()); // only happens on a read/cache
2158 wait_for_unreadable_object(missing_oid
, op
);
2161 } else if (r
== 0) {
2162 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2163 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2164 << " is unreadable, waiting" << dendl
;
2165 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2169 // degraded object? (the check above was for head; this could be a clone)
2170 if (write_ordered
&&
2171 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2172 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2173 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2174 << " is degraded, waiting" << dendl
;
2175 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2180 bool in_hit_set
= false;
2183 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2186 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2189 if (!op
->hitset_inserted
) {
2190 hit_set
->insert(oid
);
2191 op
->hitset_inserted
= true;
2192 if (hit_set
->is_full() ||
2193 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2200 if (agent_choose_mode(false, op
))
2204 if (obc
.get() && obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2205 if (maybe_handle_manifest(op
,
2211 if (maybe_handle_cache(op
,
2220 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2221 // copy the reqids for copy get on ENOENT
2223 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2224 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2227 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2228 if (op
->may_write() &&
2229 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2230 record_write_error(op
, oid
, nullptr, r
);
2232 osd
->reply_op_error(op
, r
);
2237 // make sure locator is consistent
2238 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2239 if (m
->get_object_locator() != oloc
) {
2240 dout(10) << " provided locator " << m
->get_object_locator()
2241 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2242 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2243 << " on object " << oloc
2247 // io blocked on obc?
2248 if (obc
->is_blocked() &&
2249 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2250 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2254 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2256 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2258 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2259 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2260 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2261 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2263 // verify there is in fact a flush in progress
2264 // FIXME: we could make this a stronger test.
2265 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2266 if (p
== flush_ops
.end()) {
2267 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2268 reply_ctx(ctx
, -EINVAL
);
2271 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2272 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2273 op
->mark_delayed("waiting for rw locks");
2277 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2280 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2281 if (op
->may_write() &&
2282 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2283 record_write_error(op
, oid
, nullptr, r
,
2284 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
2286 osd
->reply_op_error(op
, r
);
2292 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2293 ctx
->ignore_cache
= true;
2296 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2297 // This object is lost. Reading from it returns an error.
2298 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2299 << " is lost" << dendl
;
2300 reply_ctx(ctx
, -ENFILE
);
2303 if (!op
->may_write() &&
2305 (!obc
->obs
.exists
||
2306 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2307 obc
->obs
.oi
.is_whiteout()))) {
2308 // copy the reqids for copy get on ENOENT
2309 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2310 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2314 reply_ctx(ctx
, -ENOENT
);
2321 utime_t prepare_latency
= ceph_clock_now();
2322 prepare_latency
-= op
->get_dequeued_time();
2323 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2324 if (op
->may_read() && op
->may_write()) {
2325 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2326 } else if (op
->may_read()) {
2327 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2328 } else if (op
->may_write() || op
->may_cache()) {
2329 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2332 // force recovery of the oldest missing object if too many logs
2333 maybe_force_recovery();
2336 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2339 ObjectContextRef obc
)
2342 if (op
->get_req
<MOSDOp
>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2343 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2344 return cache_result_t::NOOP
;
2347 // if it is write-ordered and blocked, stop now
2348 if (obc
->is_blocked() && write_ordered
) {
2349 // we're already doing something with this object
2350 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2351 return cache_result_t::NOOP
;
2354 vector
<OSDOp
> ops
= op
->get_req
<MOSDOp
>()->ops
;
2355 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2357 ceph_osd_op
& op
= osd_op
.op
;
2358 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
||
2359 op
.op
== CEPH_OSD_OP_SET_CHUNK
||
2360 op
.op
== CEPH_OSD_OP_UNSET_MANIFEST
||
2361 op
.op
== CEPH_OSD_OP_TIER_FLUSH
) {
2362 return cache_result_t::NOOP
;
2363 } else if (op
.op
== CEPH_OSD_OP_TIER_PROMOTE
) {
2364 bool is_dirty
= false;
2365 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2366 if (p
.second
.is_dirty()) {
2371 start_flush(OpRequestRef(), obc
, true, NULL
, std::nullopt
);
2373 return cache_result_t::NOOP
;
2377 switch (obc
->obs
.oi
.manifest
.type
) {
2378 case object_manifest_t::TYPE_REDIRECT
:
2379 if (op
->may_write() || write_ordered
) {
2380 do_proxy_write(op
, obc
);
2383 if (obc
->obs
.oi
.size
!= 0) {
2384 return cache_result_t::NOOP
;
2386 do_proxy_read(op
, obc
);
2388 return cache_result_t::HANDLED_PROXY
;
2389 case object_manifest_t::TYPE_CHUNKED
:
2391 if (can_proxy_chunked_read(op
, obc
)) {
2392 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2393 if (p
!= flush_ops
.end()) {
2394 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, true);
2395 return cache_result_t::HANDLED_PROXY
;
2397 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, write_ordered
);
2398 return cache_result_t::HANDLED_PROXY
;
2401 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2402 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
2403 hobject_t head
= m
->get_hobj();
2405 if (is_degraded_or_backfilling_object(head
)) {
2406 dout(20) << __func__
<< ": " << head
<< " is degraded, waiting" << dendl
;
2407 wait_for_degraded_object(head
, op
);
2408 return cache_result_t::BLOCKED_RECOVERY
;
2411 if (write_blocked_by_scrub(head
)) {
2412 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2413 waiting_for_scrub
.push_back(op
);
2414 op
->mark_delayed("waiting for scrub");
2415 return cache_result_t::BLOCKED_RECOVERY
;
2417 if (!check_laggy_requeue(op
)) {
2418 return cache_result_t::BLOCKED_RECOVERY
;
2421 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2422 if (p
.second
.is_missing()) {
2423 auto m
= op
->get_req
<MOSDOp
>();
2424 const object_locator_t oloc
= m
->get_object_locator();
2425 promote_object(obc
, obc
->obs
.oi
.soid
, oloc
, op
, NULL
);
2426 return cache_result_t::BLOCKED_PROMOTE
;
2430 bool all_dirty
= true;
2431 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2432 if (!p
.second
.is_dirty()) {
2437 start_flush(OpRequestRef(), obc
, true, NULL
, std::nullopt
);
2439 return cache_result_t::NOOP
;
2442 ceph_abort_msg("unrecognized manifest type");
2445 return cache_result_t::NOOP
;
2448 struct C_ManifestFlush
: public Context
{
2455 uint64_t last_offset
;
2456 C_ManifestFlush(PrimaryLogPG
*p
, hobject_t o
, epoch_t e
)
2457 : pg(p
), oid(o
), lpr(e
),
2458 tid(0), start(ceph_clock_now())
2460 void finish(int r
) override
{
2461 if (r
== -ECANCELED
)
2463 std::scoped_lock locker
{*pg
};
2464 pg
->handle_manifest_flush(oid
, tid
, r
, offset
, last_offset
, lpr
);
2465 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
2469 void PrimaryLogPG::handle_manifest_flush(hobject_t oid
, ceph_tid_t tid
, int r
,
2470 uint64_t offset
, uint64_t last_offset
,
2473 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
2474 if (p
== flush_ops
.end()) {
2475 dout(10) << __func__
<< " no flush_op found" << dendl
;
2478 if (p
->second
->rval
< 0) {
2481 p
->second
->io_results
[offset
] = r
;
2482 for (auto &ior
: p
->second
->io_results
) {
2483 if (ior
.second
< 0) {
2484 finish_manifest_flush(oid
, tid
, r
, p
->second
->obc
, last_offset
);
2485 p
->second
->rval
= r
;
2489 if (p
->second
->chunks
== p
->second
->io_results
.size()) {
2490 if (lpr
== get_last_peering_reset()) {
2491 ceph_assert(p
->second
->obc
);
2492 finish_manifest_flush(oid
, tid
, r
, p
->second
->obc
, last_offset
);
2497 int PrimaryLogPG::start_manifest_flush(OpRequestRef op
, ObjectContextRef obc
, bool blocking
,
2498 std::optional
<std::function
<void()>> &&on_flush
)
2500 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
2501 FlushOpRef
manifest_fop(std::make_shared
<FlushOp
>());
2502 manifest_fop
->op
= op
;
2503 manifest_fop
->obc
= obc
;
2504 manifest_fop
->flushed_version
= obc
->obs
.oi
.user_version
;
2505 manifest_fop
->blocking
= blocking
;
2506 manifest_fop
->on_flush
= std::move(on_flush
);
2507 int r
= do_manifest_flush(op
, obc
, manifest_fop
, p
->first
, blocking
);
2512 flush_ops
[obc
->obs
.oi
.soid
] = manifest_fop
;
2513 return -EINPROGRESS
;
2516 int PrimaryLogPG::do_manifest_flush(OpRequestRef op
, ObjectContextRef obc
, FlushOpRef manifest_fop
,
2517 uint64_t start_offset
, bool block
)
2519 struct object_manifest_t
&manifest
= obc
->obs
.oi
.manifest
;
2520 hobject_t soid
= obc
->obs
.oi
.soid
;
2523 uint64_t max_copy_size
= 0, last_offset
= 0;
2525 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
.chunk_map
.find(start_offset
);
2526 ceph_assert(iter
!= manifest
.chunk_map
.end());
2527 for (;iter
!= manifest
.chunk_map
.end(); ++iter
) {
2528 if (iter
->second
.is_dirty()) {
2529 last_offset
= iter
->first
;
2530 max_copy_size
+= iter
->second
.length
;
2532 if (get_copy_chunk_size() < max_copy_size
) {
2537 iter
= manifest
.chunk_map
.find(start_offset
);
2538 for (;iter
!= manifest
.chunk_map
.end(); ++iter
) {
2539 if (!iter
->second
.is_dirty()) {
2542 uint64_t tgt_length
= iter
->second
.length
;
2543 uint64_t tgt_offset
= iter
->second
.offset
;
2544 hobject_t tgt_soid
= iter
->second
.oid
;
2545 object_locator_t
oloc(tgt_soid
);
2546 ObjectOperation obj_op
;
2547 bufferlist chunk_data
;
2548 int r
= pgbackend
->objects_read_sync(
2549 soid
, iter
->first
, tgt_length
, 0, &chunk_data
);
2551 dout(0) << __func__
<< " read fail " << " offset: " << tgt_offset
2552 << " len: " << tgt_length
<< " r: " << r
<< dendl
;
2555 if (!chunk_data
.length()) {
2559 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
|
2560 CEPH_OSD_FLAG_RWORDERED
;
2561 tgt_length
= chunk_data
.length();
2562 if (pg_pool_t::fingerprint_t fp_algo
= pool
.info
.get_fingerprint_type();
2563 iter
->second
.has_reference() &&
2564 fp_algo
!= pg_pool_t::TYPE_FINGERPRINT_NONE
) {
2565 object_t fp_oid
= [fp_algo
, &chunk_data
]() -> string
{
2567 case pg_pool_t::TYPE_FINGERPRINT_SHA1
:
2568 return crypto::digest
<crypto::SHA1
>(chunk_data
).to_str();
2569 case pg_pool_t::TYPE_FINGERPRINT_SHA256
:
2570 return crypto::digest
<crypto::SHA256
>(chunk_data
).to_str();
2571 case pg_pool_t::TYPE_FINGERPRINT_SHA512
:
2572 return crypto::digest
<crypto::SHA512
>(chunk_data
).to_str();
2574 assert(0 == "unrecognized fingerprint type");
2579 if (fp_oid
!= tgt_soid
.oid
) {
2580 // decrement old chunk's reference count
2581 ObjectOperation dec_op
;
2582 cls_chunk_refcount_put_op put_call
;
2583 put_call
.source
= soid
;
2584 ::encode(put_call
, in
);
2585 dec_op
.call("cas", "chunk_put", in
);
2586 // we don't care dec_op's completion. scrub for dedup will fix this.
2587 tid
= osd
->objecter
->mutate(
2588 tgt_soid
.oid
, oloc
, dec_op
, snapc
,
2589 ceph::real_clock::from_ceph_timespec(obc
->obs
.oi
.mtime
),
2593 tgt_soid
.oid
= fp_oid
;
2594 iter
->second
.oid
= tgt_soid
;
2597 osd_op
.extent
.offset
= 0;
2598 osd_op
.extent
.length
= chunk_data
.length();
2601 in
.append(chunk_data
);
2602 obj_op
.call("cas", "cas_write_or_get", in
);
2604 obj_op
.add_data(CEPH_OSD_OP_WRITE
, tgt_offset
, tgt_length
, chunk_data
);
2607 C_ManifestFlush
*fin
= new C_ManifestFlush(this, soid
, get_last_peering_reset());
2608 fin
->offset
= iter
->first
;
2609 fin
->last_offset
= last_offset
;
2610 manifest_fop
->chunks
++;
2612 tid
= osd
->objecter
->mutate(
2613 tgt_soid
.oid
, oloc
, obj_op
, snapc
,
2614 ceph::real_clock::from_ceph_timespec(obc
->obs
.oi
.mtime
),
2615 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())));
2617 manifest_fop
->io_tids
[iter
->first
] = tid
;
2619 dout(20) << __func__
<< " offset: " << tgt_offset
<< " len: " << tgt_length
2620 << " oid: " << tgt_soid
.oid
<< " ori oid: " << soid
.oid
.name
2621 << " tid: " << tid
<< dendl
;
2622 if (last_offset
< iter
->first
) {
2630 void PrimaryLogPG::finish_manifest_flush(hobject_t oid
, ceph_tid_t tid
, int r
,
2631 ObjectContextRef obc
, uint64_t last_offset
)
2633 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2634 << " " << cpp_strerror(r
) << " last_offset: " << last_offset
<< dendl
;
2635 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
2636 if (p
== flush_ops
.end()) {
2637 dout(10) << __func__
<< " no flush_op found" << dendl
;
2640 map
<uint64_t, chunk_info_t
>::iterator iter
=
2641 obc
->obs
.oi
.manifest
.chunk_map
.find(last_offset
);
2642 ceph_assert(iter
!= obc
->obs
.oi
.manifest
.chunk_map
.end());
2643 for (;iter
!= obc
->obs
.oi
.manifest
.chunk_map
.end(); ++iter
) {
2644 if (iter
->second
.is_dirty() && last_offset
< iter
->first
) {
2645 do_manifest_flush(p
->second
->op
, obc
, p
->second
, iter
->first
, p
->second
->blocking
);
2649 finish_flush(oid
, tid
, r
);
2652 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2653 MOSDOpReply
*orig_reply
, int r
,
2654 OpContext
*ctx_for_op_returns
)
2656 dout(20) << __func__
<< " r=" << r
<< dendl
;
2657 ceph_assert(op
->may_write());
2658 const osd_reqid_t
&reqid
= op
->get_req
<MOSDOp
>()->get_reqid();
2659 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2660 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2661 get_next_version(), eversion_t(), 0,
2662 reqid
, utime_t(), r
));
2663 if (ctx_for_op_returns
) {
2664 entries
.back().set_op_returns(*ctx_for_op_returns
->ops
);
2665 dout(20) << __func__
<< " op_returns=" << entries
.back().op_returns
<< dendl
;
2671 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2676 MOSDOpReply
*orig_reply
,
2679 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2682 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2683 auto m
= op
->get_req
<MOSDOp
>();
2684 MOSDOpReply
*reply
= orig_reply
.detach();
2685 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2686 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2690 ObcLockManager lock_manager
;
2693 std::move(lock_manager
),
2694 std::optional
<std::function
<void(void)> >(
2695 OnComplete(this, op
, orig_reply
, r
)),
2700 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2703 ObjectContextRef obc
,
2704 int r
, hobject_t missing_oid
,
2707 ObjectContextRef
*promote_obc
)
2709 // return quickly if caching is not enabled
2710 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2711 return cache_result_t::NOOP
;
2715 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2716 (op
->get_req
<MOSDOp
>()->get_flags() &
2717 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2718 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2719 return cache_result_t::NOOP
;
2722 must_promote
= must_promote
|| op
->need_promote();
2725 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2726 << (obc
->obs
.exists
? "exists" : "DNE")
2727 << " missing_oid " << missing_oid
2728 << " must_promote " << (int)must_promote
2729 << " in_hit_set " << (int)in_hit_set
2732 dout(25) << __func__
<< " (no obc)"
2733 << " missing_oid " << missing_oid
2734 << " must_promote " << (int)must_promote
2735 << " in_hit_set " << (int)in_hit_set
2738 // if it is write-ordered and blocked, stop now
2739 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2740 // we're already doing something with this object
2741 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2742 return cache_result_t::NOOP
;
2745 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2746 // we know this object is logically absent (e.g., an undefined clone)
2747 return cache_result_t::NOOP
;
2750 if (obc
.get() && obc
->obs
.exists
) {
2751 osd
->logger
->inc(l_osd_op_cache_hit
);
2752 return cache_result_t::NOOP
;
2754 if (!is_primary()) {
2755 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2756 osd
->reply_op_error(op
, -EAGAIN
);
2757 return cache_result_t::REPLIED_WITH_EAGAIN
;
2760 if (missing_oid
== hobject_t() && obc
.get()) {
2761 missing_oid
= obc
->obs
.oi
.soid
;
2764 auto m
= op
->get_req
<MOSDOp
>();
2765 const object_locator_t oloc
= m
->get_object_locator();
2767 if (op
->need_skip_handle_cache()) {
2768 return cache_result_t::NOOP
;
2771 OpRequestRef promote_op
;
2773 switch (pool
.info
.cache_mode
) {
2774 case pg_pool_t::CACHEMODE_WRITEBACK
:
2776 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2777 if (!op
->may_write() && !op
->may_cache() &&
2778 !write_ordered
&& !must_promote
) {
2779 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2781 return cache_result_t::HANDLED_PROXY
;
2783 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2784 block_write_on_full_cache(missing_oid
, op
);
2785 return cache_result_t::BLOCKED_FULL
;
2788 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2789 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2790 return cache_result_t::BLOCKED_PROMOTE
;
2793 if (op
->may_write() || op
->may_cache()) {
2797 if (!op
->need_skip_promote() &&
2798 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2799 pool
.info
.min_write_recency_for_promote
,
2802 return cache_result_t::BLOCKED_PROMOTE
;
2804 return cache_result_t::HANDLED_PROXY
;
2808 // Avoid duplicate promotion
2809 if (obc
.get() && obc
->is_blocked()) {
2812 return cache_result_t::BLOCKED_PROMOTE
;
2816 if (!op
->need_skip_promote()) {
2817 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2818 pool
.info
.min_read_recency_for_promote
,
2819 promote_op
, promote_obc
);
2822 return cache_result_t::HANDLED_PROXY
;
2824 ceph_abort_msg("unreachable");
2825 return cache_result_t::NOOP
;
2827 case pg_pool_t::CACHEMODE_READONLY
:
2828 // TODO: clean this case up
2829 if (!obc
.get() && r
== -ENOENT
) {
2830 // we don't have the object and op's a read
2831 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2832 return cache_result_t::BLOCKED_PROMOTE
;
2834 if (!r
) { // it must be a write
2835 do_cache_redirect(op
);
2836 return cache_result_t::HANDLED_REDIRECT
;
2838 // crap, there was a failure of some kind
2839 return cache_result_t::NOOP
;
2841 case pg_pool_t::CACHEMODE_FORWARD
:
2842 // this mode is deprecated; proxy instead
2843 case pg_pool_t::CACHEMODE_PROXY
:
2844 if (!must_promote
) {
2845 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2847 return cache_result_t::HANDLED_PROXY
;
2850 return cache_result_t::HANDLED_PROXY
;
2853 // ugh, we're forced to promote.
2855 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2856 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2857 block_write_on_full_cache(missing_oid
, op
);
2858 return cache_result_t::BLOCKED_FULL
;
2860 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2861 return cache_result_t::BLOCKED_PROMOTE
;
2863 case pg_pool_t::CACHEMODE_READFORWARD
:
2864 // this mode is deprecated; proxy instead
2865 case pg_pool_t::CACHEMODE_READPROXY
:
2866 // Do writeback to the cache tier for writes
2867 if (op
->may_write() || write_ordered
|| must_promote
) {
2869 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2870 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2871 block_write_on_full_cache(missing_oid
, op
);
2872 return cache_result_t::BLOCKED_FULL
;
2874 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2875 return cache_result_t::BLOCKED_PROMOTE
;
2878 // If it is a read, we can read, we need to proxy it
2880 return cache_result_t::HANDLED_PROXY
;
2883 ceph_abort_msg("unrecognized cache_mode");
2885 return cache_result_t::NOOP
;
2888 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2889 const hobject_t
& missing_oid
,
2890 const object_locator_t
& oloc
,
2893 OpRequestRef promote_op
,
2894 ObjectContextRef
*promote_obc
)
2896 dout(20) << __func__
<< " missing_oid " << missing_oid
2897 << " in_hit_set " << in_hit_set
<< dendl
;
2903 // Check if in the current hit set
2913 unsigned count
= (int)in_hit_set
;
2915 // Check if in other hit sets
2916 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
2917 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2918 agent_state
->hit_set_map
.rbegin();
2919 itor
!= agent_state
->hit_set_map
.rend();
2921 if (!itor
->second
->contains(oid
)) {
2925 if (count
>= recency
) {
2930 if (count
>= recency
) {
2933 return false; // not promoting
2938 if (osd
->promote_throttle()) {
2939 dout(10) << __func__
<< " promote throttled" << dendl
;
2942 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
2946 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2948 auto m
= op
->get_req
<MOSDOp
>();
2949 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
2950 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
, get_osdmap_epoch(),
2952 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2953 reply
->set_redirect(redir
);
2954 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2956 m
->get_connection()->send_message(reply
);
2960 struct C_ProxyRead
: public Context
{
2963 epoch_t last_peering_reset
;
2965 PrimaryLogPG::ProxyReadOpRef prdop
;
2967 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2968 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2969 : pg(p
), oid(o
), last_peering_reset(lpr
),
2970 tid(0), prdop(prd
), start(ceph_clock_now())
2972 void finish(int r
) override
{
2973 if (prdop
->canceled
)
2975 std::scoped_lock locker
{*pg
};
2976 if (prdop
->canceled
) {
2979 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2980 pg
->finish_proxy_read(oid
, tid
, r
);
2981 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2986 struct C_ProxyChunkRead
: public Context
{
2989 epoch_t last_peering_reset
;
2991 PrimaryLogPG::ProxyReadOpRef prdop
;
2993 ObjectOperation
*obj_op
;
2995 uint64_t req_offset
= 0;
2996 ObjectContextRef obc
;
2997 uint64_t req_total_len
= 0;
2998 C_ProxyChunkRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2999 const PrimaryLogPG::ProxyReadOpRef
& prd
)
3000 : pg(p
), oid(o
), last_peering_reset(lpr
),
3001 tid(0), prdop(prd
), start(ceph_clock_now()), obj_op(NULL
)
3003 void finish(int r
) override
{
3004 if (prdop
->canceled
)
3006 std::scoped_lock locker
{*pg
};
3007 if (prdop
->canceled
) {
3010 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3012 if (!prdop
->ops
[op_index
].outdata
.length()) {
3013 ceph_assert(req_total_len
);
3015 bufferptr
bptr(req_total_len
);
3016 list
.push_back(std::move(bptr
));
3017 prdop
->ops
[op_index
].outdata
.append(list
);
3019 ceph_assert(obj_op
);
3020 uint64_t copy_offset
;
3021 if (req_offset
>= prdop
->ops
[op_index
].op
.extent
.offset
) {
3022 copy_offset
= req_offset
- prdop
->ops
[op_index
].op
.extent
.offset
;
3026 prdop
->ops
[op_index
].outdata
.begin(copy_offset
).copy_in(
3027 obj_op
->ops
[0].outdata
.length(),
3028 obj_op
->ops
[0].outdata
.c_str());
3031 pg
->finish_proxy_read(oid
, tid
, r
);
3032 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
3040 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
3042 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3043 // stash the result in the request's OSDOp vector
3044 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3045 object_locator_t oloc
;
3047 /* extensible tier */
3048 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3049 switch (obc
->obs
.oi
.manifest
.type
) {
3050 case object_manifest_t::TYPE_REDIRECT
:
3051 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3052 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3055 ceph_abort_msg("unrecognized manifest type");
3059 soid
= m
->get_hobj();
3060 oloc
= object_locator_t(m
->get_object_locator());
3061 oloc
.pool
= pool
.info
.tier_of
;
3063 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3065 // pass through some original flags that make sense.
3066 // - leave out redirection and balancing flags since we are
3067 // already proxying through the primary
3068 // - leave off read/write/exec flags that are derived from the op
3069 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3070 CEPH_OSD_FLAG_ORDERSNAP
|
3071 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3072 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3074 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
3076 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
3078 ObjectOperation obj_op
;
3079 obj_op
.dup(prdop
->ops
);
3081 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
3082 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
3083 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
3084 ceph_osd_op op
= obj_op
.ops
[i
].op
;
3086 case CEPH_OSD_OP_READ
:
3087 case CEPH_OSD_OP_SYNC_READ
:
3088 case CEPH_OSD_OP_SPARSE_READ
:
3089 case CEPH_OSD_OP_CHECKSUM
:
3090 case CEPH_OSD_OP_CMPEXT
:
3091 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
3092 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
3097 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
3099 ceph_tid_t tid
= osd
->objecter
->read(
3100 soid
.oid
, oloc
, obj_op
,
3101 m
->get_snapid(), NULL
,
3102 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3103 &prdop
->user_version
,
3104 &prdop
->data_offset
,
3107 prdop
->objecter_tid
= tid
;
3108 proxyread_ops
[tid
] = prdop
;
3109 in_progress_proxy_ops
[soid
].push_back(op
);
3112 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
3114 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3115 << " " << cpp_strerror(r
) << dendl
;
3117 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
3118 if (p
== proxyread_ops
.end()) {
3119 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
3122 ProxyReadOpRef prdop
= p
->second
;
3123 if (tid
!= prdop
->objecter_tid
) {
3124 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
3125 << " tid " << prdop
->objecter_tid
<< dendl
;
3128 if (oid
!= prdop
->soid
) {
3129 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
3130 << " soid " << prdop
->soid
<< dendl
;
3133 proxyread_ops
.erase(tid
);
3135 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
3136 if (q
== in_progress_proxy_ops
.end()) {
3137 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3140 ceph_assert(q
->second
.size());
3141 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
3144 ceph_assert(it
!= q
->second
.end());
3145 OpRequestRef op
= *it
;
3146 q
->second
.erase(it
);
3147 if (q
->second
.size() == 0) {
3148 in_progress_proxy_ops
.erase(oid
);
3149 } else if (std::find(q
->second
.begin(),
3151 prdop
->op
) != q
->second
.end()) {
3152 /* multiple read case */
3153 dout(20) << __func__
<< " " << oid
<< " is not completed " << dendl
;
3157 osd
->logger
->inc(l_osd_tier_proxy_read
);
3159 auto m
= op
->get_req
<MOSDOp
>();
3160 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
3161 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3162 ctx
->user_at_version
= prdop
->user_version
;
3163 ctx
->data_off
= prdop
->data_offset
;
3164 ctx
->ignore_log_op_stats
= true;
3165 complete_read_ctx(r
, ctx
);
3168 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
3170 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
3171 if (p
== in_progress_proxy_ops
.end())
3174 list
<OpRequestRef
>& ls
= p
->second
;
3175 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
3177 in_progress_proxy_ops
.erase(p
);
3180 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
3181 vector
<ceph_tid_t
> *tids
)
3183 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
3184 prdop
->canceled
= true;
3186 // cancel objecter op, if we can
3187 if (prdop
->objecter_tid
) {
3188 tids
->push_back(prdop
->objecter_tid
);
3189 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
3190 prdop
->ops
[i
].outdata
.clear();
3192 proxyread_ops
.erase(prdop
->objecter_tid
);
3193 prdop
->objecter_tid
= 0;
3197 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3199 dout(10) << __func__
<< dendl
;
3201 // cancel proxy reads
3202 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
3203 while (p
!= proxyread_ops
.end()) {
3204 cancel_proxy_read((p
++)->second
, tids
);
3207 // cancel proxy writes
3208 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
3209 while (q
!= proxywrite_ops
.end()) {
3210 cancel_proxy_write((q
++)->second
, tids
);
3214 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
3215 in_progress_proxy_ops
.begin();
3216 while (p
!= in_progress_proxy_ops
.end()) {
3217 list
<OpRequestRef
>& ls
= p
->second
;
3218 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
3219 << " requests" << dendl
;
3221 in_progress_proxy_ops
.erase(p
++);
3224 in_progress_proxy_ops
.clear();
3228 struct C_ProxyWrite_Commit
: public Context
{
3231 epoch_t last_peering_reset
;
3233 PrimaryLogPG::ProxyWriteOpRef pwop
;
3234 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3235 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
3236 : pg(p
), oid(o
), last_peering_reset(lpr
),
3239 void finish(int r
) override
{
3242 std::scoped_lock locker
{*pg
};
3243 if (pwop
->canceled
) {
3246 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3247 pg
->finish_proxy_write(oid
, tid
, r
);
3252 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, ObjectContextRef obc
)
3254 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3255 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3256 object_locator_t oloc
;
3257 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
3259 /* extensible tier */
3260 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3261 switch (obc
->obs
.oi
.manifest
.type
) {
3262 case object_manifest_t::TYPE_REDIRECT
:
3263 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3264 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3267 ceph_abort_msg("unrecognized manifest type");
3271 soid
= m
->get_hobj();
3272 oloc
= object_locator_t(m
->get_object_locator());
3273 oloc
.pool
= pool
.info
.tier_of
;
3276 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3277 if (!(op
->may_write() || op
->may_cache())) {
3278 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3280 if (op
->allows_returnvec()) {
3281 flags
|= CEPH_OSD_FLAG_RETURNVEC
;
3284 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3286 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3287 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3288 pwop
->mtime
= m
->get_mtime();
3290 ObjectOperation obj_op
;
3291 obj_op
.dup(pwop
->ops
);
3293 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3294 this, soid
, get_last_peering_reset(), pwop
);
3295 ceph_tid_t tid
= osd
->objecter
->mutate(
3296 soid
.oid
, oloc
, obj_op
, snapc
,
3297 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3298 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3299 &pwop
->user_version
, pwop
->reqid
);
3301 pwop
->objecter_tid
= tid
;
3302 proxywrite_ops
[tid
] = pwop
;
3303 in_progress_proxy_ops
[soid
].push_back(op
);
3306 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op
, const hobject_t
& missing_oid
,
3307 ObjectContextRef obc
, bool write_ordered
)
3309 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3310 OSDOp
*osd_op
= NULL
;
3311 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3312 osd_op
= &m
->ops
[i
];
3313 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3314 uint64_t op_length
= osd_op
->op
.extent
.offset
+ osd_op
->op
.extent
.length
;
3315 uint64_t chunk_length
= 0, chunk_index
= 0, req_len
= 0;
3316 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3317 map
<uint64_t, map
<uint64_t, uint64_t>> chunk_read
;
3319 while (cursor
< op_length
) {
3322 /* find the right chunk position for cursor */
3323 for (auto &p
: manifest
->chunk_map
) {
3324 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3325 chunk_length
= p
.second
.length
;
3326 chunk_index
= p
.first
;
3331 if (!chunk_index
&& !chunk_length
) {
3332 if (cursor
== osd_op
->op
.extent
.offset
) {
3333 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, this);
3334 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3335 ctx
->data_off
= osd_op
->op
.extent
.offset
;
3336 ctx
->ignore_log_op_stats
= true;
3337 complete_read_ctx(0, ctx
);
3341 uint64_t next_length
= chunk_length
;
3342 /* the size to read -> | op length | */
3344 if (cursor
+ next_length
> op_length
) {
3345 next_length
= op_length
- cursor
;
3347 /* the size to read -> | op length | */
3349 if (cursor
+ next_length
> chunk_index
+ chunk_length
) {
3350 next_length
= chunk_index
+ chunk_length
- cursor
;
3353 chunk_read
[cursor
] = {{chunk_index
, next_length
}};
3354 cursor
+= next_length
;
3357 req_len
= cursor
- osd_op
->op
.extent
.offset
;
3358 for (auto &p
: chunk_read
) {
3359 auto chunks
= p
.second
.begin();
3360 dout(20) << __func__
<< " chunk_index: " << chunks
->first
3361 << " next_length: " << chunks
->second
<< " cursor: "
3362 << p
.first
<< dendl
;
3363 do_proxy_chunked_read(op
, obc
, i
, chunks
->first
, p
.first
, chunks
->second
, req_len
, write_ordered
);
3368 struct RefCountCallback
: public Context
{
3370 PrimaryLogPG::OpContext
*ctx
;
3372 bool requeue
= false;
3374 RefCountCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
3375 : ctx(ctx
), osd_op(osd_op
) {}
3376 void finish(int r
) override
{
3377 // NB: caller must already have pg->lock held
3378 ctx
->obc
->stop_block();
3379 ctx
->pg
->kick_object_context_blocked(ctx
->obc
);
3382 ctx
->pg
->execute_ctx(ctx
);
3384 // on cancel simply toss op out,
3385 // or requeue as requested
3386 if (r
!= -ECANCELED
) {
3388 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
3389 } else if (requeue
) {
3391 ctx
->pg
->requeue_op(ctx
->op
);
3393 ctx
->pg
->close_op_ctx(ctx
);
3396 void set_requeue(bool rq
) {
3401 struct SetManifestFinisher
: public PrimaryLogPG::OpFinisher
{
3404 explicit SetManifestFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
3407 int execute() override
{
3412 struct C_SetManifestRefCountDone
: public Context
{
3413 RefCountCallback
* cb
;
3415 C_SetManifestRefCountDone(
3416 RefCountCallback
* cb
, hobject_t soid
) : cb(cb
), soid(soid
) {}
3417 void finish(int r
) override
{
3418 if (r
== -ECANCELED
)
3420 auto pg
= cb
->ctx
->pg
;
3421 std::scoped_lock locker
{*pg
};
3422 auto it
= pg
->manifest_ops
.find(soid
);
3423 if (it
== pg
->manifest_ops
.end()) {
3424 // raced with cancel_manifest_ops
3427 pg
->manifest_ops
.erase(it
);
3432 void PrimaryLogPG::cancel_manifest_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3434 dout(10) << __func__
<< dendl
;
3435 auto p
= manifest_ops
.begin();
3436 while (p
!= manifest_ops
.end()) {
3437 auto mop
= p
->second
;
3438 // cancel objecter op, if we can
3439 if (mop
->objecter_tid
) {
3440 tids
->push_back(mop
->objecter_tid
);
3441 mop
->objecter_tid
= 0;
3443 mop
->cb
->set_requeue(requeue
);
3444 mop
->cb
->complete(-ECANCELED
);
3445 manifest_ops
.erase(p
++);
3449 void PrimaryLogPG::refcount_manifest(ObjectContextRef obc
, object_locator_t oloc
, hobject_t soid
,
3450 SnapContext snapc
, bool get
, RefCountCallback
*cb
, uint64_t offset
)
3452 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
|
3453 CEPH_OSD_FLAG_RWORDERED
;
3455 dout(10) << __func__
<< " Start refcount for " << soid
<< dendl
;
3457 ObjectOperation obj_op
;
3460 cls_chunk_refcount_get_op call
;
3461 call
.source
= obc
->obs
.oi
.soid
;
3463 obj_op
.call("cas", "chunk_get", in
);
3465 cls_chunk_refcount_put_op call
;
3466 call
.source
= obc
->obs
.oi
.soid
;
3468 obj_op
.call("cas", "chunk_put", in
);
3471 Context
*c
= nullptr;
3473 C_SetManifestRefCountDone
*fin
=
3474 new C_SetManifestRefCountDone(cb
, obc
->obs
.oi
.soid
);
3475 c
= new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard()));
3478 auto tid
= osd
->objecter
->mutate(
3479 soid
.oid
, oloc
, obj_op
, snapc
,
3480 ceph::real_clock::from_ceph_timespec(obc
->obs
.oi
.mtime
),
3483 manifest_ops
[obc
->obs
.oi
.soid
] = std::make_shared
<ManifestOp
>(cb
, tid
);
3488 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
, int op_index
,
3489 uint64_t chunk_index
, uint64_t req_offset
, uint64_t req_length
,
3490 uint64_t req_total_len
, bool write_ordered
)
3492 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3493 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3494 if (!manifest
->chunk_map
.count(chunk_index
)) {
3497 uint64_t chunk_length
= manifest
->chunk_map
[chunk_index
].length
;
3498 hobject_t soid
= manifest
->chunk_map
[chunk_index
].oid
;
3499 hobject_t ori_soid
= m
->get_hobj();
3500 object_locator_t
oloc(soid
);
3501 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3502 if (write_ordered
) {
3503 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3506 if (!chunk_length
|| soid
== hobject_t()) {
3510 /* same as do_proxy_read() */
3511 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3512 CEPH_OSD_FLAG_ORDERSNAP
|
3513 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3514 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3516 dout(10) << __func__
<< " Start do chunk proxy read for " << *m
3517 << " index: " << op_index
<< " oid: " << soid
.oid
.name
<< " req_offset: " << req_offset
3518 << " req_length: " << req_length
<< dendl
;
3520 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, ori_soid
, m
->ops
));
3522 ObjectOperation
*pobj_op
= new ObjectOperation
;
3523 OSDOp
&osd_op
= pobj_op
->add_op(m
->ops
[op_index
].op
.op
);
3525 if (chunk_index
<= req_offset
) {
3526 osd_op
.op
.extent
.offset
= manifest
->chunk_map
[chunk_index
].offset
+ req_offset
- chunk_index
;
3528 ceph_abort_msg("chunk_index > req_offset");
3530 osd_op
.op
.extent
.length
= req_length
;
3532 ObjectOperation obj_op
;
3533 obj_op
.dup(pobj_op
->ops
);
3535 C_ProxyChunkRead
*fin
= new C_ProxyChunkRead(this, ori_soid
, get_last_peering_reset(),
3537 fin
->obj_op
= pobj_op
;
3538 fin
->op_index
= op_index
;
3539 fin
->req_offset
= req_offset
;
3541 fin
->req_total_len
= req_total_len
;
3543 ceph_tid_t tid
= osd
->objecter
->read(
3544 soid
.oid
, oloc
, obj_op
,
3545 m
->get_snapid(), NULL
,
3546 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3547 &prdop
->user_version
,
3548 &prdop
->data_offset
,
3551 prdop
->objecter_tid
= tid
;
3552 proxyread_ops
[tid
] = prdop
;
3553 in_progress_proxy_ops
[ori_soid
].push_back(op
);
3556 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
)
3558 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3559 OSDOp
*osd_op
= NULL
;
3561 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3562 osd_op
= &m
->ops
[i
];
3563 ceph_osd_op op
= osd_op
->op
;
3565 case CEPH_OSD_OP_READ
:
3566 case CEPH_OSD_OP_SYNC_READ
: {
3567 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3568 uint64_t remain
= osd_op
->op
.extent
.length
;
3570 /* requested chunks exist in chunk_map ? */
3571 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3572 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3573 if (!p
.second
.is_missing()) {
3576 if (p
.second
.length
>= remain
) {
3580 remain
= remain
- p
.second
.length
;
3582 cursor
+= p
.second
.length
;
3587 dout(20) << __func__
<< " requested chunks don't exist in chunk_map " << dendl
;
3599 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
3601 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3602 << " " << cpp_strerror(r
) << dendl
;
3604 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
3605 if (p
== proxywrite_ops
.end()) {
3606 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
3609 ProxyWriteOpRef pwop
= p
->second
;
3610 ceph_assert(tid
== pwop
->objecter_tid
);
3611 ceph_assert(oid
== pwop
->soid
);
3613 proxywrite_ops
.erase(tid
);
3615 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3616 if (q
== in_progress_proxy_ops
.end()) {
3617 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3622 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3623 ceph_assert(in_progress_op
.size());
3624 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3625 in_progress_op
.end(),
3627 ceph_assert(it
!= in_progress_op
.end());
3628 in_progress_op
.erase(it
);
3629 if (in_progress_op
.size() == 0) {
3630 in_progress_proxy_ops
.erase(oid
);
3631 } else if (std::find(in_progress_op
.begin(),
3632 in_progress_op
.end(),
3633 pwop
->op
) != in_progress_op
.end()) {
3637 dout(20) << __func__
<< " " << oid
<< " tid " << tid
3638 << " in_progress_op size: "
3639 << in_progress_op
.size() << dendl
;
3643 osd
->logger
->inc(l_osd_tier_proxy_write
);
3645 auto m
= pwop
->op
->get_req
<MOSDOp
>();
3646 ceph_assert(m
!= NULL
);
3648 if (!pwop
->sent_reply
) {
3650 assert(pwop
->ctx
->reply
== nullptr);
3651 MOSDOpReply
*reply
= new MOSDOpReply(m
, r
, get_osdmap_epoch(), 0,
3652 true /* we claim it below */);
3653 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3654 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3655 reply
->claim_op_out_data(pwop
->ops
);
3656 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3657 osd
->send_message_osd_client(reply
, m
->get_connection());
3658 pwop
->sent_reply
= true;
3659 pwop
->ctx
->op
->mark_commit_sent();
3666 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
,
3667 vector
<ceph_tid_t
> *tids
)
3669 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3670 pwop
->canceled
= true;
3672 // cancel objecter op, if we can
3673 if (pwop
->objecter_tid
) {
3674 tids
->push_back(pwop
->objecter_tid
);
3677 proxywrite_ops
.erase(pwop
->objecter_tid
);
3678 pwop
->objecter_tid
= 0;
3682 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3683 ObjectContextRef obc
;
3687 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3690 start(ceph_clock_now()) {}
3692 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3693 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3694 int r
= results
.get
<0>();
3695 pg
->finish_promote(r
, results_data
, obc
);
3696 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3700 class PromoteManifestCallback
: public PrimaryLogPG::CopyCallback
{
3701 ObjectContextRef obc
;
3704 PrimaryLogPG::OpContext
*ctx
;
3705 PrimaryLogPG::CopyCallbackResults promote_results
;
3707 PromoteManifestCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
, PrimaryLogPG::OpContext
*ctx
= NULL
)
3710 start(ceph_clock_now()), ctx(ctx
) {}
3712 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3713 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3714 int r
= results
.get
<0>();
3716 promote_results
= results
;
3717 pg
->execute_ctx(ctx
);
3719 pg
->finish_promote_manifest(r
, results_data
, obc
);
3721 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3723 friend struct PromoteFinisher
;
3726 struct PromoteFinisher
: public PrimaryLogPG::OpFinisher
{
3727 PromoteManifestCallback
*promote_callback
;
3729 explicit PromoteFinisher(PromoteManifestCallback
*promote_callback
)
3730 : promote_callback(promote_callback
) {
3733 int execute() override
{
3734 if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
3735 promote_callback
->ctx
->pg
->finish_promote(promote_callback
->promote_results
.get
<0>(),
3736 promote_callback
->promote_results
.get
<1>(),
3737 promote_callback
->obc
);
3738 } else if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
3739 promote_callback
->ctx
->pg
->finish_promote_manifest(promote_callback
->promote_results
.get
<0>(),
3740 promote_callback
->promote_results
.get
<1>(),
3741 promote_callback
->obc
);
3743 ceph_abort_msg("unrecognized manifest type");
3749 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
3750 const hobject_t
& missing_oid
,
3751 const object_locator_t
& oloc
,
3753 ObjectContextRef
*promote_obc
)
3755 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
3756 ceph_assert(hoid
!= hobject_t());
3757 if (write_blocked_by_scrub(hoid
)) {
3758 dout(10) << __func__
<< " " << hoid
3759 << " blocked by scrub" << dendl
;
3761 waiting_for_scrub
.push_back(op
);
3762 op
->mark_delayed("waiting for scrub");
3763 dout(10) << __func__
<< " " << hoid
3764 << " placing op in waiting_for_scrub" << dendl
;
3766 dout(10) << __func__
<< " " << hoid
3767 << " no op, dropping on the floor" << dendl
;
3771 if (op
&& !check_laggy_requeue(op
)) {
3774 if (!obc
) { // we need to create an ObjectContext
3775 ceph_assert(missing_oid
!= hobject_t());
3776 obc
= get_object_context(missing_oid
, true);
3782 * Before promote complete, if there are proxy-reads for the object,
3783 * for this case we don't use DONTNEED.
3785 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
3786 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
3787 if (q
== in_progress_proxy_ops
.end()) {
3788 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
3792 object_locator_t my_oloc
;
3794 if (!obc
->obs
.oi
.has_manifest()) {
3796 my_oloc
.pool
= pool
.info
.tier_of
;
3797 src_hoid
= obc
->obs
.oi
.soid
;
3798 cb
= new PromoteCallback(obc
, this);
3800 if (obc
->obs
.oi
.manifest
.is_chunked()) {
3801 src_hoid
= obc
->obs
.oi
.soid
;
3802 cb
= new PromoteManifestCallback(obc
, this);
3803 } else if (obc
->obs
.oi
.manifest
.is_redirect()) {
3804 object_locator_t
src_oloc(obc
->obs
.oi
.manifest
.redirect_target
);
3806 src_hoid
= obc
->obs
.oi
.manifest
.redirect_target
;
3807 cb
= new PromoteCallback(obc
, this);
3809 ceph_abort_msg("unrecognized manifest type");
3813 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
3814 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
3815 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
3816 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
3817 start_copy(cb
, obc
, src_hoid
, my_oloc
, 0, flags
,
3818 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
3819 src_fadvise_flags
, 0);
3821 ceph_assert(obc
->is_blocked());
3824 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
3826 recovery_state
.update_stats(
3827 [](auto &history
, auto &stats
) {
3828 stats
.stats
.sum
.num_promote
++;
3833 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
3836 dout(10) << __func__
<< " " << ctx
<< dendl
;
3837 ctx
->reset_obs(ctx
->obc
);
3838 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
3839 OpRequestRef op
= ctx
->op
;
3840 auto m
= op
->get_req
<MOSDOp
>();
3841 ObjectContextRef obc
= ctx
->obc
;
3842 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
3844 // this method must be idempotent since we may call it several times
3845 // before we finally apply the resulting transaction.
3846 ctx
->op_t
.reset(new PGTransaction
);
3848 if (op
->may_write() || op
->may_cache()) {
3850 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
3851 pool
.info
.is_pool_snaps_mode()) {
3853 ctx
->snapc
= pool
.snapc
;
3855 // client specified snapc
3856 ctx
->snapc
.seq
= m
->get_snap_seq();
3857 ctx
->snapc
.snaps
= m
->get_snaps();
3858 filter_snapc(ctx
->snapc
.snaps
);
3860 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
3861 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
3862 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
3863 << " < snapset seq " << obc
->ssc
->snapset
.seq
3864 << " on " << obc
->obs
.oi
.soid
<< dendl
;
3865 reply_ctx(ctx
, -EOLDSNAPC
);
3870 ctx
->at_version
= get_next_version();
3871 ctx
->mtime
= m
->get_mtime();
3873 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
3874 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
3875 << " snapc " << ctx
->snapc
3876 << " snapset " << obc
->ssc
->snapset
3879 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
3880 << " ov " << obc
->obs
.oi
.version
3884 if (!ctx
->user_at_version
)
3885 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
3886 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
3890 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3892 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
3893 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3896 int result
= prepare_transaction(ctx
);
3900 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3902 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
3903 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3906 bool pending_async_reads
= !ctx
->pending_async_reads
.empty();
3907 if (result
== -EINPROGRESS
|| pending_async_reads
) {
3909 if (pending_async_reads
) {
3910 ceph_assert(pool
.info
.is_erasure());
3911 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
3912 ctx
->start_async_reads(this);
3917 if (result
== -EAGAIN
) {
3918 // clean up after the ctx
3923 bool ignore_out_data
= false;
3924 if (!ctx
->op_t
->empty() &&
3927 // successful update
3928 if (ctx
->op
->allows_returnvec()) {
3929 // enforce reasonable bound on the return buffer sizes
3930 for (auto& i
: *ctx
->ops
) {
3931 if (i
.outdata
.length() > cct
->_conf
->osd_max_write_op_reply_len
) {
3932 dout(10) << __func__
<< " op " << i
<< " outdata overflow" << dendl
;
3933 result
= -EOVERFLOW
; // overall result is overflow
3934 i
.rval
= -EOVERFLOW
;
3939 // legacy behavior -- zero result and return data etc.
3940 ignore_out_data
= true;
3945 // prepare the reply
3946 ctx
->reply
= new MOSDOpReply(m
, result
, get_osdmap_epoch(), 0,
3948 dout(20) << __func__
<< " alloc reply " << ctx
->reply
3949 << " result " << result
<< dendl
;
3952 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
3953 // finish side-effects
3955 do_osd_op_effects(ctx
, m
->get_connection());
3957 complete_read_ctx(result
, ctx
);
3961 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
3963 ceph_assert(op
->may_write() || op
->may_cache());
3966 recovery_state
.update_trim_to();
3968 // verify that we are doing this in order?
3969 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
3970 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
3971 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
3972 ceph_tid_t t
= m
->get_tid();
3973 client_t n
= m
->get_source().num();
3974 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
3975 if (p
== cm
.end()) {
3976 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
3979 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
3980 if (p
->second
> t
) {
3981 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
3982 ceph_abort_msg("out of order op");
3988 if (ctx
->update_log_only
) {
3990 do_osd_op_effects(ctx
, m
->get_connection());
3992 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
3993 // save just what we need from ctx
3994 MOSDOpReply
*reply
= ctx
->reply
;
3995 ctx
->reply
= nullptr;
3996 reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
3998 if (result
== -ENOENT
) {
3999 reply
->set_enoent_reply_versions(info
.last_update
,
4000 info
.last_user_version
);
4002 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4003 // append to pg log for dup detection - don't save buffers for now
4004 record_write_error(op
, soid
, reply
, result
,
4005 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
4010 // no need to capture PG ref, repop cancel will handle that
4011 // Can capture the ctx by pointer, it's owned by the repop
4012 ctx
->register_on_commit(
4015 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
4017 if (m
&& !ctx
->sent_reply
) {
4018 MOSDOpReply
*reply
= ctx
->reply
;
4019 ctx
->reply
= nullptr;
4020 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4021 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
4022 osd
->send_message_osd_client(reply
, m
->get_connection());
4023 ctx
->sent_reply
= true;
4024 ctx
->op
->mark_commit_sent();
4027 ctx
->register_on_success(
4031 ctx
->op
? ctx
->op
->get_req()->get_connection() :
4034 ctx
->register_on_finish(
4039 // issue replica writes
4040 ceph_tid_t rep_tid
= osd
->get_tid();
4042 RepGather
*repop
= new_repop(ctx
, obc
, rep_tid
);
4044 issue_repop(repop
, ctx
);
4049 void PrimaryLogPG::close_op_ctx(OpContext
*ctx
) {
4050 release_object_locks(ctx
->lock_manager
);
4054 for (auto p
= ctx
->on_finish
.begin(); p
!= ctx
->on_finish
.end();
4055 ctx
->on_finish
.erase(p
++)) {
4061 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
4064 osd
->reply_op_error(ctx
->op
, r
);
4068 void PrimaryLogPG::log_op_stats(const OpRequest
& op
,
4070 const uint64_t outb
)
4072 auto m
= op
.get_req
<MOSDOp
>();
4073 const utime_t now
= ceph_clock_now();
4075 const utime_t latency
= now
- m
->get_recv_stamp();
4076 const utime_t process_latency
= now
- op
.get_dequeued_time();
4078 osd
->logger
->inc(l_osd_op
);
4080 osd
->logger
->inc(l_osd_op_outb
, outb
);
4081 osd
->logger
->inc(l_osd_op_inb
, inb
);
4082 osd
->logger
->tinc(l_osd_op_lat
, latency
);
4083 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
4085 if (op
.may_read() && op
.may_write()) {
4086 osd
->logger
->inc(l_osd_op_rw
);
4087 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
4088 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
4089 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
4090 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
4091 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
4092 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
4093 } else if (op
.may_read()) {
4094 osd
->logger
->inc(l_osd_op_r
);
4095 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
4096 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
4097 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
4098 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
4099 } else if (op
.may_write() || op
.may_cache()) {
4100 osd
->logger
->inc(l_osd_op_w
);
4101 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
4102 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
4103 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
4104 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
4109 dout(15) << "log_op_stats " << *m
4112 << " lat " << latency
<< dendl
;
4114 if (m_dynamic_perf_stats
.is_enabled()) {
4115 m_dynamic_perf_stats
.add(osd
, info
, op
, inb
, outb
, latency
);
4119 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4120 const std::list
<OSDPerfMetricQuery
> &queries
)
4122 m_dynamic_perf_stats
.set_queries(queries
);
4125 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats
*stats
)
4127 std::swap(m_dynamic_perf_stats
, *stats
);
4130 void PrimaryLogPG::do_scan(
4132 ThreadPool::TPHandle
&handle
)
4134 auto m
= op
->get_req
<MOSDPGScan
>();
4135 ceph_assert(m
->get_type() == MSG_OSD_PG_SCAN
);
4136 dout(10) << "do_scan " << *m
<< dendl
;
4141 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
4143 auto dpp
= get_dpp();
4144 if (osd
->check_backfill_full(dpp
)) {
4145 dout(1) << __func__
<< ": Canceling backfill: Full." << dendl
;
4146 queue_peering_event(
4148 std::make_shared
<PGPeeringEvent
>(
4151 PeeringState::BackfillTooFull())));
4155 BackfillInterval bi
;
4156 bi
.begin
= m
->begin
;
4157 // No need to flush, there won't be any in progress writes occuring
4160 cct
->_conf
->osd_backfill_scan_min
,
4161 cct
->_conf
->osd_backfill_scan_max
,
4164 MOSDPGScan
*reply
= new MOSDPGScan(
4165 MOSDPGScan::OP_SCAN_DIGEST
,
4167 get_osdmap_epoch(), m
->query_epoch
,
4168 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
4169 encode(bi
.objects
, reply
->get_data());
4170 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4174 case MOSDPGScan::OP_SCAN_DIGEST
:
4176 pg_shard_t from
= m
->from
;
4178 // Check that from is in backfill_targets vector
4179 ceph_assert(is_backfill_target(from
));
4181 BackfillInterval
& bi
= peer_backfill_info
[from
];
4182 bi
.begin
= m
->begin
;
4184 auto p
= m
->get_data().cbegin();
4186 // take care to preserve ordering!
4188 ::decode_noclear(bi
.objects
, p
);
4190 if (waiting_on_backfill
.erase(from
)) {
4191 if (waiting_on_backfill
.empty()) {
4193 peer_backfill_info
.size() ==
4194 get_backfill_targets().size());
4195 finish_recovery_op(hobject_t::get_max());
4198 // we canceled backfill for a while due to a too full, and this
4199 // is an extra response from a non-too-full peer
4200 dout(20) << __func__
<< " canceled backfill (too full?)" << dendl
;
4207 void PrimaryLogPG::do_backfill(OpRequestRef op
)
4209 auto m
= op
->get_req
<MOSDPGBackfill
>();
4210 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
4211 dout(10) << "do_backfill " << *m
<< dendl
;
4216 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
4218 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
4220 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
4221 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
4224 spg_t(info
.pgid
.pgid
, get_primary().shard
));
4225 reply
->set_priority(get_recovery_op_priority());
4226 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4227 queue_peering_event(
4229 std::make_shared
<PGPeeringEvent
>(
4236 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
4238 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
4240 ObjectStore::Transaction t
;
4241 recovery_state
.update_backfill_progress(
4244 m
->op
== MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
4247 int tr
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4248 ceph_assert(tr
== 0);
4252 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
4254 ceph_assert(is_primary());
4255 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
4256 finish_recovery_op(hobject_t::get_max());
4262 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
4264 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
4266 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
4267 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
4271 ObjectStore::Transaction t
;
4272 for (auto& p
: m
->ls
) {
4273 if (is_remote_backfilling()) {
4275 int r
= osd
->store
->stat(ch
, ghobject_t(p
.first
, ghobject_t::NO_GEN
,
4276 pg_whoami
.shard
) , &st
);
4278 sub_local_num_bytes(st
.st_size
);
4280 if (pool
.info
.is_erasure()) {
4282 int r
= osd
->store
->getattr(
4284 ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
4288 object_info_t
oi(bv
);
4289 usersize
= oi
.size
* pgbackend
->get_ec_data_chunk_count();
4291 dout(0) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4292 << " can't get object info" << dendl
;
4296 usersize
= st
.st_size
;
4298 sub_num_bytes(usersize
);
4299 dout(10) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4300 << " sub actual data by " << st
.st_size
4301 << " sub num_bytes by " << usersize
4305 remove_snap_mapped_object(t
, p
.first
);
4307 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4308 ceph_assert(r
== 0);
4311 int PrimaryLogPG::trim_object(
4312 bool first
, const hobject_t
&coid
, snapid_t snap_to_trim
,
4313 PrimaryLogPG::OpContextUPtr
*ctxp
)
4319 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
4320 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
4321 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4322 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
4326 hobject_t head_oid
= coid
.get_head();
4327 ObjectContextRef head_obc
= get_object_context(head_oid
, false);
4329 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4330 << " repair needed, no snapset obc for " << head_oid
;
4334 SnapSet
& snapset
= obc
->ssc
->snapset
;
4336 object_info_t
&coi
= obc
->obs
.oi
;
4337 auto citer
= snapset
.clone_snaps
.find(coid
.snap
);
4338 if (citer
== snapset
.clone_snaps
.end()) {
4339 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
4340 << " for object " << coid
<< "\n";
4343 set
<snapid_t
> old_snaps(citer
->second
.begin(), citer
->second
.end());
4344 if (old_snaps
.empty()) {
4345 osd
->clog
->error() << "No object info snaps for object " << coid
;
4349 dout(10) << coid
<< " old_snaps " << old_snaps
4350 << " old snapset " << snapset
<< dendl
;
4351 if (snapset
.seq
== 0) {
4352 osd
->clog
->error() << "No snapset.seq for object " << coid
;
4356 set
<snapid_t
> new_snaps
;
4357 const OSDMapRef
& osdmap
= get_osdmap();
4358 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
4359 i
!= old_snaps
.end();
4361 if (!osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *i
) &&
4362 *i
!= snap_to_trim
) {
4363 new_snaps
.insert(*i
);
4367 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
4369 if (new_snaps
.empty()) {
4370 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
4371 if (p
== snapset
.clones
.end()) {
4372 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
4377 OpContextUPtr ctx
= simple_opc_create(obc
);
4378 ctx
->head_obc
= head_obc
;
4380 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4384 close_op_ctx(ctx
.release());
4385 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
4389 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4393 close_op_ctx(ctx
.release());
4394 dout(10) << __func__
<< ": Unable to get a wlock on " << head_oid
<< dendl
;
4398 ctx
->at_version
= get_next_version();
4400 PGTransaction
*t
= ctx
->op_t
.get();
4402 if (new_snaps
.empty()) {
4404 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
4405 << new_snaps
<< " ... deleting" << dendl
;
4408 ceph_assert(p
!= snapset
.clones
.end());
4410 snapid_t last
= coid
.snap
;
4411 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
4413 if (p
!= snapset
.clones
.begin()) {
4414 // not the oldest... merge overlap into next older clone
4415 vector
<snapid_t
>::iterator n
= p
- 1;
4416 hobject_t prev_coid
= coid
;
4417 prev_coid
.snap
= *n
;
4418 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
4420 if (adjust_prev_bytes
)
4421 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
4423 snapset
.clone_overlap
[*n
].intersection_of(
4424 snapset
.clone_overlap
[*p
]);
4426 if (adjust_prev_bytes
)
4427 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
4429 ctx
->delta_stats
.num_objects
--;
4431 ctx
->delta_stats
.num_objects_dirty
--;
4433 ctx
->delta_stats
.num_objects_omap
--;
4434 if (coi
.is_whiteout()) {
4435 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
4436 ctx
->delta_stats
.num_whiteouts
--;
4438 ctx
->delta_stats
.num_object_clones
--;
4439 if (coi
.is_cache_pinned())
4440 ctx
->delta_stats
.num_objects_pinned
--;
4441 if (coi
.has_manifest())
4442 ctx
->delta_stats
.num_objects_manifest
--;
4443 obc
->obs
.exists
= false;
4445 snapset
.clones
.erase(p
);
4446 snapset
.clone_overlap
.erase(last
);
4447 snapset
.clone_size
.erase(last
);
4448 snapset
.clone_snaps
.erase(last
);
4452 pg_log_entry_t::DELETE
,
4455 ctx
->obs
->oi
.version
,
4467 coi
= object_info_t(coid
);
4469 ctx
->at_version
.version
++;
4471 // save adjusted snaps for this object
4472 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
4473 snapset
.clone_snaps
[coid
.snap
] =
4474 vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
4475 // we still do a 'modify' event on this object just to trigger a
4476 // snapmapper.update ... :(
4478 coi
.prior_version
= coi
.version
;
4479 coi
.version
= ctx
->at_version
;
4481 encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4482 t
->setattr(coid
, OI_ATTR
, bl
);
4486 pg_log_entry_t::MODIFY
,
4495 ctx
->at_version
.version
++;
4503 // save head snapset
4504 dout(10) << coid
<< " new snapset " << snapset
<< " on "
4505 << head_obc
->obs
.oi
<< dendl
;
4506 if (snapset
.clones
.empty() &&
4507 (head_obc
->obs
.oi
.is_whiteout() &&
4508 !(head_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
4509 !head_obc
->obs
.oi
.is_cache_pinned())) {
4510 // NOTE: this arguably constitutes minor interference with the
4511 // tiering agent if this is a cache tier since a snap trim event
4512 // is effectively evicting a whiteout we might otherwise want to
4514 dout(10) << coid
<< " removing " << head_oid
<< dendl
;
4517 pg_log_entry_t::DELETE
,
4520 head_obc
->obs
.oi
.version
,
4526 derr
<< "removing snap head" << dendl
;
4527 object_info_t
& oi
= head_obc
->obs
.oi
;
4528 ctx
->delta_stats
.num_objects
--;
4529 if (oi
.is_dirty()) {
4530 ctx
->delta_stats
.num_objects_dirty
--;
4533 ctx
->delta_stats
.num_objects_omap
--;
4534 if (oi
.is_whiteout()) {
4535 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
4536 ctx
->delta_stats
.num_whiteouts
--;
4538 if (oi
.is_cache_pinned()) {
4539 ctx
->delta_stats
.num_objects_pinned
--;
4541 if (coi
.has_manifest())
4542 ctx
->delta_stats
.num_objects_manifest
--;
4543 head_obc
->obs
.exists
= false;
4544 head_obc
->obs
.oi
= object_info_t(head_oid
);
4545 t
->remove(head_oid
);
4547 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
4548 // filter SnapSet::snaps for the benefit of pre-octopus
4549 // peers. This is perhaps overly conservative in that I'm not
4550 // certain they need this, but let's be conservative here.
4551 dout(10) << coid
<< " filtering snapset on " << head_oid
<< dendl
;
4552 snapset
.filter(pool
.info
);
4554 snapset
.snaps
.clear();
4556 dout(10) << coid
<< " writing updated snapset on " << head_oid
4557 << ", snapset is " << snapset
<< dendl
;
4560 pg_log_entry_t::MODIFY
,
4563 head_obc
->obs
.oi
.version
,
4570 head_obc
->obs
.oi
.prior_version
= head_obc
->obs
.oi
.version
;
4571 head_obc
->obs
.oi
.version
= ctx
->at_version
;
4573 map
<string
, bufferlist
> attrs
;
4575 encode(snapset
, bl
);
4576 attrs
[SS_ATTR
].claim(bl
);
4579 encode(head_obc
->obs
.oi
, bl
,
4580 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4581 attrs
[OI_ATTR
].claim(bl
);
4582 t
->setattrs(head_oid
, attrs
);
4585 *ctxp
= std::move(ctx
);
4589 void PrimaryLogPG::kick_snap_trim()
4591 ceph_assert(is_active());
4592 ceph_assert(is_primary());
4594 !state_test(PG_STATE_PREMERGE
) &&
4595 !snap_trimq
.empty()) {
4596 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM
)) {
4597 dout(10) << __func__
<< ": nosnaptrim set, not kicking" << dendl
;
4599 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
4600 snap_trimmer_machine
.process_event(KickTrim());
4605 void PrimaryLogPG::snap_trimmer_scrub_complete()
4607 if (is_primary() && is_active() && is_clean()) {
4608 ceph_assert(!snap_trimq
.empty());
4609 snap_trimmer_machine
.process_event(ScrubComplete());
4613 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
4615 if (recovery_state
.is_deleting() || pg_has_reset_since(queued
)) {
4619 ceph_assert(is_primary());
4621 dout(10) << "snap_trimmer posting" << dendl
;
4622 snap_trimmer_machine
.process_event(DoSnapWork());
4623 dout(10) << "snap_trimmer complete" << dendl
;
4627 int PrimaryLogPG::do_xattr_cmp_u64(int op
, __u64 v1
, bufferlist
& xattr
)
4631 string
v2s(xattr
.c_str(), xattr
.length());
4633 v2
= strtoull(v2s
.c_str(), NULL
, 10);
4637 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
4640 case CEPH_OSD_CMPXATTR_OP_EQ
:
4642 case CEPH_OSD_CMPXATTR_OP_NE
:
4644 case CEPH_OSD_CMPXATTR_OP_GT
:
4646 case CEPH_OSD_CMPXATTR_OP_GTE
:
4648 case CEPH_OSD_CMPXATTR_OP_LT
:
4650 case CEPH_OSD_CMPXATTR_OP_LTE
:
4657 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4659 string
v2s(xattr
.c_str(), xattr
.length());
4661 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
4664 case CEPH_OSD_CMPXATTR_OP_EQ
:
4665 return (v1s
.compare(v2s
) == 0);
4666 case CEPH_OSD_CMPXATTR_OP_NE
:
4667 return (v1s
.compare(v2s
) != 0);
4668 case CEPH_OSD_CMPXATTR_OP_GT
:
4669 return (v1s
.compare(v2s
) > 0);
4670 case CEPH_OSD_CMPXATTR_OP_GTE
:
4671 return (v1s
.compare(v2s
) >= 0);
4672 case CEPH_OSD_CMPXATTR_OP_LT
:
4673 return (v1s
.compare(v2s
) < 0);
4674 case CEPH_OSD_CMPXATTR_OP_LTE
:
4675 return (v1s
.compare(v2s
) <= 0);
4681 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4683 ceph_osd_op
& op
= osd_op
.op
;
4684 vector
<OSDOp
> write_ops(1);
4685 OSDOp
& write_op
= write_ops
[0];
4686 uint64_t write_length
= op
.writesame
.length
;
4692 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4695 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4696 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4700 while (write_length
) {
4701 write_op
.indata
.append(osd_op
.indata
);
4702 write_length
-= op
.writesame
.data_length
;
4705 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
4706 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
4707 write_op
.op
.extent
.length
= op
.writesame
.length
;
4708 result
= do_osd_ops(ctx
, write_ops
);
4710 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
4715 // ========================================================================
4716 // low level osd ops
4718 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
4720 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
4721 bufferlist header
, vals
;
4722 int r
= _get_tmap(ctx
, &header
, &vals
);
4724 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
4729 vector
<OSDOp
> ops(3);
4731 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
4732 ops
[0].op
.extent
.offset
= 0;
4733 ops
[0].op
.extent
.length
= 0;
4735 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
4736 ops
[1].indata
.claim(header
);
4738 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
4739 ops
[2].indata
.claim(vals
);
4741 return do_osd_ops(ctx
, ops
);
4744 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
4745 OSDOp
& osd_op
, bufferlist
& bl
)
4749 map
<string
, bufferlist
> m
;
4751 auto p
= bl
.cbegin();
4754 ceph_assert(p
.end());
4764 case CEPH_OSD_TMAP_SET
: // insert key
4772 case CEPH_OSD_TMAP_RM
: // remove key
4774 if (!m
.count(key
)) {
4779 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
4783 case CEPH_OSD_TMAP_HDR
: // update header
4795 encode(header
, obl
);
4799 vector
<OSDOp
> nops(1);
4800 OSDOp
& newop
= nops
[0];
4801 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4802 newop
.op
.extent
.offset
= 0;
4803 newop
.op
.extent
.length
= obl
.length();
4805 do_osd_ops(ctx
, nops
);
4809 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::const_iterator
& bp
, OSDOp
& osd_op
)
4811 bufferlist::const_iterator orig_bp
= bp
;
4814 dout(10) << "tmapup is a no-op" << dendl
;
4816 // read the whole object
4817 vector
<OSDOp
> nops(1);
4818 OSDOp
& newop
= nops
[0];
4819 newop
.op
.op
= CEPH_OSD_OP_READ
;
4820 newop
.op
.extent
.offset
= 0;
4821 newop
.op
.extent
.length
= 0;
4822 result
= do_osd_ops(ctx
, nops
);
4824 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
4826 dout(30) << " starting is \n";
4827 newop
.outdata
.hexdump(*_dout
);
4830 auto ip
= newop
.outdata
.cbegin();
4833 dout(30) << "the update command is: \n";
4834 osd_op
.indata
.hexdump(*_dout
);
4840 if (newop
.outdata
.length()) {
4844 dout(10) << "tmapup header " << header
.length() << dendl
;
4846 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
4849 dout(10) << "tmapup new header " << header
.length() << dendl
;
4852 encode(header
, obl
);
4854 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
4857 bufferlist newkeydata
;
4858 string nextkey
, last_in_key
;
4860 bool have_next
= false;
4863 decode(nextkey
, ip
);
4864 decode(nextval
, ip
);
4866 while (!bp
.end() && !result
) {
4873 catch (buffer::error
& e
) {
4876 if (key
< last_in_key
) {
4877 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
4878 << "', falling back to an inefficient (unsorted) update" << dendl
;
4880 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
4884 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
4886 // skip existing intervening keys
4887 bool key_exists
= false;
4888 while (have_next
&& !key_exists
) {
4889 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
4892 if (nextkey
< key
) {
4894 encode(nextkey
, newkeydata
);
4895 encode(nextval
, newkeydata
);
4896 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4898 // don't copy; discard old value. and stop.
4899 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
4904 decode(nextkey
, ip
);
4905 decode(nextval
, ip
);
4911 if (op
== CEPH_OSD_TMAP_SET
) {
4916 catch (buffer::error
& e
) {
4919 encode(key
, newkeydata
);
4920 encode(val
, newkeydata
);
4921 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
4923 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
4931 catch (buffer::error
& e
) {
4934 encode(key
, newkeydata
);
4935 encode(val
, newkeydata
);
4936 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
4938 } else if (op
== CEPH_OSD_TMAP_RM
) {
4943 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
4946 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
4953 encode(nextkey
, newkeydata
);
4954 encode(nextval
, newkeydata
);
4955 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4959 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
4960 dout(20) << " keep trailing " << rest
.length()
4961 << " at " << newkeydata
.length() << dendl
;
4962 newkeydata
.claim_append(rest
);
4965 // encode final key count + key data
4966 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
4968 obl
.claim_append(newkeydata
);
4971 dout(30) << " final is \n";
4972 obl
.hexdump(*_dout
);
4976 auto tp
= obl
.cbegin();
4979 map
<string
,bufferlist
> d
;
4981 ceph_assert(tp
.end());
4982 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
4987 dout(20) << "tmapput write " << obl
.length() << dendl
;
4988 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4989 newop
.op
.extent
.offset
= 0;
4990 newop
.op
.extent
.length
= obl
.length();
4992 do_osd_ops(ctx
, nops
);
4998 static int check_offset_and_length(uint64_t offset
, uint64_t length
,
4999 uint64_t max
, DoutPrefixProvider
*dpp
)
5001 if (offset
>= max
||
5003 offset
+ length
> max
) {
5004 ldpp_dout(dpp
, 10) << __func__
<< " "
5005 << "osd_max_object_size: " << max
5006 << "; Hard limit of object size is 4GB." << dendl
;
5013 struct FillInVerifyExtent
: public Context
{
5016 bufferlist
*outdatap
;
5017 std::optional
<uint32_t> maybe_crc
;
5022 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
5023 std::optional
<uint32_t> mc
, uint64_t size
,
5024 OSDService
*osd
, hobject_t soid
, uint32_t flags
) :
5025 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
5026 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
5027 void finish(int len
) override
{
5035 // whole object? can we verify the checksum?
5036 if (maybe_crc
&& *r
== size
) {
5037 uint32_t crc
= outdatap
->crc32c(-1);
5038 if (maybe_crc
!= crc
) {
5039 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
5040 << " != expected 0x" << *maybe_crc
5041 << std::dec
<< " on " << soid
;
5042 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
5051 struct ToSparseReadResult
: public Context
{
5053 bufferlist
* data_bl
;
5054 uint64_t data_offset
;
5056 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
5058 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
5059 void finish(int r
) override
{
5067 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
5068 encode(extents
, outdata
);
5069 ::encode_destructively(*data_bl
, outdata
);
5070 data_bl
->swap(outdata
);
5074 template<typename V
>
5075 static string
list_keys(const map
<string
, V
>& m
) {
5077 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5081 s
.append(itr
->first
);
5086 template<typename T
>
5087 static string
list_entries(const T
& m
) {
5089 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5098 void PrimaryLogPG::maybe_create_new_object(
5100 bool ignore_transaction
)
5102 ObjectState
& obs
= ctx
->new_obs
;
5104 ctx
->delta_stats
.num_objects
++;
5106 ceph_assert(!obs
.oi
.is_whiteout());
5107 obs
.oi
.new_object();
5108 if (!ignore_transaction
)
5109 ctx
->op_t
->create(obs
.oi
.soid
);
5110 } else if (obs
.oi
.is_whiteout()) {
5111 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
5112 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
5113 --ctx
->delta_stats
.num_whiteouts
;
5117 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
5120 explicit ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
5123 int execute() override
{
5128 struct C_ChecksumRead
: public Context
{
5129 PrimaryLogPG
*primary_log_pg
;
5131 Checksummer::CSumType csum_type
;
5132 bufferlist init_value_bl
;
5133 ceph_le64 read_length
;
5135 Context
*fill_extent_ctx
;
5137 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5138 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
5139 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5140 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5141 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5142 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
5143 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5144 &read_bl
, maybe_crc
, size
,
5145 osd
, soid
, flags
)) {
5147 ~C_ChecksumRead() override
{
5148 delete fill_extent_ctx
;
5151 void finish(int r
) override
{
5152 fill_extent_ctx
->complete(r
);
5153 fill_extent_ctx
= nullptr;
5155 if (osd_op
.rval
>= 0) {
5156 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5157 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
5158 &init_value_bl_it
, read_bl
);
5163 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
5164 bufferlist::const_iterator
*bl_it
)
5166 dout(20) << __func__
<< dendl
;
5168 auto& op
= osd_op
.op
;
5169 if (op
.checksum
.chunk_size
> 0) {
5170 if (op
.checksum
.length
== 0) {
5171 dout(10) << __func__
<< ": length required when chunk size provided"
5175 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5176 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
5181 auto& oi
= ctx
->new_obs
.oi
;
5182 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
5183 // zeroed offset+length implies checksum whole object
5184 op
.checksum
.length
= oi
.size
;
5185 } else if (op
.checksum
.offset
>= oi
.size
) {
5186 // read size was trimmed to zero, do nothing
5187 // see PrimaryLogPG::do_read
5189 } else if (op
.extent
.offset
+ op
.extent
.length
> oi
.size
) {
5190 op
.extent
.length
= oi
.size
- op
.extent
.offset
;
5191 if (op
.checksum
.chunk_size
> 0 &&
5192 op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5193 dout(10) << __func__
<< ": length (trimmed to 0x"
5194 << std::hex
<< op
.checksum
.length
5195 << ") not aligned to chunk size 0x"
5196 << op
.checksum
.chunk_size
<< std::dec
5202 Checksummer::CSumType csum_type
;
5203 switch (op
.checksum
.type
) {
5204 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
5205 csum_type
= Checksummer::CSUM_XXHASH32
;
5207 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
5208 csum_type
= Checksummer::CSUM_XXHASH64
;
5210 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
5211 csum_type
= Checksummer::CSUM_CRC32C
;
5214 dout(10) << __func__
<< ": unknown crc type ("
5215 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
5219 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
5220 if (bl_it
->get_remaining() < csum_init_value_size
) {
5221 dout(10) << __func__
<< ": init value not provided" << dendl
;
5225 bufferlist init_value_bl
;
5226 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
5227 csum_init_value_size
);
5228 *bl_it
+= csum_init_value_size
;
5230 if (pool
.info
.is_erasure() && op
.checksum
.length
> 0) {
5231 // If there is a data digest and it is possible we are reading
5232 // entire object, pass the digest.
5233 std::optional
<uint32_t> maybe_crc
;
5234 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5235 op
.checksum
.length
>= oi
.size
) {
5236 maybe_crc
= oi
.data_digest
;
5240 auto& soid
= oi
.soid
;
5241 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
5242 std::move(init_value_bl
), maybe_crc
,
5243 oi
.size
, osd
, soid
, op
.flags
);
5245 ctx
->pending_async_reads
.push_back({
5246 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
5247 {&checksum_ctx
->read_bl
, checksum_ctx
}});
5249 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5250 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5251 new ReadFinisher(osd_op
));
5252 return -EINPROGRESS
;
5256 std::vector
<OSDOp
> read_ops(1);
5257 auto& read_op
= read_ops
[0];
5258 if (op
.checksum
.length
> 0) {
5259 read_op
.op
.op
= CEPH_OSD_OP_READ
;
5260 read_op
.op
.flags
= op
.flags
;
5261 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
5262 read_op
.op
.extent
.length
= op
.checksum
.length
;
5263 read_op
.op
.extent
.truncate_size
= 0;
5264 read_op
.op
.extent
.truncate_seq
= 0;
5266 int r
= do_osd_ops(ctx
, read_ops
);
5268 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
5273 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5274 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
5278 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
5279 Checksummer::CSumType csum_type
,
5280 bufferlist::const_iterator
*init_value_bl_it
,
5281 const bufferlist
&read_bl
) {
5282 dout(20) << __func__
<< dendl
;
5284 auto& op
= osd_op
.op
;
5286 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
5287 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
5288 << op
.checksum
.length
<< dendl
;
5292 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
5293 op
.checksum
.chunk_size
: read_bl
.length());
5294 uint32_t csum_count
= (csum_chunk_size
> 0 ?
5295 read_bl
.length() / csum_chunk_size
: 0);
5298 bufferptr csum_data
;
5299 if (csum_count
> 0) {
5300 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
5301 csum_data
= buffer::create(csum_value_size
* csum_count
);
5303 csum
.append(csum_data
);
5305 switch (csum_type
) {
5306 case Checksummer::CSUM_XXHASH32
:
5308 Checksummer::xxhash32::init_value_t init_value
;
5309 decode(init_value
, *init_value_bl_it
);
5310 Checksummer::calculate
<Checksummer::xxhash32
>(
5311 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5315 case Checksummer::CSUM_XXHASH64
:
5317 Checksummer::xxhash64::init_value_t init_value
;
5318 decode(init_value
, *init_value_bl_it
);
5319 Checksummer::calculate
<Checksummer::xxhash64
>(
5320 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5324 case Checksummer::CSUM_CRC32C
:
5326 Checksummer::crc32c::init_value_t init_value
;
5327 decode(init_value
, *init_value_bl_it
);
5328 Checksummer::calculate
<Checksummer::crc32c
>(
5329 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5338 encode(csum_count
, osd_op
.outdata
);
5339 osd_op
.outdata
.claim_append(csum
);
5343 struct C_ExtentCmpRead
: public Context
{
5344 PrimaryLogPG
*primary_log_pg
;
5346 ceph_le64 read_length
{};
5348 Context
*fill_extent_ctx
;
5350 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5351 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5352 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5353 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5354 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5355 &read_bl
, maybe_crc
, size
,
5356 osd
, soid
, flags
)) {
5358 ~C_ExtentCmpRead() override
{
5359 delete fill_extent_ctx
;
5362 void finish(int r
) override
{
5366 delete fill_extent_ctx
;
5368 fill_extent_ctx
->complete(r
);
5370 fill_extent_ctx
= nullptr;
5372 if (osd_op
.rval
>= 0) {
5373 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
5378 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
5380 dout(20) << __func__
<< dendl
;
5381 ceph_osd_op
& op
= osd_op
.op
;
5383 auto& oi
= ctx
->new_obs
.oi
;
5384 uint64_t size
= oi
.size
;
5385 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5386 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
5387 size
= op
.extent
.truncate_size
;
5390 if (op
.extent
.offset
>= size
) {
5391 op
.extent
.length
= 0;
5392 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5393 op
.extent
.length
= size
- op
.extent
.offset
;
5396 if (op
.extent
.length
== 0) {
5397 dout(20) << __func__
<< " zero length extent" << dendl
;
5398 return finish_extent_cmp(osd_op
, bufferlist
{});
5399 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
5400 dout(20) << __func__
<< " object DNE" << dendl
;
5401 return finish_extent_cmp(osd_op
, {});
5402 } else if (pool
.info
.is_erasure()) {
5403 // If there is a data digest and it is possible we are reading
5404 // entire object, pass the digest.
5405 std::optional
<uint32_t> maybe_crc
;
5406 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5407 op
.checksum
.length
>= oi
.size
) {
5408 maybe_crc
= oi
.data_digest
;
5412 auto& soid
= oi
.soid
;
5413 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
5414 osd
, soid
, op
.flags
);
5415 ctx
->pending_async_reads
.push_back({
5416 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
5417 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
5419 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5421 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5422 new ReadFinisher(osd_op
));
5423 return -EINPROGRESS
;
5427 vector
<OSDOp
> read_ops(1);
5428 OSDOp
& read_op
= read_ops
[0];
5430 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
5431 read_op
.op
.extent
.offset
= op
.extent
.offset
;
5432 read_op
.op
.extent
.length
= op
.extent
.length
;
5433 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
5434 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
5436 int result
= do_osd_ops(ctx
, read_ops
);
5438 derr
<< __func__
<< " failed " << result
<< dendl
;
5441 return finish_extent_cmp(osd_op
, read_op
.outdata
);
5444 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
5446 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
5447 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
5448 if (osd_op
.indata
[idx
] != read_byte
) {
5449 return (-MAX_ERRNO
- idx
);
5456 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5457 dout(20) << __func__
<< dendl
;
5458 auto& op
= osd_op
.op
;
5459 auto& oi
= ctx
->new_obs
.oi
;
5460 auto& soid
= oi
.soid
;
5461 __u32 seq
= oi
.truncate_seq
;
5462 uint64_t size
= oi
.size
;
5463 bool trimmed_read
= false;
5465 dout(30) << __func__
<< " oi.size: " << oi
.size
<< dendl
;
5466 dout(30) << __func__
<< " oi.truncate_seq: " << oi
.truncate_seq
<< dendl
;
5467 dout(30) << __func__
<< " op.extent.truncate_seq: " << op
.extent
.truncate_seq
<< dendl
;
5468 dout(30) << __func__
<< " op.extent.truncate_size: " << op
.extent
.truncate_size
<< dendl
;
5470 // are we beyond truncate_size?
5471 if ( (seq
< op
.extent
.truncate_seq
) &&
5472 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5473 (size
> op
.extent
.truncate_size
) )
5474 size
= op
.extent
.truncate_size
;
5476 if (op
.extent
.length
== 0) //length is zero mean read the whole object
5477 op
.extent
.length
= size
;
5479 if (op
.extent
.offset
>= size
) {
5480 op
.extent
.length
= 0;
5481 trimmed_read
= true;
5482 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5483 op
.extent
.length
= size
- op
.extent
.offset
;
5484 trimmed_read
= true;
5487 dout(30) << __func__
<< "op.extent.length is now " << op
.extent
.length
<< dendl
;
5489 // read into a buffer
5491 if (trimmed_read
&& op
.extent
.length
== 0) {
5492 // read size was trimmed to zero and it is expected to do nothing
5493 // a read operation of 0 bytes does *not* do nothing, this is why
5494 // the trimmed_read boolean is needed
5495 } else if (pool
.info
.is_erasure()) {
5496 // The initialisation below is required to silence a false positive
5497 // -Wmaybe-uninitialized warning
5498 std::optional
<uint32_t> maybe_crc
;
5499 // If there is a data digest and it is possible we are reading
5500 // entire object, pass the digest. FillInVerifyExtent will
5501 // will check the oi.size again.
5502 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
5503 op
.extent
.length
>= oi
.size
)
5504 maybe_crc
= oi
.data_digest
;
5505 ctx
->pending_async_reads
.push_back(
5507 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
5508 make_pair(&osd_op
.outdata
,
5509 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
5510 &osd_op
.outdata
, maybe_crc
, oi
.size
,
5511 osd
, soid
, op
.flags
))));
5512 dout(10) << " async_read noted for " << soid
<< dendl
;
5514 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5515 new ReadFinisher(osd_op
));
5517 int r
= pgbackend
->objects_read_sync(
5518 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
5519 // whole object? can we verify the checksum?
5520 if (r
>= 0 && op
.extent
.offset
== 0 &&
5521 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5522 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
5523 if (oi
.data_digest
!= crc
) {
5524 osd
->clog
->error() << info
.pgid
<< std::hex
5525 << " full-object read crc 0x" << crc
5526 << " != expected 0x" << oi
.data_digest
5527 << std::dec
<< " on " << soid
;
5528 r
= -EIO
; // try repair later
5532 r
= rep_repair_primary_object(soid
, ctx
);
5535 op
.extent
.length
= r
;
5536 else if (r
== -EAGAIN
) {
5540 op
.extent
.length
= 0;
5542 dout(10) << " read got " << r
<< " / " << op
.extent
.length
5543 << " bytes from obj " << soid
<< dendl
;
5546 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5547 ctx
->delta_stats
.num_rd
++;
5552 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5553 dout(20) << __func__
<< dendl
;
5554 auto& op
= osd_op
.op
;
5555 auto& oi
= ctx
->new_obs
.oi
;
5556 auto& soid
= oi
.soid
;
5558 if (op
.extent
.truncate_seq
) {
5559 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
5564 if (pool
.info
.is_erasure()) {
5565 // translate sparse read to a normal one if not supported
5566 uint64_t offset
= op
.extent
.offset
;
5567 uint64_t length
= op
.extent
.length
;
5568 if (offset
> oi
.size
) {
5570 } else if (offset
+ length
> oi
.size
) {
5571 length
= oi
.size
- offset
;
5575 ctx
->pending_async_reads
.push_back(
5577 boost::make_tuple(offset
, length
, op
.flags
),
5580 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
5581 &op
.extent
.length
))));
5582 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
5584 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5585 new ReadFinisher(osd_op
));
5587 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
5588 map
<uint64_t, uint64_t> extents
;
5589 encode(extents
, osd_op
.outdata
);
5592 // read into a buffer
5593 map
<uint64_t, uint64_t> m
;
5594 uint32_t total_read
= 0;
5595 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5597 op
.extent
.offset
, op
.extent
.length
, m
);
5603 r
= pgbackend
->objects_readv_sync(soid
, std::move(m
), op
.flags
, &data_bl
);
5605 r
= rep_repair_primary_object(soid
, ctx
);
5611 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5612 // Maybe at first, there is no much whole objects. With continued use, more
5613 // and more whole object exist. So from this point, for spare-read add
5614 // checksum make sense.
5615 if ((uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5616 uint32_t crc
= data_bl
.crc32c(-1);
5617 if (oi
.data_digest
!= crc
) {
5618 osd
->clog
->error() << info
.pgid
<< std::hex
5619 << " full-object read crc 0x" << crc
5620 << " != expected 0x" << oi
.data_digest
5621 << std::dec
<< " on " << soid
;
5622 r
= rep_repair_primary_object(soid
, ctx
);
5629 op
.extent
.length
= total_read
;
5631 encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5632 ::encode_destructively(data_bl
, osd_op
.outdata
);
5634 dout(10) << " sparse_read got " << r
<< " bytes from object "
5638 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5639 ctx
->delta_stats
.num_rd
++;
5643 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5646 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5647 ObjectState
& obs
= ctx
->new_obs
;
5648 object_info_t
& oi
= obs
.oi
;
5649 const hobject_t
& soid
= oi
.soid
;
5650 const bool skip_data_digest
= osd
->store
->has_builtin_csum() &&
5651 osd
->osd_skip_data_digest
;
5653 PGTransaction
* t
= ctx
->op_t
.get();
5655 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5657 ctx
->current_osd_subop_num
= 0;
5658 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5660 ceph_osd_op
& op
= osd_op
.op
;
5662 OpFinisher
* op_finisher
= nullptr;
5664 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5665 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5666 op_finisher
= op_finisher_it
->second
.get();
5670 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5671 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5672 // but the code in this function seems to treat them as native-endian. What should the
5674 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5676 dout(10) << "do_osd_op " << osd_op
<< dendl
;
5678 auto bp
= osd_op
.indata
.cbegin();
5680 // user-visible modifcation?
5682 // non user-visible modifications
5683 case CEPH_OSD_OP_WATCH
:
5684 case CEPH_OSD_OP_CACHE_EVICT
:
5685 case CEPH_OSD_OP_CACHE_FLUSH
:
5686 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5687 case CEPH_OSD_OP_UNDIRTY
:
5688 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
5689 case CEPH_OSD_OP_COPY_FROM2
:
5690 case CEPH_OSD_OP_CACHE_PIN
:
5691 case CEPH_OSD_OP_CACHE_UNPIN
:
5692 case CEPH_OSD_OP_SET_REDIRECT
:
5693 case CEPH_OSD_OP_TIER_PROMOTE
:
5694 case CEPH_OSD_OP_TIER_FLUSH
:
5697 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
5698 ctx
->user_modify
= true;
5701 // munge -1 truncate to 0 truncate
5702 if (ceph_osd_op_uses_extent(op
.op
) &&
5703 op
.extent
.truncate_seq
== 1 &&
5704 op
.extent
.truncate_size
== (-1ULL)) {
5705 op
.extent
.truncate_size
= 0;
5706 op
.extent
.truncate_seq
= 0;
5709 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5710 if (op
.op
== CEPH_OSD_OP_ZERO
&&
5712 op
.extent
.offset
< static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
5713 op
.extent
.length
>= 1 &&
5714 op
.extent
.length
<= static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
5715 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
5716 if (op
.extent
.offset
>= oi
.size
) {
5720 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
5721 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
5722 op
.op
= CEPH_OSD_OP_TRUNCATE
;
5729 case CEPH_OSD_OP_CMPEXT
:
5731 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
5732 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5733 op
.extent
.length
, op
.extent
.truncate_size
,
5734 op
.extent
.truncate_seq
);
5736 if (op_finisher
== nullptr) {
5737 result
= do_extent_cmp(ctx
, osd_op
);
5739 result
= op_finisher
->execute();
5743 case CEPH_OSD_OP_SYNC_READ
:
5744 if (pool
.info
.is_erasure()) {
5745 result
= -EOPNOTSUPP
;
5749 case CEPH_OSD_OP_READ
:
5751 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
5752 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5753 op
.extent
.length
, op
.extent
.truncate_size
,
5754 op
.extent
.truncate_seq
);
5755 if (op_finisher
== nullptr) {
5756 if (!ctx
->data_off
) {
5757 ctx
->data_off
= op
.extent
.offset
;
5759 result
= do_read(ctx
, osd_op
);
5761 result
= op_finisher
->execute();
5765 case CEPH_OSD_OP_CHECKSUM
:
5768 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
5769 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
5770 op
.checksum
.offset
, op
.checksum
.length
,
5771 op
.checksum
.chunk_size
);
5773 if (op_finisher
== nullptr) {
5774 result
= do_checksum(ctx
, osd_op
, &bp
);
5776 result
= op_finisher
->execute();
5782 case CEPH_OSD_OP_MAPEXT
:
5783 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5784 if (pool
.info
.is_erasure()) {
5785 result
= -EOPNOTSUPP
;
5790 // read into a buffer
5792 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5794 op
.extent
.offset
, op
.extent
.length
, bl
);
5795 osd_op
.outdata
.claim(bl
);
5799 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
5800 ctx
->delta_stats
.num_rd
++;
5801 dout(10) << " map_extents done on object " << soid
<< dendl
;
5806 case CEPH_OSD_OP_SPARSE_READ
:
5807 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
5808 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5809 op
.extent
.length
, op
.extent
.truncate_size
,
5810 op
.extent
.truncate_seq
);
5811 if (op_finisher
== nullptr) {
5812 result
= do_sparse_read(ctx
, osd_op
);
5814 result
= op_finisher
->execute();
5818 case CEPH_OSD_OP_CALL
:
5820 string cname
, mname
;
5823 bp
.copy(op
.cls
.class_len
, cname
);
5824 bp
.copy(op
.cls
.method_len
, mname
);
5825 bp
.copy(op
.cls
.indata_len
, indata
);
5826 } catch (buffer::error
& e
) {
5827 dout(10) << "call unable to decode class + method + indata" << dendl
;
5828 dout(30) << "in dump: ";
5829 osd_op
.indata
.hexdump(*_dout
);
5832 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
5835 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
5837 ClassHandler::ClassData
*cls
;
5838 result
= ClassHandler::get_instance().open_class(cname
, &cls
);
5839 ceph_assert(result
== 0); // init_op_flags() already verified this works.
5841 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
);
5843 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
5844 result
= -EOPNOTSUPP
;
5848 int flags
= method
->get_flags();
5849 if (flags
& CLS_METHOD_WR
)
5850 ctx
->user_modify
= true;
5853 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
5854 int prev_rd
= ctx
->num_read
;
5855 int prev_wr
= ctx
->num_write
;
5856 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
5858 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
5859 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
5863 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
5864 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
5869 dout(10) << "method called response length=" << outdata
.length() << dendl
;
5870 op
.extent
.length
= outdata
.length();
5871 osd_op
.outdata
.claim_append(outdata
);
5872 dout(30) << "out dump: ";
5873 osd_op
.outdata
.hexdump(*_dout
);
5878 case CEPH_OSD_OP_STAT
:
5879 // note: stat does not require RD
5881 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5883 if (obs
.exists
&& !oi
.is_whiteout()) {
5884 encode(oi
.size
, osd_op
.outdata
);
5885 encode(oi
.mtime
, osd_op
.outdata
);
5886 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
5889 dout(10) << "stat oi object does not exist" << dendl
;
5892 ctx
->delta_stats
.num_rd
++;
5896 case CEPH_OSD_OP_ISDIRTY
:
5899 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5900 bool is_dirty
= obs
.oi
.is_dirty();
5901 encode(is_dirty
, osd_op
.outdata
);
5902 ctx
->delta_stats
.num_rd
++;
5907 case CEPH_OSD_OP_UNDIRTY
:
5911 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5912 if (oi
.is_dirty()) {
5913 ctx
->undirty
= true; // see make_writeable()
5915 ctx
->delta_stats
.num_wr
++;
5920 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5924 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5925 if (ctx
->lock_type
!= RWState::RWNONE
) {
5926 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
5930 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5938 if (oi
.is_cache_pinned()) {
5939 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
5943 if (oi
.is_dirty()) {
5944 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, std::nullopt
);
5945 if (result
== -EINPROGRESS
)
5953 case CEPH_OSD_OP_CACHE_FLUSH
:
5957 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5958 if (ctx
->lock_type
== RWState::RWNONE
) {
5959 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
5963 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5971 if (oi
.is_cache_pinned()) {
5972 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
5977 if (oi
.is_dirty()) {
5978 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, std::nullopt
);
5979 if (result
== -EINPROGRESS
)
5984 // Check special return value which has set missing_return
5985 if (result
== -ENOENT
) {
5986 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
5987 ceph_assert(!missing
.is_min());
5988 wait_for_unreadable_object(missing
, ctx
->op
);
5989 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5995 case CEPH_OSD_OP_CACHE_EVICT
:
5999 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6000 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
6008 if (oi
.is_cache_pinned()) {
6009 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
6013 if (oi
.is_dirty()) {
6017 if (!oi
.watchers
.empty()) {
6021 if (soid
.snap
== CEPH_NOSNAP
) {
6022 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
6026 result
= _delete_oid(ctx
, true, false);
6028 // mark that this is a cache eviction to avoid triggering normal
6029 // make_writeable() clone creation in finish_ctx()
6030 ctx
->cache_evict
= true;
6032 osd
->logger
->inc(l_osd_tier_evict
);
6036 case CEPH_OSD_OP_GETXATTR
:
6040 bp
.copy(op
.xattr
.name_len
, aname
);
6041 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6042 string name
= "_" + aname
;
6043 int r
= getattr_maybe_cache(
6048 op
.xattr
.value_len
= osd_op
.outdata
.length();
6050 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
6054 ctx
->delta_stats
.num_rd
++;
6058 case CEPH_OSD_OP_GETXATTRS
:
6061 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6062 map
<string
, bufferlist
> out
;
6063 result
= getattrs_maybe_cache(
6069 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6070 ctx
->delta_stats
.num_rd
++;
6071 osd_op
.outdata
.claim_append(bl
);
6075 case CEPH_OSD_OP_CMPXATTR
:
6079 bp
.copy(op
.xattr
.name_len
, aname
);
6080 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6081 string name
= "_" + aname
;
6082 name
[op
.xattr
.name_len
+ 1] = 0;
6085 result
= getattr_maybe_cache(
6089 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
6092 ctx
->delta_stats
.num_rd
++;
6093 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(xattr
.length(), 10);
6095 switch (op
.xattr
.cmp_mode
) {
6096 case CEPH_OSD_CMPXATTR_MODE_STRING
:
6099 bp
.copy(op
.xattr
.value_len
, val
);
6100 val
[op
.xattr
.value_len
] = 0;
6101 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
6102 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6103 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
6107 case CEPH_OSD_CMPXATTR_MODE_U64
:
6113 catch (buffer::error
& e
) {
6117 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
6118 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6119 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
6124 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
6129 dout(10) << "comparison returned false" << dendl
;
6130 result
= -ECANCELED
;
6134 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
6138 dout(10) << "comparison returned true" << dendl
;
6142 case CEPH_OSD_OP_ASSERT_VER
:
6145 uint64_t ver
= op
.assert_ver
.ver
;
6146 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
6149 else if (ver
< oi
.user_version
)
6151 else if (ver
> oi
.user_version
)
6152 result
= -EOVERFLOW
;
6156 case CEPH_OSD_OP_LIST_WATCHERS
:
6159 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6160 obj_list_watch_response_t resp
;
6162 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
6163 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
6165 dout(20) << "key cookie=" << oi_iter
->first
.first
6166 << " entity=" << oi_iter
->first
.second
<< " "
6167 << oi_iter
->second
<< dendl
;
6168 ceph_assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
6169 ceph_assert(oi_iter
->first
.second
.is_client());
6171 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
6172 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
6173 resp
.entries
.push_back(wi
);
6176 resp
.encode(osd_op
.outdata
, ctx
->get_features());
6179 ctx
->delta_stats
.num_rd
++;
6183 case CEPH_OSD_OP_LIST_SNAPS
:
6186 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6187 obj_list_snap_response_t resp
;
6190 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
6193 dout(20) << " snapset " << ssc
->snapset
<< dendl
;
6195 int clonecount
= ssc
->snapset
.clones
.size();
6196 clonecount
++; // for head
6197 resp
.clones
.reserve(clonecount
);
6198 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
6199 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
6201 ci
.cloneid
= *clone_iter
;
6203 hobject_t clone_oid
= soid
;
6204 clone_oid
.snap
= *clone_iter
;
6206 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
6207 if (p
== ssc
->snapset
.clone_snaps
.end()) {
6208 osd
->clog
->error() << "osd." << osd
->whoami
6209 << ": inconsistent clone_snaps found for oid "
6210 << soid
<< " clone " << *clone_iter
6211 << " snapset " << ssc
->snapset
;
6215 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
6216 ci
.snaps
.push_back(*q
);
6219 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
6221 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
6222 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
6223 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
6224 osd
->clog
->error() << "osd." << osd
->whoami
6225 << ": inconsistent clone_overlap found for oid "
6226 << soid
<< " clone " << *clone_iter
;
6230 const interval_set
<uint64_t> &o
= coi
->second
;
6231 ci
.overlap
.reserve(o
.num_intervals());
6232 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
6233 r
!= o
.end(); ++r
) {
6234 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
6238 map
<snapid_t
, uint64_t>::const_iterator si
;
6239 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
6240 if (si
== ssc
->snapset
.clone_size
.end()) {
6241 osd
->clog
->error() << "osd." << osd
->whoami
6242 << ": inconsistent clone_size found for oid "
6243 << soid
<< " clone " << *clone_iter
;
6247 ci
.size
= si
->second
;
6249 resp
.clones
.push_back(ci
);
6254 if (!ctx
->obc
->obs
.oi
.is_whiteout()) {
6255 ceph_assert(obs
.exists
);
6257 ci
.cloneid
= CEPH_NOSNAP
;
6259 //Size for HEAD is oi.size
6262 resp
.clones
.push_back(ci
);
6264 resp
.seq
= ssc
->snapset
.seq
;
6266 resp
.encode(osd_op
.outdata
);
6269 ctx
->delta_stats
.num_rd
++;
6273 case CEPH_OSD_OP_NOTIFY
:
6280 uint32_t ver
; // obsolete
6282 decode(timeout
, bp
);
6284 } catch (const buffer::error
&e
) {
6287 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
6289 timeout
= cct
->_conf
->osd_default_notify_timeout
;
6292 n
.timeout
= timeout
;
6293 n
.notify_id
= osd
->get_next_id(get_osdmap_epoch());
6294 n
.cookie
= op
.notify
.cookie
;
6296 ctx
->notifies
.push_back(n
);
6298 // return our unique notify id to the client
6299 encode(n
.notify_id
, osd_op
.outdata
);
6303 case CEPH_OSD_OP_NOTIFY_ACK
:
6307 uint64_t notify_id
= 0;
6308 uint64_t watch_cookie
= 0;
6309 decode(notify_id
, bp
);
6310 decode(watch_cookie
, bp
);
6311 bufferlist reply_bl
;
6313 decode(reply_bl
, bp
);
6315 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
6316 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
6317 ctx
->notify_acks
.push_back(ack
);
6318 } catch (const buffer::error
&e
) {
6319 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
6320 OpContext::NotifyAck
ack(
6321 // op.watch.cookie is actually the notify_id for historical reasons
6324 ctx
->notify_acks
.push_back(ack
);
6329 case CEPH_OSD_OP_SETALLOCHINT
:
6333 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
6334 maybe_create_new_object(ctx
);
6335 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
6336 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
6337 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
6338 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
6339 op
.alloc_hint
.expected_write_size
,
6340 op
.alloc_hint
.flags
);
6347 // -- object data --
6349 case CEPH_OSD_OP_WRITE
:
6353 __u32 seq
= oi
.truncate_seq
;
6354 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6355 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6360 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6361 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6363 if (pool
.info
.requires_aligned_append() &&
6364 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
6365 result
= -EOPNOTSUPP
;
6370 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
6371 result
= -EOPNOTSUPP
;
6374 } else if (op
.extent
.offset
!= oi
.size
&&
6375 pool
.info
.requires_aligned_append()) {
6376 result
= -EOPNOTSUPP
;
6380 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
6381 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
6382 // old write, arrived after trimtrunc
6383 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
6384 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
6385 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
6387 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
6388 osd_op
.indata
.swap(t
);
6390 if (op
.extent
.truncate_seq
> seq
) {
6391 // write arrives before trimtrunc
6392 if (obs
.exists
&& !oi
.is_whiteout()) {
6393 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6394 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
6395 t
->truncate(soid
, op
.extent
.truncate_size
);
6396 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6397 oi
.truncate_size
= op
.extent
.truncate_size
;
6398 if (oi
.size
> op
.extent
.truncate_size
) {
6399 interval_set
<uint64_t> trim
;
6400 trim
.insert(op
.extent
.truncate_size
,
6401 oi
.size
- op
.extent
.truncate_size
);
6402 ctx
->modified_ranges
.union_of(trim
);
6403 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.truncate_size
, oi
.size
- op
.extent
.truncate_size
);
6405 if (op
.extent
.truncate_size
!= oi
.size
) {
6406 truncate_update_size_and_usage(ctx
->delta_stats
,
6408 op
.extent
.truncate_size
);
6411 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6412 << ", but object is new" << dendl
;
6413 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6414 oi
.truncate_size
= op
.extent
.truncate_size
;
6417 result
= check_offset_and_length(
6418 op
.extent
.offset
, op
.extent
.length
,
6419 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6423 maybe_create_new_object(ctx
);
6425 if (op
.extent
.length
== 0) {
6426 if (op
.extent
.offset
> oi
.size
) {
6428 soid
, op
.extent
.offset
);
6429 truncate_update_size_and_usage(ctx
->delta_stats
, oi
,
6436 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6439 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
6440 && !skip_data_digest
) {
6441 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6442 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
6443 if (skip_data_digest
) {
6444 obs
.oi
.clear_data_digest();
6446 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
6449 obs
.oi
.clear_data_digest();
6451 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6452 op
.extent
.offset
, op
.extent
.length
);
6453 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6454 dout(10) << "clean_regions modified" << ctx
->clean_regions
<< dendl
;
6458 case CEPH_OSD_OP_WRITEFULL
:
6461 { // write full object
6462 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
6464 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6468 result
= check_offset_and_length(
6469 0, op
.extent
.length
,
6470 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6474 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6475 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6477 maybe_create_new_object(ctx
);
6478 if (pool
.info
.is_erasure()) {
6479 t
->truncate(soid
, 0);
6480 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
6481 t
->truncate(soid
, op
.extent
.length
);
6483 if (op
.extent
.length
) {
6484 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6486 if (!skip_data_digest
) {
6487 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6489 obs
.oi
.clear_data_digest();
6491 ctx
->clean_regions
.mark_data_region_dirty(0,
6492 std::max((uint64_t)op
.extent
.length
, oi
.size
));
6493 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6494 0, op
.extent
.length
, true);
6498 case CEPH_OSD_OP_WRITESAME
:
6500 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
6501 result
= do_writesame(ctx
, osd_op
);
6504 case CEPH_OSD_OP_ROLLBACK
:
6506 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6507 result
= _rollback_to(ctx
, op
);
6510 case CEPH_OSD_OP_ZERO
:
6511 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6512 if (pool
.info
.requires_aligned_append()) {
6513 result
= -EOPNOTSUPP
;
6518 result
= check_offset_and_length(
6519 op
.extent
.offset
, op
.extent
.length
,
6520 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6524 ceph_assert(op
.extent
.length
);
6525 if (obs
.exists
&& !oi
.is_whiteout()) {
6526 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
6527 interval_set
<uint64_t> ch
;
6528 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
6529 ctx
->modified_ranges
.union_of(ch
);
6530 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6531 ctx
->delta_stats
.num_wr
++;
6532 oi
.clear_data_digest();
6538 case CEPH_OSD_OP_CREATE
:
6542 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6543 if (obs
.exists
&& !oi
.is_whiteout() &&
6544 (op
.flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
6545 result
= -EEXIST
; /* this is an exclusive create */
6547 if (osd_op
.indata
.length()) {
6548 auto p
= osd_op
.indata
.cbegin();
6551 decode(category
, p
);
6553 catch (buffer::error
& e
) {
6557 // category is no longer implemented.
6559 maybe_create_new_object(ctx
);
6565 case CEPH_OSD_OP_TRIMTRUNC
:
6566 op
.extent
.offset
= op
.extent
.truncate_size
;
6569 case CEPH_OSD_OP_TRUNCATE
:
6570 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6571 if (pool
.info
.requires_aligned_append()) {
6572 result
= -EOPNOTSUPP
;
6579 if (!obs
.exists
|| oi
.is_whiteout()) {
6580 dout(10) << " object dne, truncate is a no-op" << dendl
;
6584 result
= check_offset_and_length(
6585 op
.extent
.offset
, op
.extent
.length
,
6586 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6590 if (op
.extent
.truncate_seq
) {
6591 ceph_assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6592 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6593 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6594 << ", no-op" << dendl
;
6597 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6598 << ", truncating" << dendl
;
6599 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6600 oi
.truncate_size
= op
.extent
.truncate_size
;
6603 maybe_create_new_object(ctx
);
6604 t
->truncate(soid
, op
.extent
.offset
);
6605 if (oi
.size
> op
.extent
.offset
) {
6606 interval_set
<uint64_t> trim
;
6607 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6608 ctx
->modified_ranges
.union_of(trim
);
6609 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, oi
.size
- op
.extent
.offset
);
6610 } else if (oi
.size
< op
.extent
.offset
) {
6611 ctx
->clean_regions
.mark_data_region_dirty(oi
.size
, op
.extent
.offset
- oi
.size
);
6613 if (op
.extent
.offset
!= oi
.size
) {
6614 truncate_update_size_and_usage(ctx
->delta_stats
,
6618 ctx
->delta_stats
.num_wr
++;
6619 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6621 oi
.clear_data_digest();
6625 case CEPH_OSD_OP_DELETE
:
6628 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6630 if (oi
.has_manifest()) {
6631 if ((oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
) && oi
.manifest
.is_redirect()) {
6632 ctx
->register_on_commit(
6634 object_locator_t
target_oloc(oi
.manifest
.redirect_target
);
6635 refcount_manifest(ctx
->obc
, target_oloc
, oi
.manifest
.redirect_target
,
6636 SnapContext(), false, NULL
, 0);
6638 } else if (oi
.manifest
.is_chunked()) {
6639 ctx
->register_on_commit(
6641 for (auto p
: oi
.manifest
.chunk_map
) {
6642 if (p
.second
.has_reference()) {
6643 object_locator_t
target_oloc(p
.second
.oid
);
6644 refcount_manifest(ctx
->obc
, target_oloc
, p
.second
.oid
,
6645 SnapContext(), false, NULL
, p
.first
);
6651 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6655 case CEPH_OSD_OP_WATCH
:
6659 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6660 op
.watch
.cookie
, op
.watch
.op
);
6666 uint64_t cookie
= op
.watch
.cookie
;
6667 entity_name_t entity
= ctx
->reqid
.name
;
6668 ObjectContextRef obc
= ctx
->obc
;
6670 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6671 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6672 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6673 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6674 dout(10) << "watch: peer_addr="
6675 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6677 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6678 if (op
.watch
.timeout
!= 0) {
6679 timeout
= op
.watch
.timeout
;
6682 watch_info_t
w(cookie
, timeout
,
6683 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6684 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6685 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6686 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6687 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6689 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6690 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6691 t
->nop(soid
); // make sure update the object_info on disk!
6693 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6694 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6695 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
6696 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6700 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6701 ctx
->watch_connects
.push_back(make_pair(w
, true));
6702 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
6703 /* Note: WATCH with PING doesn't cause may_write() to return true,
6704 * so if there is nothing else in the transaction, this is going
6705 * to run do_osd_op_effects, but not write out a log entry */
6706 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6710 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
6711 obc
->watchers
.find(make_pair(cookie
, entity
));
6712 if (p
== obc
->watchers
.end() ||
6713 !p
->second
->is_connected()) {
6714 // client needs to reconnect
6715 result
= -ETIMEDOUT
;
6718 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6719 p
->second
->got_ping(ceph_clock_now());
6721 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
6722 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
6723 oi
.watchers
.find(make_pair(cookie
, entity
));
6724 if (oi_iter
!= oi
.watchers
.end()) {
6725 dout(10) << " removed watch " << oi_iter
->second
<< " by "
6727 oi
.watchers
.erase(oi_iter
);
6728 t
->nop(soid
); // update oi on disk
6729 ctx
->watch_disconnects
.push_back(
6730 watch_disconnect_t(cookie
, entity
, false));
6732 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
6738 case CEPH_OSD_OP_CACHE_PIN
:
6739 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6740 if ((!pool
.info
.is_tier() ||
6741 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6743 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6749 if (!obs
.exists
|| oi
.is_whiteout()) {
6754 if (!oi
.is_cache_pinned()) {
6755 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
6757 ctx
->delta_stats
.num_objects_pinned
++;
6758 ctx
->delta_stats
.num_wr
++;
6763 case CEPH_OSD_OP_CACHE_UNPIN
:
6764 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6765 if ((!pool
.info
.is_tier() ||
6766 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6768 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6774 if (!obs
.exists
|| oi
.is_whiteout()) {
6779 if (oi
.is_cache_pinned()) {
6780 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
6782 ctx
->delta_stats
.num_objects_pinned
--;
6783 ctx
->delta_stats
.num_wr
++;
6788 case CEPH_OSD_OP_SET_REDIRECT
:
6792 if (pool
.info
.is_tier()) {
6800 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
6801 result
= -EOPNOTSUPP
;
6805 object_t target_name
;
6806 object_locator_t target_oloc
;
6807 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
6808 version_t target_version
= op
.copy_from
.src_version
;
6810 decode(target_name
, bp
);
6811 decode(target_oloc
, bp
);
6813 catch (buffer::error
& e
) {
6818 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
6819 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
6820 raw_pg
.ps(), raw_pg
.pool(),
6821 target_oloc
.nspace
);
6822 if (target
== soid
) {
6823 dout(20) << " set-redirect self is invalid" << dendl
;
6828 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
6829 bool has_reference
= (oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
6830 if (has_reference
) {
6832 dout(5) << " the object is already a manifest " << dendl
;
6835 if (op_finisher
== nullptr && need_reference
) {
6837 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
6838 new SetManifestFinisher(osd_op
));
6839 RefCountCallback
*fin
= new RefCountCallback(ctx
, osd_op
);
6840 refcount_manifest(ctx
->obc
, target_oloc
, target
, SnapContext(),
6842 result
= -EINPROGRESS
;
6846 result
= op_finisher
->execute();
6847 ceph_assert(result
== 0);
6850 if (!oi
.has_manifest() && !oi
.manifest
.is_redirect())
6851 ctx
->delta_stats
.num_objects_manifest
++;
6853 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
6854 oi
.manifest
.redirect_target
= target
;
6855 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
6856 t
->truncate(soid
, 0);
6857 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
6858 if (oi
.is_omap() && pool
.info
.supports_omap()) {
6859 t
->omap_clear(soid
);
6860 obs
.oi
.clear_omap_digest();
6861 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6862 ctx
->clean_regions
.mark_omap_dirty();
6864 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6866 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6869 oi
.user_version
= target_version
;
6870 ctx
->user_at_version
= target_version
;
6872 map
<string
,bufferlist
> rmattrs
;
6873 result
= getattrs_maybe_cache(ctx
->obc
, &rmattrs
);
6875 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
6878 map
<string
, bufferlist
>::iterator iter
;
6879 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
6880 const string
& name
= iter
->first
;
6881 t
->rmattr(soid
, name
);
6883 if (!has_reference
&& need_reference
) {
6884 oi
.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
6886 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
6888 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
6895 case CEPH_OSD_OP_SET_CHUNK
:
6899 if (pool
.info
.is_tier()) {
6907 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
6908 result
= -EOPNOTSUPP
;
6912 object_locator_t tgt_oloc
;
6913 uint64_t src_offset
, src_length
, tgt_offset
;
6916 decode(src_offset
, bp
);
6917 decode(src_length
, bp
);
6918 decode(tgt_oloc
, bp
);
6919 decode(tgt_name
, bp
);
6920 decode(tgt_offset
, bp
);
6922 catch (buffer::error
& e
) {
6932 for (auto &p
: oi
.manifest
.chunk_map
) {
6933 if ((p
.first
<= src_offset
&& p
.first
+ p
.second
.length
> src_offset
) ||
6934 (p
.first
> src_offset
&& p
.first
<= src_offset
+ src_length
)) {
6935 dout(20) << __func__
<< " overlapped !! offset: " << src_offset
<< " length: " << src_length
6936 << " chunk_info: " << p
<< dendl
;
6937 result
= -EOPNOTSUPP
;
6942 if (!oi
.manifest
.is_chunked()) {
6943 oi
.manifest
.clear();
6947 chunk_info_t chunk_info
;
6948 get_osdmap()->object_locator_to_pg(tgt_name
, tgt_oloc
, raw_pg
);
6949 hobject_t
target(tgt_name
, tgt_oloc
.key
, snapid_t(),
6950 raw_pg
.ps(), raw_pg
.pool(),
6952 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
6953 bool has_reference
= (oi
.manifest
.chunk_map
.find(src_offset
) != oi
.manifest
.chunk_map
.end()) &&
6954 (oi
.manifest
.chunk_map
[src_offset
].flags
& chunk_info_t::FLAG_HAS_REFERENCE
);
6955 if (has_reference
) {
6957 dout(5) << " the object is already a manifest " << dendl
;
6960 if (op_finisher
== nullptr && need_reference
) {
6962 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
6963 new SetManifestFinisher(osd_op
));
6964 RefCountCallback
*fin
= new RefCountCallback(ctx
, osd_op
);
6965 refcount_manifest(ctx
->obc
, tgt_oloc
, target
, SnapContext(),
6966 true, fin
, src_offset
);
6967 result
= -EINPROGRESS
;
6970 result
= op_finisher
->execute();
6971 ceph_assert(result
== 0);
6974 chunk_info_t chunk_info
;
6975 chunk_info
.set_flag(chunk_info_t::FLAG_MISSING
);
6976 chunk_info
.oid
= target
;
6977 chunk_info
.offset
= tgt_offset
;
6978 chunk_info
.length
= src_length
;
6979 oi
.manifest
.chunk_map
[src_offset
] = chunk_info
;
6980 if (!oi
.has_manifest() && !oi
.manifest
.is_chunked())
6981 ctx
->delta_stats
.num_objects_manifest
++;
6982 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
6983 oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
6984 if (!has_reference
&& need_reference
) {
6985 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_REFERENCE
);
6987 if (need_reference
&& pool
.info
.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE
) {
6988 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT
);
6992 dout(10) << "set-chunked oid:" << oi
.soid
<< " user_version: " << oi
.user_version
6993 << " chunk_info: " << chunk_info
<< dendl
;
6995 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7002 case CEPH_OSD_OP_TIER_PROMOTE
:
7006 if (pool
.info
.is_tier()) {
7014 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7015 result
= -EOPNOTSUPP
;
7018 if (!obs
.oi
.has_manifest()) {
7023 if (op_finisher
== nullptr) {
7024 PromoteManifestCallback
*cb
;
7025 object_locator_t my_oloc
;
7028 if (obs
.oi
.manifest
.is_chunked()) {
7029 src_hoid
= obs
.oi
.soid
;
7030 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7031 } else if (obs
.oi
.manifest
.is_redirect()) {
7032 object_locator_t
src_oloc(obs
.oi
.manifest
.redirect_target
);
7034 src_hoid
= obs
.oi
.manifest
.redirect_target
;
7035 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7037 ceph_abort_msg("unrecognized manifest type");
7039 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7040 new PromoteFinisher(cb
));
7041 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
7042 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
7043 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
7044 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
7045 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
7046 start_copy(cb
, ctx
->obc
, src_hoid
, my_oloc
, 0, flags
,
7047 obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
7048 src_fadvise_flags
, 0);
7050 dout(10) << "tier-promote oid:" << oi
.soid
<< " manifest: " << obs
.oi
.manifest
<< dendl
;
7051 result
= -EINPROGRESS
;
7053 result
= op_finisher
->execute();
7054 ceph_assert(result
== 0);
7055 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7061 case CEPH_OSD_OP_TIER_FLUSH
:
7065 if (pool
.info
.is_tier()) {
7073 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7074 result
= -EOPNOTSUPP
;
7077 if (!obs
.oi
.has_manifest()) {
7083 bool is_dirty
= false;
7084 for (auto& p
: ctx
->obc
->obs
.oi
.manifest
.chunk_map
) {
7085 if (p
.second
.is_dirty()) {
7092 result
= start_flush(ctx
->op
, ctx
->obc
, true, NULL
, std::nullopt
);
7093 if (result
== -EINPROGRESS
)
7102 case CEPH_OSD_OP_UNSET_MANIFEST
:
7106 if (pool
.info
.is_tier()) {
7114 if (!oi
.has_manifest()) {
7115 result
= -EOPNOTSUPP
;
7118 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7119 result
= -EOPNOTSUPP
;
7123 if (oi
.manifest
.is_redirect()) {
7124 if ((oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
7125 ctx
->register_on_commit(
7127 object_locator_t
target_oloc(oi
.manifest
.redirect_target
);
7128 refcount_manifest(ctx
->obc
, target_oloc
, oi
.manifest
.redirect_target
,
7129 SnapContext(), false, NULL
, 0);
7132 } else if (oi
.manifest
.is_chunked()) {
7133 ctx
->register_on_commit(
7135 for (auto p
: oi
.manifest
.chunk_map
) {
7136 if (p
.second
.flags
& chunk_info_t::FLAG_HAS_REFERENCE
) {
7137 object_locator_t
target_oloc(p
.second
.oid
);
7138 refcount_manifest(ctx
->obc
, target_oloc
, p
.second
.oid
,
7139 SnapContext(), false, NULL
, p
.first
);
7144 ceph_abort_msg("unrecognized manifest type");
7147 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
7148 oi
.manifest
= object_manifest_t();
7149 ctx
->delta_stats
.num_objects_manifest
--;
7150 ctx
->delta_stats
.num_wr
++;
7156 // -- object attrs --
7158 case CEPH_OSD_OP_SETXATTR
:
7162 if (cct
->_conf
->osd_max_attr_size
> 0 &&
7163 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
7164 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7168 unsigned max_name_len
=
7169 std::min
<uint64_t>(osd
->store
->get_max_attr_name_length(),
7170 cct
->_conf
->osd_max_attr_name_len
);
7171 if (op
.xattr
.name_len
> max_name_len
) {
7172 result
= -ENAMETOOLONG
;
7175 maybe_create_new_object(ctx
);
7177 bp
.copy(op
.xattr
.name_len
, aname
);
7178 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7179 string name
= "_" + aname
;
7181 bp
.copy(op
.xattr
.value_len
, bl
);
7182 t
->setattr(soid
, name
, bl
);
7183 ctx
->delta_stats
.num_wr
++;
7187 case CEPH_OSD_OP_RMXATTR
:
7192 bp
.copy(op
.xattr
.name_len
, aname
);
7193 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7194 if (!obs
.exists
|| oi
.is_whiteout()) {
7198 string name
= "_" + aname
;
7199 t
->rmattr(soid
, name
);
7200 ctx
->delta_stats
.num_wr
++;
7205 // -- fancy writers --
7206 case CEPH_OSD_OP_APPEND
:
7208 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
7209 // just do it inline; this works because we are happy to execute
7210 // fancy op on replicas as well.
7211 vector
<OSDOp
> nops(1);
7212 OSDOp
& newop
= nops
[0];
7213 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
7214 newop
.op
.extent
.offset
= oi
.size
;
7215 newop
.op
.extent
.length
= op
.extent
.length
;
7216 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
7217 newop
.indata
= osd_op
.indata
;
7218 result
= do_osd_ops(ctx
, nops
);
7219 osd_op
.outdata
.claim(newop
.outdata
);
7223 case CEPH_OSD_OP_STARTSYNC
:
7228 // -- trivial map --
7229 case CEPH_OSD_OP_TMAPGET
:
7230 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7231 if (pool
.info
.is_erasure()) {
7232 result
= -EOPNOTSUPP
;
7236 vector
<OSDOp
> nops(1);
7237 OSDOp
& newop
= nops
[0];
7238 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
7239 newop
.op
.extent
.offset
= 0;
7240 newop
.op
.extent
.length
= 0;
7241 result
= do_osd_ops(ctx
, nops
);
7242 osd_op
.outdata
.claim(newop
.outdata
);
7246 case CEPH_OSD_OP_TMAPPUT
:
7247 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7248 if (pool
.info
.is_erasure()) {
7249 result
= -EOPNOTSUPP
;
7253 //_dout_lock.Lock();
7254 //osd_op.data.hexdump(*_dout);
7255 //_dout_lock.Unlock();
7257 // verify sort order
7258 bool unsorted
= false;
7268 dout(10) << "tmapput key " << key
<< dendl
;
7271 if (key
< last_key
) {
7272 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
7281 vector
<OSDOp
> nops(1);
7282 OSDOp
& newop
= nops
[0];
7283 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
7284 newop
.op
.extent
.offset
= 0;
7285 newop
.op
.extent
.length
= osd_op
.indata
.length();
7286 newop
.indata
= osd_op
.indata
;
7289 bp
= osd_op
.indata
.begin();
7291 map
<string
, bufferlist
> m
;
7294 ceph_assert(bp
.end());
7296 encode(header
, newbl
);
7298 newop
.indata
= newbl
;
7300 result
= do_osd_ops(ctx
, nops
);
7301 ceph_assert(result
== 0);
7305 case CEPH_OSD_OP_TMAPUP
:
7306 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7307 if (pool
.info
.is_erasure()) {
7308 result
= -EOPNOTSUPP
;
7312 result
= do_tmapup(ctx
, bp
, osd_op
);
7315 case CEPH_OSD_OP_TMAP2OMAP
:
7317 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7318 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
7322 case CEPH_OSD_OP_OMAPGETKEYS
:
7326 uint64_t max_return
;
7328 decode(start_after
, bp
);
7329 decode(max_return
, bp
);
7331 catch (buffer::error
& e
) {
7333 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
7336 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7337 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7339 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
7343 bool truncated
= false;
7345 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7346 ch
, ghobject_t(soid
)
7349 iter
->upper_bound(start_after
);
7350 for (num
= 0; iter
->valid(); ++num
, iter
->next()) {
7351 if (num
>= max_return
||
7352 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7356 encode(iter
->key(), bl
);
7358 } // else return empty out_set
7359 encode(num
, osd_op
.outdata
);
7360 osd_op
.outdata
.claim_append(bl
);
7361 encode(truncated
, osd_op
.outdata
);
7362 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7363 ctx
->delta_stats
.num_rd
++;
7367 case CEPH_OSD_OP_OMAPGETVALS
:
7371 uint64_t max_return
;
7372 string filter_prefix
;
7374 decode(start_after
, bp
);
7375 decode(max_return
, bp
);
7376 decode(filter_prefix
, bp
);
7378 catch (buffer::error
& e
) {
7380 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
7383 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7384 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7386 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
7389 bool truncated
= false;
7392 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7393 ch
, ghobject_t(soid
)
7399 iter
->upper_bound(start_after
);
7400 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
7403 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
7404 ++num
, iter
->next()) {
7405 dout(20) << "Found key " << iter
->key() << dendl
;
7406 if (num
>= max_return
||
7407 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7411 encode(iter
->key(), bl
);
7412 encode(iter
->value(), bl
);
7414 } // else return empty out_set
7415 encode(num
, osd_op
.outdata
);
7416 osd_op
.outdata
.claim_append(bl
);
7417 encode(truncated
, osd_op
.outdata
);
7418 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7419 ctx
->delta_stats
.num_rd
++;
7423 case CEPH_OSD_OP_OMAPGETHEADER
:
7424 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7425 if (!oi
.is_omap()) {
7426 // return empty header
7431 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
7432 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7433 ctx
->delta_stats
.num_rd
++;
7437 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
7440 set
<string
> keys_to_get
;
7442 decode(keys_to_get
, bp
);
7444 catch (buffer::error
& e
) {
7446 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7449 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
7450 map
<string
, bufferlist
> out
;
7452 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
7453 } // else return empty omap entries
7454 encode(out
, osd_op
.outdata
);
7455 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7456 ctx
->delta_stats
.num_rd
++;
7460 case CEPH_OSD_OP_OMAP_CMP
:
7463 if (!obs
.exists
|| oi
.is_whiteout()) {
7465 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7468 map
<string
, pair
<bufferlist
, int> > assertions
;
7470 decode(assertions
, bp
);
7472 catch (buffer::error
& e
) {
7474 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7477 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
7479 map
<string
, bufferlist
> out
;
7483 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7484 i
!= assertions
.end();
7486 to_get
.insert(i
->first
);
7487 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
7493 } // else leave out empty
7495 //Should set num_rd_kb based on encode length of map
7496 ctx
->delta_stats
.num_rd
++;
7500 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7501 i
!= assertions
.end();
7503 auto out_entry
= out
.find(i
->first
);
7504 bufferlist
&bl
= (out_entry
!= out
.end()) ?
7505 out_entry
->second
: empty
;
7506 switch (i
->second
.second
) {
7507 case CEPH_OSD_CMPXATTR_OP_EQ
:
7508 if (!(bl
== i
->second
.first
)) {
7512 case CEPH_OSD_CMPXATTR_OP_LT
:
7513 if (!(bl
< i
->second
.first
)) {
7517 case CEPH_OSD_CMPXATTR_OP_GT
:
7518 if (!(bl
> i
->second
.first
)) {
7536 case CEPH_OSD_OP_OMAPSETVALS
:
7537 if (!pool
.info
.supports_omap()) {
7538 result
= -EOPNOTSUPP
;
7539 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7545 maybe_create_new_object(ctx
);
7546 bufferlist to_set_bl
;
7548 decode_str_str_map_to_bl(bp
, &to_set_bl
);
7550 catch (buffer::error
& e
) {
7552 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7555 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7556 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 20>()) {
7557 dout(20) << "setting vals: " << dendl
;
7558 map
<string
,bufferlist
> to_set
;
7559 bufferlist::const_iterator pt
= to_set_bl
.begin();
7561 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
7564 dout(20) << "\t" << i
->first
<< dendl
;
7567 t
->omap_setkeys(soid
, to_set_bl
);
7568 ctx
->clean_regions
.mark_omap_dirty();
7569 ctx
->delta_stats
.num_wr
++;
7570 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(to_set_bl
.length(), 10);
7572 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7573 obs
.oi
.clear_omap_digest();
7576 case CEPH_OSD_OP_OMAPSETHEADER
:
7577 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7578 if (!pool
.info
.supports_omap()) {
7579 result
= -EOPNOTSUPP
;
7585 maybe_create_new_object(ctx
);
7586 t
->omap_setheader(soid
, osd_op
.indata
);
7587 ctx
->clean_regions
.mark_omap_dirty();
7588 ctx
->delta_stats
.num_wr
++;
7590 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7591 obs
.oi
.clear_omap_digest();
7594 case CEPH_OSD_OP_OMAPCLEAR
:
7595 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7596 if (!pool
.info
.supports_omap()) {
7597 result
= -EOPNOTSUPP
;
7603 if (!obs
.exists
|| oi
.is_whiteout()) {
7608 t
->omap_clear(soid
);
7609 ctx
->clean_regions
.mark_omap_dirty();
7610 ctx
->delta_stats
.num_wr
++;
7611 obs
.oi
.clear_omap_digest();
7612 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7617 case CEPH_OSD_OP_OMAPRMKEYS
:
7618 if (!pool
.info
.supports_omap()) {
7619 result
= -EOPNOTSUPP
;
7620 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7626 if (!obs
.exists
|| oi
.is_whiteout()) {
7628 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7631 bufferlist to_rm_bl
;
7633 decode_str_set_to_bl(bp
, &to_rm_bl
);
7635 catch (buffer::error
& e
) {
7637 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7640 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7641 t
->omap_rmkeys(soid
, to_rm_bl
);
7642 ctx
->clean_regions
.mark_omap_dirty();
7643 ctx
->delta_stats
.num_wr
++;
7645 obs
.oi
.clear_omap_digest();
7648 case CEPH_OSD_OP_OMAPRMKEYRANGE
:
7649 tracepoint(osd
, do_osd_op_pre_omaprmkeyrange
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7650 if (!pool
.info
.supports_omap()) {
7651 result
= -EOPNOTSUPP
;
7657 if (!obs
.exists
|| oi
.is_whiteout()) {
7661 std::string key_begin
, key_end
;
7663 decode(key_begin
, bp
);
7664 decode(key_end
, bp
);
7665 } catch (buffer::error
& e
) {
7669 t
->omap_rmkeyrange(soid
, key_begin
, key_end
);
7670 ctx
->delta_stats
.num_wr
++;
7672 obs
.oi
.clear_omap_digest();
7675 case CEPH_OSD_OP_COPY_GET
:
7677 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
7679 if (op_finisher
== nullptr) {
7680 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
7682 result
= op_finisher
->execute();
7686 case CEPH_OSD_OP_COPY_FROM
:
7687 case CEPH_OSD_OP_COPY_FROM2
:
7692 object_locator_t src_oloc
;
7693 uint32_t truncate_seq
= 0;
7694 uint64_t truncate_size
= 0;
7695 bool have_truncate
= false;
7696 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
7697 version_t src_version
= op
.copy_from
.src_version
;
7699 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
7700 (op
.copy_from
.flags
& ~CEPH_OSD_COPY_FROM_FLAGS
)) {
7701 dout(20) << "invalid copy-from2 flags 0x"
7702 << std::hex
<< (int)op
.copy_from
.flags
<< std::dec
<< dendl
;
7707 decode(src_name
, bp
);
7708 decode(src_oloc
, bp
);
7709 // check if client sent us truncate_seq and truncate_size
7710 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
7711 (op
.copy_from
.flags
& CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ
)) {
7712 decode(truncate_seq
, bp
);
7713 decode(truncate_size
, bp
);
7714 have_truncate
= true;
7717 catch (buffer::error
& e
) {
7720 do_osd_op_pre_copy_from
,
7721 soid
.oid
.name
.c_str(),
7733 do_osd_op_pre_copy_from
,
7734 soid
.oid
.name
.c_str(),
7736 src_name
.name
.c_str(),
7738 src_oloc
.key
.c_str(),
7739 src_oloc
.nspace
.c_str(),
7743 if (op_finisher
== nullptr) {
7746 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
7747 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
7748 raw_pg
.ps(), raw_pg
.pool(),
7751 dout(20) << " copy from self is invalid" << dendl
;
7755 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
7757 cb
->set_truncate(truncate_seq
, truncate_size
);
7758 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7759 new CopyFromFinisher(cb
));
7760 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
7763 op
.copy_from
.src_fadvise_flags
,
7765 result
= -EINPROGRESS
;
7768 result
= op_finisher
->execute();
7769 ceph_assert(result
== 0);
7771 // COPY_FROM cannot be executed multiple times -- it must restart
7772 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7778 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
7779 dout(1) << "unrecognized osd op " << op
.op
7780 << " " << ceph_osd_op_name(op
.op
)
7782 result
= -EOPNOTSUPP
;
7786 osd_op
.rval
= result
;
7787 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
7788 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
) &&
7789 result
!= -EAGAIN
&& result
!= -EINPROGRESS
)
7796 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
7801 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
7803 if (ctx
->new_obs
.oi
.size
== 0) {
7804 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
7807 vector
<OSDOp
> nops(1);
7808 OSDOp
&newop
= nops
[0];
7809 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
7810 do_osd_ops(ctx
, nops
);
7812 bufferlist::const_iterator i
= newop
.outdata
.begin();
7814 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
7816 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
7820 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
7825 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
7828 // verify that all clones have been evicted
7829 dout(20) << __func__
<< " verifying clones are absent "
7831 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
7832 p
!= ss
.clones
.end();
7834 hobject_t clone_oid
= soid
;
7835 clone_oid
.snap
= *p
;
7836 if (is_missing_object(clone_oid
))
7838 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
7839 if (clone_obc
&& clone_obc
->obs
.exists
) {
7840 dout(10) << __func__
<< " cannot evict head before clone "
7841 << clone_oid
<< dendl
;
7844 if (copy_ops
.count(clone_oid
)) {
7845 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
7846 << clone_oid
<< dendl
;
7853 inline int PrimaryLogPG::_delete_oid(
7855 bool no_whiteout
, // no whiteouts, no matter what.
7856 bool try_no_whiteout
) // try not to whiteout
7858 SnapSet
& snapset
= ctx
->new_snapset
;
7859 ObjectState
& obs
= ctx
->new_obs
;
7860 object_info_t
& oi
= obs
.oi
;
7861 const hobject_t
& soid
= oi
.soid
;
7862 PGTransaction
* t
= ctx
->op_t
.get();
7864 // cache: cache: set whiteout on delete?
7865 bool whiteout
= false;
7866 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
7868 && !try_no_whiteout
) {
7872 // in luminous or later, we can't delete the head if there are
7873 // clones. we trust the caller passing no_whiteout has already
7874 // verified they don't exist.
7875 if (!snapset
.clones
.empty() ||
7876 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
7878 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
7881 dout(20) << __func__
<< " has or will have clones; will whiteout"
7886 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
7887 << " no_whiteout=" << (int)no_whiteout
7888 << " try_no_whiteout=" << (int)try_no_whiteout
7890 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
7896 interval_set
<uint64_t> ch
;
7897 ch
.insert(0, oi
.size
);
7898 ctx
->modified_ranges
.union_of(ch
);
7899 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
7902 ctx
->clean_regions
.mark_omap_dirty();
7903 ctx
->delta_stats
.num_wr
++;
7904 if (soid
.is_snap()) {
7905 ceph_assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
7906 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
7908 ctx
->delta_stats
.num_bytes
-= oi
.size
;
7913 // disconnect all watchers
7914 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
7915 oi
.watchers
.begin();
7916 p
!= oi
.watchers
.end();
7918 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
7919 ctx
->watch_disconnects
.push_back(
7920 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
7922 oi
.watchers
.clear();
7925 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
7926 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
7927 ctx
->delta_stats
.num_whiteouts
++;
7929 osd
->logger
->inc(l_osd_tier_whiteout
);
7934 ctx
->delta_stats
.num_objects
--;
7936 ctx
->delta_stats
.num_object_clones
--;
7937 if (oi
.is_whiteout()) {
7938 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
7939 ctx
->delta_stats
.num_whiteouts
--;
7940 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
7942 if (oi
.is_cache_pinned()) {
7943 ctx
->delta_stats
.num_objects_pinned
--;
7945 if (oi
.has_manifest()) {
7946 ctx
->delta_stats
.num_objects_manifest
--;
7952 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, ceph_osd_op
& op
)
7954 SnapSet
& snapset
= ctx
->new_snapset
;
7955 ObjectState
& obs
= ctx
->new_obs
;
7956 object_info_t
& oi
= obs
.oi
;
7957 const hobject_t
& soid
= oi
.soid
;
7958 PGTransaction
* t
= ctx
->op_t
.get();
7959 snapid_t snapid
= (uint64_t)op
.snap
.snapid
;
7960 hobject_t missing_oid
;
7962 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
7964 ObjectContextRef rollback_to
;
7966 int ret
= find_object_context(
7967 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
7968 soid
.get_namespace()),
7969 &rollback_to
, false, false, &missing_oid
);
7970 if (ret
== -EAGAIN
) {
7971 /* clone must be missing */
7972 ceph_assert(is_degraded_or_backfilling_object(missing_oid
) || is_degraded_on_async_recovery_target(missing_oid
));
7973 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7974 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
7975 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
7979 ObjectContextRef promote_obc
;
7980 cache_result_t tier_mode_result
;
7981 if (obs
.exists
&& obs
.oi
.has_manifest()) {
7983 maybe_handle_manifest_detail(
7989 maybe_handle_cache_detail(
7999 switch (tier_mode_result
) {
8000 case cache_result_t::NOOP
:
8002 case cache_result_t::BLOCKED_PROMOTE
:
8003 ceph_assert(promote_obc
);
8004 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
8006 case cache_result_t::BLOCKED_FULL
:
8007 block_write_on_full_cache(soid
, ctx
->op
);
8009 case cache_result_t::REPLIED_WITH_EAGAIN
:
8010 ceph_abort_msg("this can't happen, no rollback on replica");
8012 ceph_abort_msg("must promote was set, other values are not valid");
8017 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
8018 // there's no snapshot here, or there's no object.
8019 // if there's no snapshot, we delete the object; otherwise, do nothing.
8020 dout(20) << "_rollback_to deleting head on " << soid
.oid
8021 << " because got ENOENT|whiteout on find_object_context" << dendl
;
8022 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
8023 // Cannot delete an object with watchers
8026 _delete_oid(ctx
, false, false);
8030 // ummm....huh? It *can't* return anything else at time of writing.
8031 ceph_abort_msg("unexpected error code in _rollback_to");
8032 } else { //we got our context, let's use it to do the rollback!
8033 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
8034 if (is_degraded_or_backfilling_object(rollback_to_sobject
) ||
8035 is_degraded_on_async_recovery_target(rollback_to_sobject
)) {
8036 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8037 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
8038 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
8040 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
8041 // rolling back to the head; we just need to clone it.
8044 /* 1) Delete current head
8045 * 2) Clone correct snapshot into head
8046 * 3) Calculate clone_overlaps by following overlaps
8047 * forward from rollback snapshot */
8048 dout(10) << "_rollback_to deleting " << soid
.oid
8049 << " and rolling back to old snap" << dendl
;
8054 t
->clone(soid
, rollback_to_sobject
);
8055 t
->add_obc(rollback_to
);
8057 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
8058 snapset
.clone_overlap
.lower_bound(snapid
);
8059 ceph_assert(iter
!= snapset
.clone_overlap
.end());
8060 interval_set
<uint64_t> overlaps
= iter
->second
;
8062 iter
!= snapset
.clone_overlap
.end();
8064 overlaps
.intersection_of(iter
->second
);
8066 if (obs
.oi
.size
> 0) {
8067 interval_set
<uint64_t> modified
;
8068 modified
.insert(0, obs
.oi
.size
);
8069 overlaps
.intersection_of(modified
);
8070 modified
.subtract(overlaps
);
8071 ctx
->modified_ranges
.union_of(modified
);
8074 // Adjust the cached objectcontext
8075 maybe_create_new_object(ctx
, true);
8076 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8077 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
8078 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, rollback_to
->obs
.oi
.size
));
8079 ctx
->clean_regions
.mark_omap_dirty();
8080 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
8081 if (rollback_to
->obs
.oi
.is_data_digest())
8082 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
8084 obs
.oi
.clear_data_digest();
8085 if (rollback_to
->obs
.oi
.is_omap_digest())
8086 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
8088 obs
.oi
.clear_omap_digest();
8090 if (rollback_to
->obs
.oi
.is_omap()) {
8091 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8092 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8094 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8095 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
8102 void PrimaryLogPG::_make_clone(
8105 ObjectContextRef obc
,
8106 const hobject_t
& head
, const hobject_t
& coid
,
8110 encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8112 t
->clone(coid
, head
);
8113 setattr_maybe_cache(obc
, t
, OI_ATTR
, bv
);
8114 rmattr_maybe_cache(obc
, t
, SS_ATTR
);
8117 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
8119 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8120 SnapContext
& snapc
= ctx
->snapc
;
8123 ceph_assert(soid
.snap
== CEPH_NOSNAP
);
8124 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
8125 << " snapc=" << snapc
<< dendl
;
8127 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
8128 if (ctx
->new_obs
.exists
) {
8129 // we will mark the object dirty
8130 if (ctx
->undirty
&& was_dirty
) {
8131 dout(20) << " clearing DIRTY flag" << dendl
;
8132 ceph_assert(ctx
->new_obs
.oi
.is_dirty());
8133 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8134 --ctx
->delta_stats
.num_objects_dirty
;
8135 osd
->logger
->inc(l_osd_tier_clean
);
8136 } else if (!was_dirty
&& !ctx
->undirty
) {
8137 dout(20) << " setting DIRTY flag" << dendl
;
8138 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
8139 ++ctx
->delta_stats
.num_objects_dirty
;
8140 osd
->logger
->inc(l_osd_tier_dirty
);
8144 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
8145 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8146 --ctx
->delta_stats
.num_objects_dirty
;
8150 if ((ctx
->new_obs
.exists
&&
8151 ctx
->new_obs
.oi
.is_omap()) &&
8152 (!ctx
->obc
->obs
.exists
||
8153 !ctx
->obc
->obs
.oi
.is_omap())) {
8154 ++ctx
->delta_stats
.num_objects_omap
;
8156 if ((!ctx
->new_obs
.exists
||
8157 !ctx
->new_obs
.oi
.is_omap()) &&
8158 (ctx
->obc
->obs
.exists
&&
8159 ctx
->obc
->obs
.oi
.is_omap())) {
8160 --ctx
->delta_stats
.num_objects_omap
;
8163 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
8164 dout(10) << " op snapset is old" << dendl
;
8167 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
8168 snapc
.snaps
.size() && // there are snaps
8169 !ctx
->cache_evict
&&
8170 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
8172 hobject_t coid
= soid
;
8173 coid
.snap
= snapc
.seq
;
8177 l
< snapc
.snaps
.size() && snapc
.snaps
[l
] > ctx
->new_snapset
.seq
;
8180 vector
<snapid_t
> snaps(l
);
8181 for (unsigned i
=0; i
<l
; i
++)
8182 snaps
[i
] = snapc
.snaps
[i
];
8185 object_info_t
static_snap_oi(coid
);
8186 object_info_t
*snap_oi
;
8188 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
8189 ctx
->clone_obc
->destructor_callback
=
8190 new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
8191 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
8192 ctx
->clone_obc
->obs
.exists
= true;
8193 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
8194 ctx
->clone_obc
->ssc
->ref
++;
8195 if (pool
.info
.is_erasure())
8196 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
8197 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
8198 bool got
= ctx
->lock_manager
.get_write_greedy(
8203 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
8205 snap_oi
= &static_snap_oi
;
8207 snap_oi
->version
= ctx
->at_version
;
8208 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
8209 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
8211 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
8213 ctx
->delta_stats
.num_objects
++;
8214 if (snap_oi
->is_dirty()) {
8215 ctx
->delta_stats
.num_objects_dirty
++;
8216 osd
->logger
->inc(l_osd_tier_dirty
);
8218 if (snap_oi
->is_omap())
8219 ctx
->delta_stats
.num_objects_omap
++;
8220 if (snap_oi
->is_cache_pinned())
8221 ctx
->delta_stats
.num_objects_pinned
++;
8222 if (snap_oi
->has_manifest())
8223 ctx
->delta_stats
.num_objects_manifest
++;
8224 ctx
->delta_stats
.num_object_clones
++;
8225 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
8226 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
8227 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
8229 // clone_overlap should contain an entry for each clone
8230 // (an empty interval_set if there is no overlap)
8231 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
8232 if (ctx
->obs
->oi
.size
)
8233 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
8236 dout(10) << " cloning v " << ctx
->obs
->oi
.version
8237 << " to " << coid
<< " v " << ctx
->at_version
8238 << " snaps=" << snaps
8239 << " snapset=" << ctx
->new_snapset
<< dendl
;
8240 ctx
->log
.push_back(pg_log_entry_t(
8241 pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
8242 ctx
->obs
->oi
.version
,
8243 ctx
->obs
->oi
.user_version
,
8244 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
8245 encode(snaps
, ctx
->log
.back().snaps
);
8247 ctx
->at_version
.version
++;
8250 // update most recent clone_overlap and usage stats
8251 if (ctx
->new_snapset
.clones
.size() > 0) {
8252 // the clone_overlap is difference of range between head and clones.
8253 // we need to check whether the most recent clone exists, if it's
8254 // been evicted, it's not included in the stats, but the clone_overlap
8255 // is still exist in the snapset, so we should update the
8256 // clone_overlap to make it sense.
8257 hobject_t last_clone_oid
= soid
;
8258 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
8259 interval_set
<uint64_t> &newest_overlap
=
8260 ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
8261 ctx
->modified_ranges
.intersection_of(newest_overlap
);
8262 if (is_present_clone(last_clone_oid
)) {
8263 // modified_ranges is still in use by the clone
8264 ctx
->delta_stats
.num_bytes
+= ctx
->modified_ranges
.size();
8266 newest_overlap
.subtract(ctx
->modified_ranges
);
8269 if (snapc
.seq
> ctx
->new_snapset
.seq
) {
8270 // update snapset with latest snap context
8271 ctx
->new_snapset
.seq
= snapc
.seq
;
8272 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
8273 ctx
->new_snapset
.snaps
= snapc
.snaps
;
8275 ctx
->new_snapset
.snaps
.clear();
8278 dout(20) << "make_writeable " << soid
8279 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
8283 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
8284 interval_set
<uint64_t>& modified
, uint64_t offset
,
8285 uint64_t length
, bool write_full
)
8287 interval_set
<uint64_t> ch
;
8290 ch
.insert(0, oi
.size
);
8292 ch
.insert(offset
, length
);
8293 modified
.union_of(ch
);
8295 (offset
+ length
> oi
.size
&& length
)) {
8296 uint64_t new_size
= offset
+ length
;
8297 delta_stats
.num_bytes
-= oi
.size
;
8298 delta_stats
.num_bytes
+= new_size
;
8302 if (oi
.has_manifest() && oi
.manifest
.is_chunked()) {
8303 for (auto &p
: oi
.manifest
.chunk_map
) {
8304 if ((p
.first
<= offset
&& p
.first
+ p
.second
.length
> offset
) ||
8305 (p
.first
> offset
&& p
.first
< offset
+ length
)) {
8306 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
);
8307 p
.second
.set_flag(chunk_info_t::FLAG_DIRTY
);
8311 delta_stats
.num_wr
++;
8312 delta_stats
.num_wr_kb
+= shift_round_up(length
, 10);
8315 void PrimaryLogPG::truncate_update_size_and_usage(
8316 object_stat_sum_t
& delta_stats
,
8318 uint64_t truncate_size
)
8320 if (oi
.size
!= truncate_size
) {
8321 delta_stats
.num_bytes
-= oi
.size
;
8322 delta_stats
.num_bytes
+= truncate_size
;
8323 oi
.size
= truncate_size
;
8327 void PrimaryLogPG::complete_disconnect_watches(
8328 ObjectContextRef obc
,
8329 const list
<watch_disconnect_t
> &to_disconnect
)
8331 for (list
<watch_disconnect_t
>::const_iterator i
=
8332 to_disconnect
.begin();
8333 i
!= to_disconnect
.end();
8335 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
8336 auto watchers_entry
= obc
->watchers
.find(watcher
);
8337 if (watchers_entry
!= obc
->watchers
.end()) {
8338 WatchRef watch
= watchers_entry
->second
;
8339 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
8340 obc
->watchers
.erase(watcher
);
8341 watch
->remove(i
->send_disconnect
);
8343 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8344 << watcher
<< dendl
;
8349 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
8351 entity_name_t entity
= ctx
->reqid
.name
;
8352 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
8354 // disconnects first
8355 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
8359 auto session
= conn
->get_priv();
8363 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
8364 i
!= ctx
->watch_connects
.end();
8366 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
8367 dout(15) << "do_osd_op_effects applying watch connect on session "
8368 << session
.get() << " watcher " << watcher
<< dendl
;
8370 if (ctx
->obc
->watchers
.count(watcher
)) {
8371 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8373 watch
= ctx
->obc
->watchers
[watcher
];
8375 dout(15) << "do_osd_op_effects new watcher " << watcher
8377 watch
= Watch::makeWatchRef(
8378 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
8379 i
->first
.cookie
, entity
, conn
->get_peer_addr());
8380 ctx
->obc
->watchers
.insert(
8385 watch
->connect(conn
, i
->second
);
8388 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
8389 p
!= ctx
->notifies
.end();
8391 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
8392 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
8394 Notify::makeNotifyRef(
8396 ctx
->reqid
.name
.num(),
8401 ctx
->obc
->obs
.oi
.user_version
,
8403 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8404 ctx
->obc
->watchers
.begin();
8405 i
!= ctx
->obc
->watchers
.end();
8407 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
8408 i
->second
->start_notify(notif
);
8413 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
8414 p
!= ctx
->notify_acks
.end();
8416 if (p
->watch_cookie
)
8417 dout(10) << "notify_ack " << make_pair(*(p
->watch_cookie
), p
->notify_id
) << dendl
;
8419 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
8420 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8421 ctx
->obc
->watchers
.begin();
8422 i
!= ctx
->obc
->watchers
.end();
8424 if (i
->first
.second
!= entity
) continue;
8425 if (p
->watch_cookie
&&
8426 *(p
->watch_cookie
) != i
->first
.first
) continue;
8427 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
8428 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
8433 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
8436 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
8437 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
8438 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8439 dout(20) << __func__
<< " " << hoid
<< dendl
;
8443 hobject_t
PrimaryLogPG::get_temp_recovery_object(
8444 const hobject_t
& target
,
8448 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
8450 << "_" << info
.history
.same_interval_since
8451 << "_" << target
.snap
;
8452 // pgid + version + interval + snapid is unique, and short
8453 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8454 dout(20) << __func__
<< " " << hoid
<< dendl
;
8458 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
8460 ceph_assert(!ctx
->ops
->empty());
8462 // valid snap context?
8463 if (!ctx
->snapc
.is_valid()) {
8464 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
8468 // prepare the actual mutation
8469 int result
= do_osd_ops(ctx
, *ctx
->ops
);
8471 if (ctx
->op
->may_write() &&
8472 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8473 // need to save the error code in the pg log, to detect dup ops,
8474 // but do nothing else
8475 ctx
->update_log_only
= true;
8480 // read-op? write-op noop? done?
8481 if (ctx
->op_t
->empty() && !ctx
->modify
) {
8482 if (ctx
->pending_async_reads
.empty())
8483 unstable_stats
.add(ctx
->delta_stats
);
8484 if (ctx
->op
->may_write() &&
8485 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8486 ctx
->update_log_only
= true;
8492 if ((ctx
->delta_stats
.num_bytes
> 0 ||
8493 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
8494 pool
.info
.has_flag(pg_pool_t::FLAG_FULL
)) {
8495 auto m
= ctx
->op
->get_req
<MOSDOp
>();
8496 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
8497 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
8498 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
8500 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
8501 // they tried, they failed.
8502 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
8503 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
) ? -EDQUOT
: -ENOSPC
;
8506 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
8511 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8512 // clone, if necessary
8513 if (soid
.snap
== CEPH_NOSNAP
)
8514 make_writeable(ctx
);
8517 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
8518 pg_log_entry_t::DELETE
,
8524 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, int result
)
8526 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8527 dout(20) << __func__
<< " " << soid
<< " " << ctx
8528 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
8530 utime_t now
= ceph_clock_now();
8532 // finish and log the op.
8533 if (ctx
->user_modify
) {
8534 // update the user_version for any modify ops, except for the watch op
8535 ctx
->user_at_version
= std::max(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
8536 /* In order for new clients and old clients to interoperate properly
8537 * when exchanging versions, we need to lower bound the user_version
8538 * (which our new clients pay proper attention to)
8539 * by the at_version (which is all the old clients can ever see). */
8540 if (ctx
->at_version
.version
> ctx
->user_at_version
)
8541 ctx
->user_at_version
= ctx
->at_version
.version
;
8542 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
8544 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
8546 if (ctx
->new_obs
.exists
) {
8547 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
8548 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
8549 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
8550 if (ctx
->mtime
!= utime_t()) {
8551 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
8552 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8553 ctx
->new_obs
.oi
.local_mtime
= now
;
8555 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8559 map
<string
, bufferlist
> attrs
;
8560 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
8561 encode(ctx
->new_obs
.oi
, bv
,
8562 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8563 attrs
[OI_ATTR
].claim(bv
);
8566 if (soid
.snap
== CEPH_NOSNAP
) {
8567 dout(10) << " final snapset " << ctx
->new_snapset
8568 << " in " << soid
<< dendl
;
8570 encode(ctx
->new_snapset
, bss
);
8571 attrs
[SS_ATTR
].claim(bss
);
8573 dout(10) << " no snapset (this is a clone)" << dendl
;
8575 ctx
->op_t
->setattrs(soid
, attrs
);
8578 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
8583 pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
8584 ctx
->obs
->oi
.version
,
8585 ctx
->user_at_version
, ctx
->reqid
,
8587 (ctx
->op
&& ctx
->op
->allows_returnvec()) ? result
: 0));
8588 if (ctx
->op
&& ctx
->op
->allows_returnvec()) {
8589 // also the per-op values
8590 ctx
->log
.back().set_op_returns(*ctx
->ops
);
8591 dout(20) << __func__
<< " op_returns " << ctx
->log
.back().op_returns
8595 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
8596 dout(20) << __func__
<< " object " << soid
<< " marks clean_regions " << ctx
->log
.back().clean_regions
<< dendl
;
8598 if (soid
.snap
< CEPH_NOSNAP
) {
8599 switch (log_op_type
) {
8600 case pg_log_entry_t::MODIFY
:
8601 case pg_log_entry_t::PROMOTE
:
8602 case pg_log_entry_t::CLEAN
:
8603 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
8605 encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
8612 if (!ctx
->extra_reqids
.empty()) {
8613 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< " "
8614 << ctx
->extra_reqid_return_codes
<< dendl
;
8615 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
8616 ctx
->log
.back().extra_reqid_return_codes
.swap(ctx
->extra_reqid_return_codes
);
8619 // apply new object state.
8620 ctx
->obc
->obs
= ctx
->new_obs
;
8622 if (soid
.is_head() && !ctx
->obc
->obs
.exists
) {
8623 ctx
->obc
->ssc
->exists
= false;
8624 ctx
->obc
->ssc
->snapset
= SnapSet();
8626 ctx
->obc
->ssc
->exists
= true;
8627 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
8631 void PrimaryLogPG::apply_stats(
8632 const hobject_t
&soid
,
8633 const object_stat_sum_t
&delta_stats
) {
8635 recovery_state
.apply_op_stats(soid
, delta_stats
);
8636 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
8637 i
!= get_backfill_targets().end();
8640 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
8641 if (soid
> pinfo
.last_backfill
&& soid
<= last_backfill_started
) {
8642 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
8646 if (is_primary() && scrubber
.active
) {
8647 if (soid
< scrubber
.start
) {
8648 dout(20) << __func__
<< " " << soid
<< " < [" << scrubber
.start
8649 << "," << scrubber
.end
<< ")" << dendl
;
8650 scrub_cstat
.add(delta_stats
);
8652 dout(20) << __func__
<< " " << soid
<< " >= [" << scrubber
.start
8653 << "," << scrubber
.end
<< ")" << dendl
;
8658 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
8660 auto m
= ctx
->op
->get_req
<MOSDOp
>();
8661 ceph_assert(ctx
->async_reads_complete());
8663 for (vector
<OSDOp
>::iterator p
= ctx
->ops
->begin();
8664 p
!= ctx
->ops
->end() && result
>= 0; ++p
) {
8665 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
8669 ctx
->bytes_read
+= p
->outdata
.length();
8671 ctx
->reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
8673 MOSDOpReply
*reply
= ctx
->reply
;
8674 ctx
->reply
= nullptr;
8677 if (!ctx
->ignore_log_op_stats
) {
8678 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
8680 publish_stats_to_osd();
8683 // on read, return the current object version
8685 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
8687 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
8689 } else if (result
== -ENOENT
) {
8690 // on ENOENT, set a floor for what the next user version will be.
8691 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
8694 reply
->set_result(result
);
8695 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
8696 osd
->send_message_osd_client(reply
, m
->get_connection());
8700 // ========================================================================
8703 struct C_Copyfrom
: public Context
{
8706 epoch_t last_peering_reset
;
8708 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
8709 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
8710 const PrimaryLogPG::CopyOpRef
& c
)
8711 : pg(p
), oid(o
), last_peering_reset(lpr
),
8714 void finish(int r
) override
{
8715 if (r
== -ECANCELED
)
8717 std::scoped_lock l
{*pg
};
8718 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8719 pg
->process_copy_chunk(oid
, tid
, r
);
8725 struct C_CopyFrom_AsyncReadCb
: public Context
{
8727 object_copy_data_t reply_obj
;
8730 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
8731 osd_op(osd_op
), features(features
), len(0) {}
8732 void finish(int r
) override
{
8738 ceph_assert(len
> 0);
8739 ceph_assert(len
<= reply_obj
.data
.length());
8741 bl
.substr_of(reply_obj
.data
, 0, len
);
8742 reply_obj
.data
.swap(bl
);
8743 encode(reply_obj
, osd_op
->outdata
, features
);
8747 struct C_CopyChunk
: public Context
{
8750 epoch_t last_peering_reset
;
8752 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
8753 uint64_t offset
= 0;
8754 C_CopyChunk(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
8755 const PrimaryLogPG::CopyOpRef
& c
)
8756 : pg(p
), oid(o
), last_peering_reset(lpr
),
8759 void finish(int r
) override
{
8760 if (r
== -ECANCELED
)
8762 std::scoped_lock l
{*pg
};
8763 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8764 pg
->process_copy_chunk_manifest(oid
, tid
, r
, offset
);
8770 int PrimaryLogPG::do_copy_get(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
8771 OSDOp
& osd_op
, ObjectContextRef
&obc
)
8773 object_info_t
& oi
= obc
->obs
.oi
;
8774 hobject_t
& soid
= oi
.soid
;
8776 object_copy_cursor_t cursor
;
8780 decode(out_max
, bp
);
8782 catch (buffer::error
& e
) {
8787 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
8788 uint64_t features
= op
->get_features();
8790 bool async_read_started
= false;
8791 object_copy_data_t _reply_obj
;
8792 C_CopyFrom_AsyncReadCb
*cb
= nullptr;
8793 if (pool
.info
.is_erasure()) {
8794 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
8796 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
8798 reply_obj
.size
= oi
.size
;
8799 reply_obj
.mtime
= oi
.mtime
;
8800 ceph_assert(obc
->ssc
);
8801 if (soid
.snap
< CEPH_NOSNAP
) {
8802 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
8803 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
8804 reply_obj
.snaps
= p
->second
;
8806 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
8808 if (oi
.is_data_digest()) {
8809 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
8810 reply_obj
.data_digest
= oi
.data_digest
;
8812 if (oi
.is_omap_digest()) {
8813 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
8814 reply_obj
.omap_digest
= oi
.omap_digest
;
8816 reply_obj
.truncate_seq
= oi
.truncate_seq
;
8817 reply_obj
.truncate_size
= oi
.truncate_size
;
8820 map
<string
,bufferlist
>& out_attrs
= reply_obj
.attrs
;
8821 if (!cursor
.attr_complete
) {
8822 result
= getattrs_maybe_cache(
8831 cursor
.attr_complete
= true;
8832 dout(20) << " got attrs" << dendl
;
8835 int64_t left
= out_max
- osd_op
.outdata
.length();
8838 bufferlist
& bl
= reply_obj
.data
;
8839 if (left
> 0 && !cursor
.data_complete
) {
8840 if (cursor
.data_offset
< oi
.size
) {
8841 uint64_t max_read
= std::min(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
8843 async_read_started
= true;
8844 ctx
->pending_async_reads
.push_back(
8846 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
8847 make_pair(&bl
, cb
)));
8850 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8851 new ReadFinisher(osd_op
));
8852 result
= -EINPROGRESS
;
8854 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
8856 result
= pgbackend
->objects_read_sync(
8857 oi
.soid
, cursor
.data_offset
, max_read
, osd_op
.op
.flags
, &bl
);
8862 cursor
.data_offset
+= max_read
;
8864 if (cursor
.data_offset
== oi
.size
) {
8865 cursor
.data_complete
= true;
8866 dout(20) << " got data" << dendl
;
8868 ceph_assert(cursor
.data_offset
<= oi
.size
);
8872 uint32_t omap_keys
= 0;
8873 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
8874 cursor
.omap_complete
= true;
8876 if (left
> 0 && !cursor
.omap_complete
) {
8877 ceph_assert(cursor
.data_complete
);
8878 if (cursor
.omap_offset
.empty()) {
8879 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
8880 &reply_obj
.omap_header
);
8882 bufferlist omap_data
;
8883 ObjectMap::ObjectMapIterator iter
=
8884 osd
->store
->get_omap_iterator(ch
, ghobject_t(oi
.soid
));
8886 iter
->upper_bound(cursor
.omap_offset
);
8887 for (; iter
->valid(); iter
->next()) {
8889 encode(iter
->key(), omap_data
);
8890 encode(iter
->value(), omap_data
);
8891 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
8896 encode(omap_keys
, reply_obj
.omap_data
);
8897 reply_obj
.omap_data
.claim_append(omap_data
);
8899 if (iter
->valid()) {
8900 cursor
.omap_offset
= iter
->key();
8902 cursor
.omap_complete
= true;
8903 dout(20) << " got omap" << dendl
;
8908 if (cursor
.is_complete()) {
8909 // include reqids only in the final step. this is a bit fragile
8911 recovery_state
.get_pg_log().get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10,
8913 &reply_obj
.reqid_return_codes
);
8914 dout(20) << " got reqids" << dendl
;
8917 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
8918 << " " << out_attrs
.size() << " attrs"
8919 << " " << bl
.length() << " bytes"
8920 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
8921 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
8922 << omap_keys
<< " keys"
8923 << " " << reply_obj
.reqids
.size() << " reqids"
8925 reply_obj
.cursor
= cursor
;
8926 if (!async_read_started
) {
8927 encode(reply_obj
, osd_op
.outdata
, features
);
8929 if (cb
&& !async_read_started
) {
8939 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
8942 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
8943 uint64_t features
= m
->get_features();
8944 object_copy_data_t reply_obj
;
8946 recovery_state
.get_pg_log().get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
,
8947 &reply_obj
.reqid_return_codes
);
8948 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
8949 encode(reply_obj
, osd_op
.outdata
, features
);
8950 osd_op
.rval
= -ENOENT
;
8951 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
8952 reply
->set_result(-ENOENT
);
8953 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
8954 osd
->send_message_osd_client(reply
, m
->get_connection());
8957 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
8958 hobject_t src
, object_locator_t oloc
,
8959 version_t version
, unsigned flags
,
8960 bool mirror_snapset
,
8961 unsigned src_obj_fadvise_flags
,
8962 unsigned dest_obj_fadvise_flags
)
8964 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
8965 dout(10) << __func__
<< " " << dest
8966 << " from " << src
<< " " << oloc
<< " v" << version
8967 << " flags " << flags
8968 << (mirror_snapset
? " mirror_snapset" : "")
8971 ceph_assert(!mirror_snapset
|| src
.snap
== CEPH_NOSNAP
);
8973 // cancel a previous in-progress copy?
8974 if (copy_ops
.count(dest
)) {
8975 // FIXME: if the src etc match, we could avoid restarting from the
8977 CopyOpRef cop
= copy_ops
[dest
];
8978 vector
<ceph_tid_t
> tids
;
8979 cancel_copy(cop
, false, &tids
);
8980 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
8983 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
8984 mirror_snapset
, src_obj_fadvise_flags
,
8985 dest_obj_fadvise_flags
));
8986 copy_ops
[dest
] = cop
;
8989 if (!obc
->obs
.oi
.has_manifest()) {
8990 _copy_some(obc
, cop
);
8992 if (obc
->obs
.oi
.manifest
.is_redirect()) {
8993 _copy_some(obc
, cop
);
8994 } else if (obc
->obs
.oi
.manifest
.is_chunked()) {
8995 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
8996 _copy_some_manifest(obc
, cop
, p
->first
);
8998 ceph_abort_msg("unrecognized manifest type");
9003 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
9005 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9008 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9009 flags
|= CEPH_OSD_FLAG_FLUSH
;
9010 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9011 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9012 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9013 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9014 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9015 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9016 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9017 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9019 C_GatherBuilder
gather(cct
);
9021 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
9023 ceph_assert(cop
->src
.snap
== CEPH_NOSNAP
);
9025 op
.list_snaps(&cop
->results
.snapset
, NULL
);
9026 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9028 flags
, gather
.new_sub(), NULL
);
9029 cop
->objecter_tid2
= tid
;
9033 if (cop
->results
.user_version
) {
9034 op
.assert_version(cop
->results
.user_version
);
9036 // we should learn the version after the first chunk, if we didn't know
9038 ceph_assert(cop
->cursor
.is_initial());
9040 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
9041 &cop
->results
.object_size
, &cop
->results
.mtime
,
9042 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
9043 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
9044 &cop
->results
.flags
,
9045 &cop
->results
.source_data_digest
,
9046 &cop
->results
.source_omap_digest
,
9047 &cop
->results
.reqids
,
9048 &cop
->results
.reqid_return_codes
,
9049 &cop
->results
.truncate_seq
,
9050 &cop
->results
.truncate_size
,
9052 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9054 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
9055 get_last_peering_reset(), cop
);
9056 gather
.set_finisher(new C_OnFinisher(fin
,
9057 osd
->get_objecter_finisher(get_pg_shard())));
9059 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9060 cop
->src
.snap
, NULL
,
9063 // discover the object version if we don't know it yet
9064 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
9066 cop
->objecter_tid
= tid
;
9070 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc
, CopyOpRef cop
, uint64_t start_offset
)
9072 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9075 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9076 flags
|= CEPH_OSD_FLAG_FLUSH
;
9077 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9078 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9079 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9080 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9081 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9082 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9083 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9084 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9087 uint64_t last_offset
= 0, chunks_size
= 0;
9088 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
9089 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
->chunk_map
.find(start_offset
);
9090 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9092 chunks_size
+= iter
->second
.length
;
9093 last_offset
= iter
->first
;
9094 if (get_copy_chunk_size() < chunks_size
) {
9099 cop
->num_chunk
= num_chunks
;
9100 cop
->start_offset
= start_offset
;
9101 cop
->last_offset
= last_offset
;
9102 dout(20) << __func__
<< " oid " << obc
->obs
.oi
.soid
<< " num_chunks: " << num_chunks
9103 << " start_offset: " << start_offset
<< " chunks_size: " << chunks_size
9104 << " last_offset: " << last_offset
<< dendl
;
9106 iter
= manifest
->chunk_map
.find(start_offset
);
9107 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9108 uint64_t obj_offset
= iter
->first
;
9109 uint64_t length
= manifest
->chunk_map
[iter
->first
].length
;
9110 hobject_t soid
= manifest
->chunk_map
[iter
->first
].oid
;
9111 object_locator_t
oloc(soid
);
9112 CopyCallback
* cb
= NULL
;
9113 CopyOpRef
sub_cop(std::make_shared
<CopyOp
>(cb
, ObjectContextRef(), cop
->src
, oloc
,
9114 cop
->results
.user_version
, cop
->flags
, cop
->mirror_snapset
,
9115 cop
->src_obj_fadvise_flags
, cop
->dest_obj_fadvise_flags
));
9116 sub_cop
->cursor
.data_offset
= obj_offset
;
9117 cop
->chunk_cops
[obj_offset
] = sub_cop
;
9119 int s
= sub_cop
->chunk_ops
.size();
9120 sub_cop
->chunk_ops
.resize(s
+1);
9121 sub_cop
->chunk_ops
[s
].op
.op
= CEPH_OSD_OP_READ
;
9122 sub_cop
->chunk_ops
[s
].op
.extent
.offset
= manifest
->chunk_map
[iter
->first
].offset
;
9123 sub_cop
->chunk_ops
[s
].op
.extent
.length
= length
;
9126 op
.dup(sub_cop
->chunk_ops
);
9128 dout(20) << __func__
<< " tgt_oid: " << soid
.oid
<< " tgt_offset: "
9129 << manifest
->chunk_map
[iter
->first
].offset
9130 << " length: " << length
<< " pool id: " << oloc
.pool
<< dendl
;
9132 if (cop
->results
.user_version
) {
9133 op
.assert_version(cop
->results
.user_version
);
9135 // we should learn the version after the first chunk, if we didn't know
9137 ceph_assert(cop
->cursor
.is_initial());
9139 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9141 C_CopyChunk
*fin
= new C_CopyChunk(this, obc
->obs
.oi
.soid
,
9142 get_last_peering_reset(), cop
);
9143 fin
->offset
= obj_offset
;
9145 ceph_tid_t tid
= osd
->objecter
->read(
9147 sub_cop
->src
.snap
, NULL
,
9149 new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
9150 // discover the object version if we don't know it yet
9151 sub_cop
->results
.user_version
? NULL
: &sub_cop
->results
.user_version
);
9153 sub_cop
->objecter_tid
= tid
;
9154 if (last_offset
< iter
->first
) {
9160 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
9162 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9163 << " " << cpp_strerror(r
) << dendl
;
9164 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9165 if (p
== copy_ops
.end()) {
9166 dout(10) << __func__
<< " no copy_op found" << dendl
;
9169 CopyOpRef cop
= p
->second
;
9170 if (tid
!= cop
->objecter_tid
) {
9171 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
9172 << " tid " << cop
->objecter_tid
<< dendl
;
9176 if (cop
->omap_data
.length() || cop
->omap_header
.length())
9177 cop
->results
.has_omap
= true;
9179 if (r
>= 0 && !pool
.info
.supports_omap() &&
9180 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
9183 cop
->objecter_tid
= 0;
9184 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9185 ObjectContextRef
& cobc
= cop
->obc
;
9190 ceph_assert(cop
->rval
>= 0);
9192 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
9193 // verify snap hasn't been deleted
9194 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
9195 while (p
!= cop
->results
.snaps
.end()) {
9196 // make best effort to sanitize snaps/clones.
9197 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
9198 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
9200 for (vector
<snapid_t
>::iterator q
= p
+ 1;
9201 q
!= cop
->results
.snaps
.end();
9204 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
9209 if (cop
->results
.snaps
.empty()) {
9210 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
9216 ceph_assert(cop
->rval
>= 0);
9218 if (!cop
->temp_cursor
.data_complete
) {
9219 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
9221 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
9222 if (cop
->omap_header
.length()) {
9223 cop
->results
.omap_digest
=
9224 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
9226 if (cop
->omap_data
.length()) {
9228 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
9229 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
9233 if (!cop
->temp_cursor
.attr_complete
) {
9234 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
9235 p
!= cop
->attrs
.end();
9237 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
9242 if (!cop
->cursor
.is_complete()) {
9243 // write out what we have so far
9244 if (cop
->temp_cursor
.is_initial()) {
9245 ceph_assert(!cop
->results
.started_temp_obj
);
9246 cop
->results
.started_temp_obj
= true;
9247 cop
->results
.temp_oid
= generate_temp_object(oid
);
9248 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
9250 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9251 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9252 if (cop
->temp_cursor
.is_initial()) {
9253 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
9255 _write_copy_chunk(cop
, ctx
->op_t
.get());
9256 simple_opc_submit(std::move(ctx
));
9257 dout(10) << __func__
<< " fetching more" << dendl
;
9258 _copy_some(cobc
, cop
);
9263 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
9264 dout(20) << __func__
<< std::hex
9265 << " got digest: rx data 0x" << cop
->results
.data_digest
9266 << " omap 0x" << cop
->results
.omap_digest
9267 << ", source: data 0x" << cop
->results
.source_data_digest
9268 << " omap 0x" << cop
->results
.source_omap_digest
9270 << " flags " << cop
->results
.flags
9273 if (cop
->results
.is_data_digest() &&
9274 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
9275 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
9276 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
9278 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9279 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9280 << " data digest 0x" << cop
->results
.data_digest
9281 << " != source 0x" << cop
->results
.source_data_digest
9286 if (cop
->results
.is_omap_digest() &&
9287 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
9288 derr
<< __func__
<< std::hex
9289 << " omap digest 0x" << cop
->results
.omap_digest
9290 << " != source 0x" << cop
->results
.source_omap_digest
9291 << std::dec
<< dendl
;
9292 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9293 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9294 << " omap digest 0x" << cop
->results
.omap_digest
9295 << " != source 0x" << cop
->results
.source_omap_digest
9300 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
9301 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
9306 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
9307 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
9308 ObjectState
& obs
= cop
->obc
->obs
;
9309 if (cop
->temp_cursor
.is_initial()) {
9310 dout(20) << "fill_in_final_tx: writing "
9311 << "directly to final object" << dendl
;
9312 // write directly to final object
9313 cop
->results
.temp_oid
= obs
.oi
.soid
;
9314 _write_copy_chunk(cop
, t
);
9316 // finish writing to temp object, then move into place
9317 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
9318 _write_copy_chunk(cop
, t
);
9319 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
9321 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
9324 dout(20) << __func__
<< " success; committing" << dendl
;
9327 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9328 CopyCallbackResults
results(r
, &cop
->results
);
9329 cop
->cb
->complete(results
);
9331 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9334 if (r
< 0 && cop
->results
.started_temp_obj
) {
9335 dout(10) << __func__
<< " deleting partial temp object "
9336 << cop
->results
.temp_oid
<< dendl
;
9337 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9338 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9339 ctx
->op_t
->remove(cop
->results
.temp_oid
);
9340 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
9341 simple_opc_submit(std::move(ctx
));
9344 // cancel and requeue proxy ops on this object
9346 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9349 kick_object_context_blocked(cobc
);
9352 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid
, ceph_tid_t tid
, int r
, uint64_t offset
)
9354 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9355 << " " << cpp_strerror(r
) << dendl
;
9356 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9357 if (p
== copy_ops
.end()) {
9358 dout(10) << __func__
<< " no copy_op found" << dendl
;
9361 CopyOpRef obj_cop
= p
->second
;
9362 CopyOpRef chunk_cop
= obj_cop
->chunk_cops
[offset
];
9364 if (tid
!= chunk_cop
->objecter_tid
) {
9365 dout(10) << __func__
<< " tid " << tid
<< " != cop " << chunk_cop
9366 << " tid " << chunk_cop
->objecter_tid
<< dendl
;
9370 if (chunk_cop
->omap_data
.length() || chunk_cop
->omap_header
.length()) {
9374 chunk_cop
->objecter_tid
= 0;
9375 chunk_cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9376 ObjectContextRef
& cobc
= obj_cop
->obc
;
9377 OSDOp
&chunk_data
= chunk_cop
->chunk_ops
[0];
9380 obj_cop
->failed
= true;
9384 if (obj_cop
->failed
) {
9387 if (!chunk_data
.outdata
.length()) {
9389 obj_cop
->failed
= true;
9393 obj_cop
->num_chunk
--;
9395 /* check all of the copyop are completed */
9396 if (obj_cop
->num_chunk
) {
9397 dout(20) << __func__
<< " num_chunk: " << obj_cop
->num_chunk
<< dendl
;
9402 OpContextUPtr ctx
= simple_opc_create(obj_cop
->obc
);
9403 if (!ctx
->lock_manager
.take_write_lock(
9404 obj_cop
->obc
->obs
.oi
.soid
,
9406 // recovery op can take read lock.
9407 // so need to wait for recovery completion
9409 obj_cop
->failed
= true;
9410 close_op_ctx(ctx
.release());
9413 dout(20) << __func__
<< " took lock on obc, " << obj_cop
->obc
->rwstate
<< dendl
;
9415 PGTransaction
*t
= ctx
->op_t
.get();
9416 ObjectState
& obs
= ctx
->new_obs
;
9417 for (auto p
: obj_cop
->chunk_cops
) {
9418 OSDOp
&sub_chunk
= p
.second
->chunk_ops
[0];
9419 t
->write(cobc
->obs
.oi
.soid
,
9420 p
.second
->cursor
.data_offset
,
9421 sub_chunk
.outdata
.length(),
9423 p
.second
->dest_obj_fadvise_flags
);
9424 dout(20) << __func__
<< " offset: " << p
.second
->cursor
.data_offset
9425 << " length: " << sub_chunk
.outdata
.length() << dendl
;
9426 write_update_size_and_usage(ctx
->delta_stats
, obs
.oi
, ctx
->modified_ranges
,
9427 p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9428 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_DIRTY
);
9429 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_MISSING
);
9430 ctx
->clean_regions
.mark_data_region_dirty(p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9431 sub_chunk
.outdata
.clear();
9433 obs
.oi
.clear_data_digest();
9434 ctx
->at_version
= get_next_version();
9435 finish_ctx(ctx
.get(), pg_log_entry_t::PROMOTE
);
9436 simple_opc_submit(std::move(ctx
));
9438 auto p
= cobc
->obs
.oi
.manifest
.chunk_map
.rbegin();
9439 /* check remaining work */
9440 if (p
!= cobc
->obs
.oi
.manifest
.chunk_map
.rend()) {
9441 if (obj_cop
->last_offset
>= p
->first
+ p
->second
.length
) {
9442 for (auto &en
: cobc
->obs
.oi
.manifest
.chunk_map
) {
9443 if (obj_cop
->last_offset
< en
.first
) {
9444 _copy_some_manifest(cobc
, obj_cop
, en
.first
);
9453 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9454 CopyCallbackResults
results(r
, &obj_cop
->results
);
9455 obj_cop
->cb
->complete(results
);
9457 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9460 // cancel and requeue proxy ops on this object
9462 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9465 kick_object_context_blocked(cobc
);
9468 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid
) {
9469 vector
<ceph_tid_t
> tids
;
9470 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
9471 it
!= proxyread_ops
.end();) {
9472 if (it
->second
->soid
== oid
) {
9473 cancel_proxy_read((it
++)->second
, &tids
);
9478 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
9479 it
!= proxywrite_ops
.end();) {
9480 if (it
->second
->soid
== oid
) {
9481 cancel_proxy_write((it
++)->second
, &tids
);
9486 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9487 kick_proxy_ops_blocked(oid
);
9490 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop
, PGTransaction
*t
)
9492 dout(20) << __func__
<< " " << cop
9493 << " " << cop
->attrs
.size() << " attrs"
9494 << " " << cop
->data
.length() << " bytes"
9495 << " " << cop
->omap_header
.length() << " omap header bytes"
9496 << " " << cop
->omap_data
.length() << " omap data bytes"
9498 if (!cop
->temp_cursor
.attr_complete
) {
9499 t
->create(cop
->results
.temp_oid
);
9501 if (!cop
->temp_cursor
.data_complete
) {
9502 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9503 cop
->cursor
.data_offset
);
9504 if (pool
.info
.required_alignment() &&
9505 !cop
->cursor
.data_complete
) {
9507 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9508 * to pick it up on the next pass.
9510 ceph_assert(cop
->temp_cursor
.data_offset
%
9511 pool
.info
.required_alignment() == 0);
9512 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
9514 cop
->data
.length() % pool
.info
.required_alignment();
9516 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
9518 cop
->cursor
.data_offset
-= to_trim
;
9519 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9520 cop
->cursor
.data_offset
);
9523 if (cop
->data
.length()) {
9525 cop
->results
.temp_oid
,
9526 cop
->temp_cursor
.data_offset
,
9529 cop
->dest_obj_fadvise_flags
);
9533 if (pool
.info
.supports_omap()) {
9534 if (!cop
->temp_cursor
.omap_complete
) {
9535 if (cop
->omap_header
.length()) {
9537 cop
->results
.temp_oid
,
9539 cop
->omap_header
.clear();
9541 if (cop
->omap_data
.length()) {
9542 map
<string
,bufferlist
> omap
;
9543 bufferlist::const_iterator p
= cop
->omap_data
.begin();
9545 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
9546 cop
->omap_data
.clear();
9550 ceph_assert(cop
->omap_header
.length() == 0);
9551 ceph_assert(cop
->omap_data
.length() == 0);
9553 cop
->temp_cursor
= cop
->cursor
;
9556 void PrimaryLogPG::finish_copyfrom(CopyFromCallback
*cb
)
9558 OpContext
*ctx
= cb
->ctx
;
9559 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
9561 ObjectState
& obs
= ctx
->new_obs
;
9563 dout(20) << __func__
<< ": exists, removing" << dendl
;
9564 ctx
->op_t
->remove(obs
.oi
.soid
);
9566 ctx
->delta_stats
.num_objects
++;
9569 if (cb
->is_temp_obj_used()) {
9570 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
9572 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
9574 // CopyFromCallback fills this in for us
9575 obs
.oi
.user_version
= ctx
->user_at_version
;
9577 if (cb
->results
->is_data_digest()) {
9578 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
9580 obs
.oi
.clear_data_digest();
9582 if (cb
->results
->is_omap_digest()) {
9583 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
9585 obs
.oi
.clear_omap_digest();
9588 obs
.oi
.truncate_seq
= cb
->truncate_seq
;
9589 obs
.oi
.truncate_size
= cb
->truncate_size
;
9591 obs
.oi
.mtime
= ceph::real_clock::to_timespec(cb
->results
->mtime
);
9592 ctx
->mtime
= utime_t();
9594 ctx
->extra_reqids
= cb
->results
->reqids
;
9595 ctx
->extra_reqid_return_codes
= cb
->results
->reqid_return_codes
;
9597 // cache: clear whiteout?
9598 if (obs
.oi
.is_whiteout()) {
9599 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
9600 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
9601 --ctx
->delta_stats
.num_whiteouts
;
9604 if (cb
->results
->has_omap
) {
9605 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
9606 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
9607 ctx
->clean_regions
.mark_omap_dirty();
9609 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
9610 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
9613 interval_set
<uint64_t> ch
;
9614 if (obs
.oi
.size
> 0)
9615 ch
.insert(0, obs
.oi
.size
);
9616 ctx
->modified_ranges
.union_of(ch
);
9617 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, cb
->get_data_size()));
9619 if (cb
->get_data_size() != obs
.oi
.size
) {
9620 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
9621 obs
.oi
.size
= cb
->get_data_size();
9622 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
9624 ctx
->delta_stats
.num_wr
++;
9625 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(obs
.oi
.size
, 10);
9627 osd
->logger
->inc(l_osd_copyfrom
);
9630 void PrimaryLogPG::finish_promote(int r
, CopyResults
*results
,
9631 ObjectContextRef obc
)
9633 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
9634 dout(10) << __func__
<< " " << soid
<< " r=" << r
9635 << " uv" << results
->user_version
<< dendl
;
9637 if (r
== -ECANCELED
) {
9641 if (r
!= -ENOENT
&& soid
.is_snap()) {
9642 if (results
->snaps
.empty()) {
9643 // we must have read "snap" content from the head object in the
9644 // base pool. use snap_seq to construct what snaps should be
9645 // for this clone (what is was before we evicted the clean clone
9646 // from this pool, and what it will be when we flush and the
9647 // clone eventually happens in the base pool). we want to use
9648 // snaps in (results->snap_seq,soid.snap]
9649 SnapSet
& snapset
= obc
->ssc
->snapset
;
9650 for (auto p
= snapset
.clone_snaps
.rbegin();
9651 p
!= snapset
.clone_snaps
.rend();
9653 for (auto snap
: p
->second
) {
9654 if (snap
> soid
.snap
) {
9657 if (snap
<= results
->snap_seq
) {
9660 results
->snaps
.push_back(snap
);
9665 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
9666 filter_snapc(results
->snaps
);
9668 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
9669 if (results
->snaps
.empty()) {
9670 dout(20) << __func__
9671 << " snaps are empty, clone is invalid,"
9672 << " setting r to ENOENT" << dendl
;
9677 if (r
< 0 && results
->started_temp_obj
) {
9678 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
9679 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
9680 ceph_assert(tempobc
);
9681 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9682 ctx
->op_t
->remove(results
->temp_oid
);
9683 simple_opc_submit(std::move(ctx
));
9684 results
->started_temp_obj
= false;
9687 if (r
== -ENOENT
&& soid
.is_snap()) {
9688 dout(10) << __func__
9689 << ": enoent while trying to promote clone, " << soid
9690 << " must have been trimmed, removing from snapset"
9692 hobject_t
head(soid
.get_head());
9693 ObjectContextRef obc
= get_object_context(head
, false);
9696 OpContextUPtr tctx
= simple_opc_create(obc
);
9697 tctx
->at_version
= get_next_version();
9698 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
9699 filter_snapc(tctx
->new_snapset
.snaps
);
9701 tctx
->new_snapset
.snaps
.clear();
9703 vector
<snapid_t
> new_clones
;
9704 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
9705 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
9706 i
!= tctx
->new_snapset
.clones
.end();
9708 if (*i
!= soid
.snap
) {
9709 new_clones
.push_back(*i
);
9710 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
9711 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
9712 new_clone_snaps
[*i
] = p
->second
;
9716 tctx
->new_snapset
.clones
.swap(new_clones
);
9717 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
9718 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
9719 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
9721 // take RWWRITE lock for duration of our local write. ignore starvation.
9722 if (!tctx
->lock_manager
.take_write_lock(
9725 ceph_abort_msg("problem!");
9727 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
9729 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
9731 simple_opc_submit(std::move(tctx
));
9735 bool whiteout
= false;
9737 ceph_assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
9738 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
9742 if (r
< 0 && !whiteout
) {
9743 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
9744 // pass error to everyone blocked on this object
9745 // FIXME: this is pretty sloppy, but at this point we got
9746 // something unexpected and don't have many other options.
9747 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
9748 waiting_for_blocked_object
.find(soid
);
9749 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
9750 while (!blocked_iter
->second
.empty()) {
9751 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
9752 blocked_iter
->second
.pop_front();
9754 waiting_for_blocked_object
.erase(blocked_iter
);
9759 osd
->promote_finish(results
->object_size
);
9761 OpContextUPtr tctx
= simple_opc_create(obc
);
9762 tctx
->at_version
= get_next_version();
9764 if (!obc
->obs
.oi
.has_manifest()) {
9765 ++tctx
->delta_stats
.num_objects
;
9767 if (soid
.snap
< CEPH_NOSNAP
)
9768 ++tctx
->delta_stats
.num_object_clones
;
9769 tctx
->new_obs
.exists
= true;
9771 tctx
->extra_reqids
= results
->reqids
;
9772 tctx
->extra_reqid_return_codes
= results
->reqid_return_codes
;
9775 // create a whiteout
9776 tctx
->op_t
->create(soid
);
9777 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
9778 ++tctx
->delta_stats
.num_whiteouts
;
9779 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
9780 osd
->logger
->inc(l_osd_tier_whiteout
);
9782 if (results
->has_omap
) {
9783 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
9784 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
9785 ++tctx
->delta_stats
.num_objects_omap
;
9788 results
->fill_in_final_tx(tctx
->op_t
.get());
9789 if (results
->started_temp_obj
) {
9790 tctx
->discard_temp_oid
= results
->temp_oid
;
9792 tctx
->new_obs
.oi
.size
= results
->object_size
;
9793 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
9794 tctx
->new_obs
.oi
.mtime
= ceph::real_clock::to_timespec(results
->mtime
);
9795 tctx
->mtime
= utime_t();
9796 if (results
->is_data_digest()) {
9797 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
9799 tctx
->new_obs
.oi
.clear_data_digest();
9801 if (results
->object_size
)
9802 tctx
->clean_regions
.mark_data_region_dirty(0, results
->object_size
);
9803 if (results
->is_omap_digest()) {
9804 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
9806 tctx
->new_obs
.oi
.clear_omap_digest();
9808 if (results
->has_omap
)
9809 tctx
->clean_regions
.mark_omap_dirty();
9810 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
9811 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
9813 if (soid
.snap
!= CEPH_NOSNAP
) {
9814 ceph_assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
9815 ceph_assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
9816 ceph_assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
9817 results
->object_size
);
9818 ceph_assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
9820 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
9822 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
9826 if (results
->mirror_snapset
) {
9827 ceph_assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
9828 tctx
->new_snapset
.from_snap_set(
9830 get_osdmap()->require_osd_release
< ceph_release_t::luminous
);
9832 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
9834 // take RWWRITE lock for duration of our local write. ignore starvation.
9835 if (!tctx
->lock_manager
.take_write_lock(
9838 ceph_abort_msg("problem!");
9840 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
9842 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
9844 simple_opc_submit(std::move(tctx
));
9846 osd
->logger
->inc(l_osd_tier_promote
);
9849 agent_state
->is_idle())
9850 agent_choose_mode();
9853 void PrimaryLogPG::finish_promote_manifest(int r
, CopyResults
*results
,
9854 ObjectContextRef obc
)
9856 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
9857 dout(10) << __func__
<< " " << soid
<< " r=" << r
9858 << " uv" << results
->user_version
<< dendl
;
9860 if (r
== -ECANCELED
|| r
== -EAGAIN
) {
9865 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
9866 // pass error to everyone blocked on this object
9867 // FIXME: this is pretty sloppy, but at this point we got
9868 // something unexpected and don't have many other options.
9869 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
9870 waiting_for_blocked_object
.find(soid
);
9871 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
9872 while (!blocked_iter
->second
.empty()) {
9873 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
9874 blocked_iter
->second
.pop_front();
9876 waiting_for_blocked_object
.erase(blocked_iter
);
9881 osd
->promote_finish(results
->object_size
);
9882 osd
->logger
->inc(l_osd_tier_promote
);
9885 agent_state
->is_idle())
9886 agent_choose_mode();
9889 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
,
9890 vector
<ceph_tid_t
> *tids
)
9892 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
9893 << " from " << cop
->src
<< " " << cop
->oloc
9894 << " v" << cop
->results
.user_version
<< dendl
;
9896 // cancel objecter op, if we can
9897 if (cop
->objecter_tid
) {
9898 tids
->push_back(cop
->objecter_tid
);
9899 cop
->objecter_tid
= 0;
9900 if (cop
->objecter_tid2
) {
9901 tids
->push_back(cop
->objecter_tid2
);
9902 cop
->objecter_tid2
= 0;
9906 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
9907 cop
->obc
->stop_block();
9909 kick_object_context_blocked(cop
->obc
);
9910 cop
->results
.should_requeue
= requeue
;
9911 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
9912 cop
->cb
->complete(result
);
9914 // There may still be an objecter callback referencing this copy op.
9915 // That callback will not need the obc since it's been canceled, and
9916 // we need the obc reference to go away prior to flush.
9917 cop
->obc
= ObjectContextRef();
9920 void PrimaryLogPG::cancel_copy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
9922 dout(10) << __func__
<< dendl
;
9923 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
9924 while (p
!= copy_ops
.end()) {
9925 // requeue this op? can I queue up all of them?
9926 cancel_copy((p
++)->second
, requeue
, tids
);
9931 // ========================================================================
9934 // Flush a dirty object in the cache tier by writing it back to the
9935 // base tier. The sequence looks like:
9937 // * send a copy-from operation to the base tier to copy the current
9938 // version of the object
9939 // * base tier will pull the object via (perhaps multiple) copy-get(s)
9940 // * on completion, we check if the object has been modified. if so,
9941 // just reply with -EAGAIN.
9942 // * try to take a write lock so we can clear the dirty flag. if this
9943 // fails, wait and retry
9944 // * start a repop that clears the bit.
9946 // If we have to wait, we will retry by coming back through the
9947 // start_flush method. We check if a flush is already in progress
9948 // and, if so, try to finish it by rechecking the version and trying
9949 // to clear the dirty bit.
9951 // In order for the cache-flush (a write op) to not block the copy-get
9952 // from reading the object, the client *must* set the SKIPRWLOCKS
9955 // NOTE: normally writes are strictly ordered for the client, but
9956 // flushes are special in that they can be reordered with respect to
9957 // other writes. In particular, we can't have a flush request block
9958 // an update to the cache pool object!
9960 struct C_Flush
: public Context
{
9963 epoch_t last_peering_reset
;
9966 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
9967 : pg(p
), oid(o
), last_peering_reset(lpr
),
9968 tid(0), start(ceph_clock_now())
9970 void finish(int r
) override
{
9971 if (r
== -ECANCELED
)
9973 std::scoped_lock locker
{*pg
};
9974 if (last_peering_reset
== pg
->get_last_peering_reset()) {
9975 pg
->finish_flush(oid
, tid
, r
);
9976 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
9981 int PrimaryLogPG::start_flush(
9982 OpRequestRef op
, ObjectContextRef obc
,
9983 bool blocking
, hobject_t
*pmissing
,
9984 std::optional
<std::function
<void()>> &&on_flush
)
9986 const object_info_t
& oi
= obc
->obs
.oi
;
9987 const hobject_t
& soid
= oi
.soid
;
9988 dout(10) << __func__
<< " " << soid
9989 << " v" << oi
.version
9990 << " uv" << oi
.user_version
9991 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
9994 bool preoctopus_compat
=
9995 get_osdmap()->require_osd_release
< ceph_release_t::octopus
;
9997 if (preoctopus_compat
) {
9998 // for pre-octopus compatibility, filter SnapSet::snaps. not
9999 // certain we need this, but let's be conservative.
10000 snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
10002 // NOTE: change this to a const ref when we remove this compat code
10003 snapset
= obc
->ssc
->snapset
;
10006 // verify there are no (older) check for dirty clones
10008 dout(20) << " snapset " << snapset
<< dendl
;
10009 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
10010 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
10012 if (p
!= snapset
.clones
.rend()) {
10013 hobject_t next
= soid
;
10015 ceph_assert(next
.snap
< soid
.snap
);
10016 if (recovery_state
.get_pg_log().get_missing().is_missing(next
)) {
10017 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
10022 ObjectContextRef older_obc
= get_object_context(next
, false);
10024 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
10026 if (older_obc
->obs
.oi
.is_dirty()) {
10027 dout(10) << __func__
<< " next oldest clone is dirty: "
10028 << older_obc
->obs
.oi
<< dendl
;
10032 dout(20) << __func__
<< " next oldest clone " << next
10033 << " is not present; implicitly clean" << dendl
;
10036 dout(20) << __func__
<< " no older clones" << dendl
;
10041 obc
->start_block();
10043 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
10044 if (p
!= flush_ops
.end()) {
10045 FlushOpRef fop
= p
->second
;
10046 if (fop
->op
== op
) {
10047 // we couldn't take the write lock on a cache-try-flush before;
10048 // now we are trying again for the lock.
10049 return try_flush_mark_clean(fop
);
10051 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
10052 (fop
->blocking
|| !blocking
)) {
10053 // nonblocking can join anything
10054 // blocking can only join a blocking flush
10055 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
10057 fop
->dup_ops
.push_back(op
);
10058 return -EAGAIN
; // clean up this ctx; op will retry later
10061 // cancel current flush since it will fail anyway, or because we
10062 // are blocking and the existing flush is nonblocking.
10063 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
10065 osd
->reply_op_error(fop
->op
, -EBUSY
);
10066 while (!fop
->dup_ops
.empty()) {
10067 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
10068 fop
->dup_ops
.pop_front();
10070 vector
<ceph_tid_t
> tids
;
10071 cancel_flush(fop
, false, &tids
);
10072 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10075 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
10076 int r
= start_manifest_flush(op
, obc
, blocking
, std::move(on_flush
));
10077 if (r
!= -EINPROGRESS
) {
10085 * In general, we need to send a delete and a copyfrom.
10086 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10087 * where 4 is marked as clean. To flush 10, we have to:
10088 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10089 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10091 * There is a complicating case. Supposed there had been a clone 7
10092 * for snaps [7, 6] which has been trimmed since they no longer exist.
10093 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10094 * the delete, the snap will be promoted to 5, and the head will become
10095 * a whiteout. When the copy-from goes through, we'll end up with
10096 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10098 * Another complication is the case where there is an interval change
10099 * after doing the delete and the flush but before marking the object
10100 * clean. We'll happily delete head and then recreate it at the same
10101 * sequence number, which works out ok.
10104 SnapContext snapc
, dsnapc
;
10105 if (snapset
.seq
!= 0) {
10106 if (soid
.snap
== CEPH_NOSNAP
) {
10107 snapc
= snapset
.get_ssc_as_of(snapset
.seq
);
10109 snapid_t min_included_snap
;
10110 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
10111 ceph_assert(p
!= snapset
.clone_snaps
.end());
10112 min_included_snap
= p
->second
.back();
10113 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
10116 snapid_t prev_snapc
= 0;
10117 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
10118 citer
!= snapset
.clones
.rend();
10120 if (*citer
< soid
.snap
) {
10121 prev_snapc
= *citer
;
10126 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
10129 object_locator_t
base_oloc(soid
);
10130 base_oloc
.pool
= pool
.info
.tier_of
;
10132 if (dsnapc
.seq
< snapc
.seq
) {
10135 osd
->objecter
->mutate(
10140 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10141 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
10142 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
10143 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
10146 FlushOpRef
fop(std::make_shared
<FlushOp
>());
10148 fop
->flushed_version
= oi
.user_version
;
10149 fop
->blocking
= blocking
;
10150 fop
->on_flush
= std::move(on_flush
);
10154 if (oi
.is_whiteout()) {
10155 fop
->removal
= true;
10158 object_locator_t
oloc(soid
);
10159 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
10160 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
10161 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
10162 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
10163 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
10164 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
10166 //mean the base tier don't cache data after this
10167 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
10168 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
10170 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
10172 ceph_tid_t tid
= osd
->objecter
->mutate(
10173 soid
.oid
, base_oloc
, o
, snapc
,
10174 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10175 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
10176 new C_OnFinisher(fin
,
10177 osd
->get_objecter_finisher(get_pg_shard())));
10178 /* we're under the pg lock and fin->finish() is grabbing that */
10180 fop
->objecter_tid
= tid
;
10182 flush_ops
[soid
] = fop
;
10184 recovery_state
.update_stats(
10185 [&oi
](auto &history
, auto &stats
) {
10186 stats
.stats
.sum
.num_flush
++;
10187 stats
.stats
.sum
.num_flush_kb
+= shift_round_up(oi
.size
, 10);
10190 return -EINPROGRESS
;
10193 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
10195 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10196 << " " << cpp_strerror(r
) << dendl
;
10197 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
10198 if (p
== flush_ops
.end()) {
10199 dout(10) << __func__
<< " no flush_op found" << dendl
;
10202 FlushOpRef fop
= p
->second
;
10203 if (tid
!= fop
->objecter_tid
&& !fop
->obc
->obs
.oi
.has_manifest()) {
10204 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
10205 << " tid " << fop
->objecter_tid
<< dendl
;
10208 ObjectContextRef obc
= fop
->obc
;
10209 fop
->objecter_tid
= 0;
10211 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
10213 osd
->reply_op_error(fop
->op
, -EBUSY
);
10214 if (fop
->blocking
) {
10216 kick_object_context_blocked(obc
);
10219 if (!fop
->dup_ops
.empty()) {
10220 dout(20) << __func__
<< " requeueing dups" << dendl
;
10221 requeue_ops(fop
->dup_ops
);
10223 if (fop
->on_flush
) {
10224 (*(fop
->on_flush
))();
10225 fop
->on_flush
= std::nullopt
;
10227 flush_ops
.erase(oid
);
10231 r
= try_flush_mark_clean(fop
);
10232 if (r
== -EBUSY
&& fop
->op
) {
10233 osd
->reply_op_error(fop
->op
, r
);
10237 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
10239 ObjectContextRef obc
= fop
->obc
;
10240 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
10242 if (fop
->blocking
) {
10244 kick_object_context_blocked(obc
);
10247 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
10248 !obc
->obs
.exists
) {
10249 if (obc
->obs
.exists
)
10250 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
10251 << " != current " << obc
->obs
.oi
.user_version
10254 dout(10) << __func__
<< " object no longer exists" << dendl
;
10256 if (!fop
->dup_ops
.empty()) {
10257 dout(20) << __func__
<< " requeueing dups" << dendl
;
10258 requeue_ops(fop
->dup_ops
);
10260 if (fop
->on_flush
) {
10261 (*(fop
->on_flush
))();
10262 fop
->on_flush
= std::nullopt
;
10264 flush_ops
.erase(oid
);
10266 osd
->logger
->inc(l_osd_tier_flush_fail
);
10268 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10272 if (!fop
->blocking
&&
10273 write_blocked_by_scrub(oid
)) {
10275 dout(10) << __func__
<< " blocked by scrub" << dendl
;
10276 requeue_op(fop
->op
);
10277 requeue_ops(fop
->dup_ops
);
10278 return -EAGAIN
; // will retry
10280 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10281 vector
<ceph_tid_t
> tids
;
10282 cancel_flush(fop
, false, &tids
);
10283 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10288 // successfully flushed, can we evict this object?
10289 if (!obc
->obs
.oi
.has_manifest() && !fop
->op
&&
10290 agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
10291 agent_maybe_evict(obc
, true)) {
10292 osd
->logger
->inc(l_osd_tier_clean
);
10293 if (fop
->on_flush
) {
10294 (*(fop
->on_flush
))();
10295 fop
->on_flush
= std::nullopt
;
10297 flush_ops
.erase(oid
);
10301 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
10302 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
10304 // successfully flushed; can we clear the dirty bit?
10305 // try to take the lock manually, since we don't
10307 if (ctx
->lock_manager
.get_lock_type(
10312 dout(20) << __func__
<< " took write lock" << dendl
;
10313 } else if (fop
->op
) {
10314 dout(10) << __func__
<< " waiting on write lock " << fop
->op
<< " "
10315 << fop
->dup_ops
<< dendl
;
10316 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10317 for (auto op
: fop
->dup_ops
) {
10318 bool locked
= ctx
->lock_manager
.get_lock_type(
10323 ceph_assert(!locked
);
10325 close_op_ctx(ctx
.release());
10326 return -EAGAIN
; // will retry
10328 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
10329 close_op_ctx(ctx
.release());
10330 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10331 vector
<ceph_tid_t
> tids
;
10332 cancel_flush(fop
, false, &tids
);
10333 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10337 if (fop
->on_flush
) {
10338 ctx
->register_on_finish(*(fop
->on_flush
));
10339 fop
->on_flush
= std::nullopt
;
10342 ctx
->at_version
= get_next_version();
10344 ctx
->new_obs
= obc
->obs
;
10345 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
10346 --ctx
->delta_stats
.num_objects_dirty
;
10347 if (fop
->obc
->obs
.oi
.has_manifest()) {
10348 ceph_assert(obc
->obs
.oi
.manifest
.is_chunked());
10349 PGTransaction
* t
= ctx
->op_t
.get();
10350 uint64_t chunks_size
= 0;
10351 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10352 chunks_size
+= p
.second
.length
;
10354 if (ctx
->new_obs
.oi
.is_omap() && pool
.info
.supports_omap()) {
10355 t
->omap_clear(oid
);
10356 ctx
->new_obs
.oi
.clear_omap_digest();
10357 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
10358 ctx
->clean_regions
.mark_omap_dirty();
10360 if (obc
->obs
.oi
.size
== chunks_size
) {
10361 t
->truncate(oid
, 0);
10362 interval_set
<uint64_t> trim
;
10363 trim
.insert(0, ctx
->new_obs
.oi
.size
);
10364 ctx
->modified_ranges
.union_of(trim
);
10365 truncate_update_size_and_usage(ctx
->delta_stats
,
10368 ctx
->clean_regions
.mark_data_region_dirty(0, ctx
->new_obs
.oi
.size
);
10369 ctx
->new_obs
.oi
.new_object();
10370 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10371 p
.second
.clear_flag(chunk_info_t::FLAG_DIRTY
);
10372 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
10375 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10376 if (p
.second
.is_dirty()) {
10377 dout(20) << __func__
<< " offset: " << p
.second
.offset
10378 << " length: " << p
.second
.length
<< dendl
;
10379 p
.second
.clear_flag(chunk_info_t::FLAG_DIRTY
);
10380 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
); // CLEAN
10386 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
10388 osd
->logger
->inc(l_osd_tier_clean
);
10390 if (!fop
->dup_ops
.empty() || fop
->op
) {
10391 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
10392 list
<OpRequestRef
> ls
;
10394 ls
.push_back(fop
->op
);
10395 ls
.splice(ls
.end(), fop
->dup_ops
);
10399 simple_opc_submit(std::move(ctx
));
10401 flush_ops
.erase(oid
);
10404 osd
->logger
->inc(l_osd_tier_flush
);
10406 osd
->logger
->inc(l_osd_tier_try_flush
);
10408 return -EINPROGRESS
;
10411 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
,
10412 vector
<ceph_tid_t
> *tids
)
10414 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
10415 << fop
->objecter_tid
<< dendl
;
10416 if (fop
->objecter_tid
) {
10417 tids
->push_back(fop
->objecter_tid
);
10418 fop
->objecter_tid
= 0;
10420 if (fop
->io_tids
.size()) {
10421 for (auto &p
: fop
->io_tids
) {
10422 tids
->push_back(p
.second
);
10426 if (fop
->blocking
&& fop
->obc
->is_blocked()) {
10427 fop
->obc
->stop_block();
10428 kick_object_context_blocked(fop
->obc
);
10432 requeue_op(fop
->op
);
10433 requeue_ops(fop
->dup_ops
);
10435 if (fop
->on_flush
) {
10436 (*(fop
->on_flush
))();
10437 fop
->on_flush
= std::nullopt
;
10439 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
10442 void PrimaryLogPG::cancel_flush_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
10444 dout(10) << __func__
<< dendl
;
10445 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
10446 while (p
!= flush_ops
.end()) {
10447 cancel_flush((p
++)->second
, requeue
, tids
);
10451 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
10453 if (!pool
.info
.allow_incomplete_clones())
10455 if (is_missing_object(coid
))
10457 ObjectContextRef obc
= get_object_context(coid
, false);
10458 return obc
&& obc
->obs
.exists
;
10461 // ========================================================================
10464 class C_OSD_RepopCommit
: public Context
{
10465 PrimaryLogPGRef pg
;
10466 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
10468 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
10469 : pg(pg
), repop(repop
) {}
10470 void finish(int) override
{
10471 pg
->repop_all_committed(repop
.get());
10475 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
10477 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
10479 repop
->all_committed
= true;
10480 if (!repop
->rep_aborted
) {
10481 if (repop
->v
!= eversion_t()) {
10482 recovery_state
.complete_write(repop
->v
, repop
->pg_local_last_complete
);
10488 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
10490 dout(10) << "op_applied version " << applied_version
<< dendl
;
10491 ceph_assert(applied_version
!= eversion_t());
10492 ceph_assert(applied_version
<= info
.last_update
);
10493 recovery_state
.local_write_applied(applied_version
);
10494 if (is_primary()) {
10495 if (scrubber
.active
) {
10496 if (recovery_state
.get_last_update_applied() >=
10497 scrubber
.subset_last_update
) {
10498 requeue_scrub(ops_blocked_by_scrub());
10501 ceph_assert(scrubber
.start
== scrubber
.end
);
10506 void PrimaryLogPG::eval_repop(RepGather
*repop
)
10508 dout(10) << "eval_repop " << *repop
10509 << (repop
->op
&& repop
->op
->get_req
<MOSDOp
>() ? "" : " (no op)") << dendl
;
10512 if (repop
->all_committed
) {
10513 dout(10) << " commit: " << *repop
<< dendl
;
10514 for (auto p
= repop
->on_committed
.begin();
10515 p
!= repop
->on_committed
.end();
10516 repop
->on_committed
.erase(p
++)) {
10519 // send dup commits, in order
10520 auto it
= waiting_for_ondisk
.find(repop
->v
);
10521 if (it
!= waiting_for_ondisk
.end()) {
10522 ceph_assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
10523 for (auto& i
: it
->second
) {
10524 int return_code
= repop
->r
;
10525 if (return_code
>= 0) {
10526 return_code
= std::get
<2>(i
);
10528 osd
->reply_op_error(std::get
<0>(i
), return_code
, repop
->v
,
10529 std::get
<1>(i
), std::get
<3>(i
));
10531 waiting_for_ondisk
.erase(it
);
10534 publish_stats_to_osd();
10536 dout(10) << " removing " << *repop
<< dendl
;
10537 ceph_assert(!repop_queue
.empty());
10538 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
10539 if (repop_queue
.front() == repop
) {
10540 RepGather
*to_remove
= nullptr;
10541 while (!repop_queue
.empty() &&
10542 (to_remove
= repop_queue
.front())->all_committed
) {
10543 repop_queue
.pop_front();
10544 for (auto p
= to_remove
->on_success
.begin();
10545 p
!= to_remove
->on_success
.end();
10546 to_remove
->on_success
.erase(p
++)) {
10549 remove_repop(to_remove
);
10555 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
10558 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
10559 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
10563 repop
->v
= ctx
->at_version
;
10565 ctx
->op_t
->add_obc(ctx
->obc
);
10566 if (ctx
->clone_obc
) {
10567 ctx
->op_t
->add_obc(ctx
->clone_obc
);
10569 if (ctx
->head_obc
) {
10570 ctx
->op_t
->add_obc(ctx
->head_obc
);
10573 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
10574 if (!(ctx
->log
.empty())) {
10575 ceph_assert(ctx
->at_version
>= projected_last_update
);
10576 projected_last_update
= ctx
->at_version
;
10578 for (auto &&entry
: ctx
->log
) {
10579 projected_log
.add(entry
);
10582 recovery_state
.pre_submit_op(
10586 pgbackend
->submit_transaction(
10590 std::move(ctx
->op_t
),
10591 recovery_state
.get_pg_trim_to(),
10592 recovery_state
.get_min_last_complete_ondisk(),
10594 ctx
->updated_hset_history
,
10601 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
10602 OpContext
*ctx
, ObjectContextRef obc
,
10603 ceph_tid_t rep_tid
)
10606 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
10608 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
10610 RepGather
*repop
= new RepGather(
10611 ctx
, rep_tid
, info
.last_complete
);
10613 repop
->start
= ceph_clock_now();
10615 repop_queue
.push_back(&repop
->queue_item
);
10618 osd
->logger
->inc(l_osd_op_wip
);
10620 dout(10) << __func__
<< ": " << *repop
<< dendl
;
10624 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
10625 eversion_t version
,
10627 ObcLockManager
&&manager
,
10629 std::optional
<std::function
<void(void)> > &&on_complete
)
10631 RepGather
*repop
= new RepGather(
10632 std::move(manager
),
10634 std::move(on_complete
),
10636 info
.last_complete
,
10638 repop
->v
= version
;
10640 repop
->start
= ceph_clock_now();
10642 repop_queue
.push_back(&repop
->queue_item
);
10644 osd
->logger
->inc(l_osd_op_wip
);
10646 dout(10) << __func__
<< ": " << *repop
<< dendl
;
10647 return boost::intrusive_ptr
<RepGather
>(repop
);
10650 void PrimaryLogPG::remove_repop(RepGather
*repop
)
10652 dout(20) << __func__
<< " " << *repop
<< dendl
;
10654 for (auto p
= repop
->on_finish
.begin();
10655 p
!= repop
->on_finish
.end();
10656 repop
->on_finish
.erase(p
++)) {
10660 release_object_locks(
10661 repop
->lock_manager
);
10664 osd
->logger
->dec(l_osd_op_wip
);
10667 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
10669 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
10670 ceph_tid_t rep_tid
= osd
->get_tid();
10671 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
10672 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
10673 ctx
->op_t
.reset(new PGTransaction());
10674 ctx
->mtime
= ceph_clock_now();
10678 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
10680 RepGather
*repop
= new_repop(ctx
.get(), ctx
->obc
, ctx
->reqid
.tid
);
10681 dout(20) << __func__
<< " " << repop
<< dendl
;
10682 issue_repop(repop
, ctx
.get());
10684 recovery_state
.update_trim_to();
10689 void PrimaryLogPG::submit_log_entries(
10690 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
10691 ObcLockManager
&&manager
,
10692 std::optional
<std::function
<void(void)> > &&_on_complete
,
10696 dout(10) << __func__
<< " " << entries
<< dendl
;
10697 ceph_assert(is_primary());
10699 eversion_t version
;
10700 if (!entries
.empty()) {
10701 ceph_assert(entries
.rbegin()->version
>= projected_last_update
);
10702 version
= projected_last_update
= entries
.rbegin()->version
;
10705 boost::intrusive_ptr
<RepGather
> repop
;
10706 std::optional
<std::function
<void(void)> > on_complete
;
10707 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
10711 std::move(manager
),
10713 std::move(_on_complete
));
10715 on_complete
= std::move(_on_complete
);
10718 pgbackend
->call_write_ordered(
10719 [this, entries
, repop
, on_complete
]() {
10720 ObjectStore::Transaction t
;
10721 eversion_t old_last_update
= info
.last_update
;
10722 recovery_state
.merge_new_log_entries(
10723 entries
, t
, recovery_state
.get_pg_trim_to(),
10724 recovery_state
.get_min_last_complete_ondisk());
10726 set
<pg_shard_t
> waiting_on
;
10727 for (set
<pg_shard_t
>::const_iterator i
= get_acting_recovery_backfill().begin();
10728 i
!= get_acting_recovery_backfill().end();
10730 pg_shard_t
peer(*i
);
10731 if (peer
== pg_whoami
) continue;
10732 ceph_assert(recovery_state
.get_peer_missing().count(peer
));
10733 ceph_assert(recovery_state
.has_peer_info(peer
));
10734 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
10735 ceph_assert(repop
);
10736 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
10738 spg_t(info
.pgid
.pgid
, i
->shard
),
10740 get_osdmap_epoch(),
10741 get_last_peering_reset(),
10743 recovery_state
.get_pg_trim_to(),
10744 recovery_state
.get_min_last_complete_ondisk());
10745 osd
->send_message_osd_cluster(
10746 peer
.osd
, m
, get_osdmap_epoch());
10747 waiting_on
.insert(peer
);
10749 MOSDPGLog
*m
= new MOSDPGLog(
10750 peer
.shard
, pg_whoami
.shard
,
10751 info
.last_update
.epoch
,
10752 info
, get_last_peering_reset());
10753 m
->log
.log
= entries
;
10754 m
->log
.tail
= old_last_update
;
10755 m
->log
.head
= info
.last_update
;
10756 osd
->send_message_osd_cluster(
10757 peer
.osd
, m
, get_osdmap_epoch());
10760 ceph_tid_t rep_tid
= repop
->rep_tid
;
10761 waiting_on
.insert(pg_whoami
);
10762 log_entry_update_waiting_on
.insert(
10765 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
10767 struct OnComplete
: public Context
{
10768 PrimaryLogPGRef pg
;
10769 ceph_tid_t rep_tid
;
10772 PrimaryLogPGRef pg
,
10773 ceph_tid_t rep_tid
,
10775 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
10776 void finish(int) override
{
10777 std::scoped_lock l
{*pg
};
10778 if (!pg
->pg_has_reset_since(epoch
)) {
10779 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
10780 ceph_assert(it
!= pg
->log_entry_update_waiting_on
.end());
10781 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
10782 ceph_assert(it2
!= it
->second
.waiting_on
.end());
10783 it
->second
.waiting_on
.erase(it2
);
10784 if (it
->second
.waiting_on
.empty()) {
10785 pg
->repop_all_committed(it
->second
.repop
.get());
10786 pg
->log_entry_update_waiting_on
.erase(it
);
10791 t
.register_on_commit(
10792 new OnComplete
{this, rep_tid
, get_osdmap_epoch()});
10793 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
10794 ceph_assert(r
== 0);
10795 op_applied(info
.last_update
);
10798 recovery_state
.update_trim_to();
10801 void PrimaryLogPG::cancel_log_updates()
10803 // get rid of all the LogUpdateCtx so their references to repops are
10805 log_entry_update_waiting_on
.clear();
10808 // -------------------------------------------------------
10810 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> *ls
)
10812 std::scoped_lock l
{*this};
10813 pair
<hobject_t
, ObjectContextRef
> i
;
10814 while (object_contexts
.get_next(i
.first
, &i
)) {
10815 ObjectContextRef
obc(i
.second
);
10816 get_obc_watchers(obc
, *ls
);
10820 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
10822 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
10823 obc
->watchers
.begin();
10824 j
!= obc
->watchers
.end();
10826 obj_watch_item_t owi
;
10828 owi
.obj
= obc
->obs
.oi
.soid
;
10829 owi
.wi
.addr
= j
->second
->get_peer_addr();
10830 owi
.wi
.name
= j
->second
->get_entity();
10831 owi
.wi
.cookie
= j
->second
->get_cookie();
10832 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
10834 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
10835 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
10837 pg_watchers
.push_back(owi
);
10841 void PrimaryLogPG::check_blacklisted_watchers()
10843 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl
;
10844 pair
<hobject_t
, ObjectContextRef
> i
;
10845 while (object_contexts
.get_next(i
.first
, &i
))
10846 check_blacklisted_obc_watchers(i
.second
);
10849 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc
)
10851 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
10852 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
10853 obc
->watchers
.begin();
10854 k
!= obc
->watchers
.end();
10856 //Advance iterator now so handle_watch_timeout() can erase element
10857 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
10858 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
10859 entity_addr_t ea
= j
->second
->get_peer_addr();
10860 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
10861 if (get_osdmap()->is_blacklisted(ea
)) {
10862 dout(10) << "watch: Found blacklisted watcher for " << ea
<< dendl
;
10863 ceph_assert(j
->second
->get_pg() == this);
10864 j
->second
->unregister_cb();
10865 handle_watch_timeout(j
->second
);
10870 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
10872 ceph_assert(is_active());
10873 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(obc
->obs
.oi
.soid
);
10874 ceph_assert((recovering
.count(obc
->obs
.oi
.soid
) ||
10875 !is_missing_object(obc
->obs
.oi
.soid
)) ||
10876 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() && // or this is a revert... see recover_primary()
10877 it_objects
->second
->op
==
10878 pg_log_entry_t::LOST_REVERT
&&
10879 it_objects
->second
->reverting_to
==
10880 obc
->obs
.oi
.version
));
10882 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
10883 ceph_assert(obc
->watchers
.empty());
10884 // populate unconnected_watchers
10885 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
10886 obc
->obs
.oi
.watchers
.begin();
10887 p
!= obc
->obs
.oi
.watchers
.end();
10889 utime_t expire
= info
.stats
.last_became_active
;
10890 expire
+= p
->second
.timeout_seconds
;
10891 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
10893 Watch::makeWatchRef(
10894 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
10895 p
->first
.second
, p
->second
.addr
));
10896 watch
->disconnect();
10897 obc
->watchers
.insert(
10899 make_pair(p
->first
.first
, p
->first
.second
),
10902 // Look for watchers from blacklisted clients and drop
10903 check_blacklisted_obc_watchers(obc
);
10906 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
10908 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
10909 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
10911 if (!is_active()) {
10912 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
10915 if (!obc
->obs
.exists
) {
10916 dout(10) << __func__
<< " object " << obc
->obs
.oi
.soid
<< " dne" << dendl
;
10919 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
10920 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
10921 watch
->get_delayed_cb()
10923 dout(10) << "handle_watch_timeout waiting for degraded on obj "
10924 << obc
->obs
.oi
.soid
10929 if (write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
10930 dout(10) << "handle_watch_timeout waiting for scrub on obj "
10931 << obc
->obs
.oi
.soid
10933 scrubber
.add_callback(
10934 watch
->get_delayed_cb() // This callback!
10939 OpContextUPtr ctx
= simple_opc_create(obc
);
10940 ctx
->at_version
= get_next_version();
10942 object_info_t
& oi
= ctx
->new_obs
.oi
;
10943 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
10944 watch
->get_entity()));
10946 list
<watch_disconnect_t
> watch_disconnects
= {
10947 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
10949 ctx
->register_on_success(
10950 [this, obc
, watch_disconnects
]() {
10951 complete_disconnect_watches(obc
, watch_disconnects
);
10955 PGTransaction
*t
= ctx
->op_t
.get();
10956 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
10960 osd_reqid_t(), ctx
->mtime
, 0));
10962 oi
.prior_version
= obc
->obs
.oi
.version
;
10963 oi
.version
= ctx
->at_version
;
10965 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
10966 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
10968 // apply new object state.
10969 ctx
->obc
->obs
= ctx
->new_obs
;
10971 // no ctx->delta_stats
10972 simple_opc_submit(std::move(ctx
));
10975 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
10976 SnapSetContext
*ssc
)
10978 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
10979 ceph_assert(obc
->destructor_callback
== NULL
);
10980 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
10982 obc
->obs
.exists
= false;
10985 register_snapset_context(ssc
);
10986 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
10988 populate_obc_watchers(obc
);
10992 ObjectContextRef
PrimaryLogPG::get_object_context(
10993 const hobject_t
& soid
,
10995 const map
<string
, bufferlist
> *attrs
)
10997 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(soid
);
10999 attrs
|| !recovery_state
.get_pg_log().get_missing().is_missing(soid
) ||
11000 // or this is a revert... see recover_primary()
11001 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() &&
11002 it_objects
->second
->op
==
11003 pg_log_entry_t::LOST_REVERT
));
11004 ObjectContextRef obc
= object_contexts
.lookup(soid
);
11005 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
11007 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
11008 dout(10) << __func__
<< ": found obc in cache: " << obc
11011 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
11015 auto it_oi
= attrs
->find(OI_ATTR
);
11016 ceph_assert(it_oi
!= attrs
->end());
11017 bv
= it_oi
->second
;
11019 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
11022 dout(10) << __func__
<< ": no obc for soid "
11023 << soid
<< " and !can_create"
11025 return ObjectContextRef(); // -ENOENT!
11028 dout(10) << __func__
<< ": no obc for soid "
11029 << soid
<< " but can_create"
11032 object_info_t
oi(soid
);
11033 SnapSetContext
*ssc
= get_snapset_context(
11034 soid
, true, 0, false);
11036 obc
= create_object_context(oi
, ssc
);
11037 dout(10) << __func__
<< ": " << obc
<< " " << soid
11038 << " " << obc
->rwstate
11039 << " oi: " << obc
->obs
.oi
11040 << " ssc: " << obc
->ssc
11041 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11048 bufferlist::const_iterator bliter
= bv
.begin();
11049 decode(oi
, bliter
);
11051 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
11052 return ObjectContextRef(); // -ENOENT!
11055 ceph_assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
11057 obc
= object_contexts
.lookup_or_create(oi
.soid
);
11058 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11060 obc
->obs
.exists
= true;
11062 obc
->ssc
= get_snapset_context(
11064 soid
.has_snapset() ? attrs
: 0);
11067 populate_obc_watchers(obc
);
11069 if (pool
.info
.is_erasure()) {
11071 obc
->attr_cache
= *attrs
;
11073 int r
= pgbackend
->objects_get_attrs(
11076 ceph_assert(r
== 0);
11080 dout(10) << __func__
<< ": creating obc from disk: " << obc
11084 // XXX: Caller doesn't expect this
11085 if (obc
->ssc
== NULL
) {
11086 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
11087 return ObjectContextRef(); // -ENOENT!
11090 dout(10) << __func__
<< ": " << obc
<< " " << soid
11091 << " " << obc
->rwstate
11092 << " oi: " << obc
->obs
.oi
11093 << " exists: " << (int)obc
->obs
.exists
11094 << " ssc: " << obc
->ssc
11095 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11099 void PrimaryLogPG::context_registry_on_change()
11101 pair
<hobject_t
, ObjectContextRef
> i
;
11102 while (object_contexts
.get_next(i
.first
, &i
)) {
11103 ObjectContextRef
obc(i
.second
);
11105 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11106 obc
->watchers
.begin();
11107 j
!= obc
->watchers
.end();
11108 obc
->watchers
.erase(j
++)) {
11109 j
->second
->discard();
11117 * If we return an error, and set *pmissing, then promoting that
11120 * If we return -EAGAIN, we will always set *pmissing to the missing
11121 * object to wait for.
11123 * If we return an error but do not set *pmissing, then we know the
11124 * object does not exist.
11126 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
11127 ObjectContextRef
*pobc
,
11129 bool map_snapid_to_clone
,
11130 hobject_t
*pmissing
)
11133 ceph_assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
11135 if (oid
.snap
== CEPH_NOSNAP
) {
11136 ObjectContextRef obc
= get_object_context(oid
, can_create
);
11142 dout(10) << __func__
<< " " << oid
11143 << " @" << oid
.snap
11144 << " oi=" << obc
->obs
.oi
11153 hobject_t head
= oid
.get_head();
11154 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
11155 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
11156 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
11158 *pmissing
= head
; // start by getting the head
11160 put_snapset_context(ssc
);
11164 if (map_snapid_to_clone
) {
11165 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11166 << " snapset " << ssc
->snapset
11167 << " map_snapid_to_clone=true" << dendl
;
11168 if (oid
.snap
> ssc
->snapset
.seq
) {
11169 // already must be readable
11170 ObjectContextRef obc
= get_object_context(head
, false);
11171 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11172 << " snapset " << ssc
->snapset
11173 << " maps to head" << dendl
;
11175 put_snapset_context(ssc
);
11176 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
11178 vector
<snapid_t
>::const_iterator citer
= std::find(
11179 ssc
->snapset
.clones
.begin(),
11180 ssc
->snapset
.clones
.end(),
11182 if (citer
== ssc
->snapset
.clones
.end()) {
11183 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11184 << " snapset " << ssc
->snapset
11185 << " maps to nothing" << dendl
;
11186 put_snapset_context(ssc
);
11190 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11191 << " snapset " << ssc
->snapset
11192 << " maps to " << oid
<< dendl
;
11194 if (recovery_state
.get_pg_log().get_missing().is_missing(oid
)) {
11195 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11196 << " snapset " << ssc
->snapset
11197 << " " << oid
<< " is missing" << dendl
;
11200 put_snapset_context(ssc
);
11204 ObjectContextRef obc
= get_object_context(oid
, false);
11205 if (!obc
|| !obc
->obs
.exists
) {
11206 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11207 << " snapset " << ssc
->snapset
11208 << " " << oid
<< " is not present" << dendl
;
11211 put_snapset_context(ssc
);
11214 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11215 << " snapset " << ssc
->snapset
11216 << " " << oid
<< " HIT" << dendl
;
11218 put_snapset_context(ssc
);
11221 ceph_abort(); //unreachable
11224 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11225 << " snapset " << ssc
->snapset
<< dendl
;
11228 if (oid
.snap
> ssc
->snapset
.seq
) {
11229 ObjectContextRef obc
= get_object_context(head
, false);
11230 dout(10) << __func__
<< " " << head
11231 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
11232 << " -- HIT " << obc
->obs
11237 ceph_assert(ssc
== obc
->ssc
);
11238 put_snapset_context(ssc
);
11244 // which clone would it be?
11246 while (k
< ssc
->snapset
.clones
.size() &&
11247 ssc
->snapset
.clones
[k
] < oid
.snap
)
11249 if (k
== ssc
->snapset
.clones
.size()) {
11250 dout(10) << __func__
<< " no clones with last >= oid.snap "
11251 << oid
.snap
<< " -- DNE" << dendl
;
11252 put_snapset_context(ssc
);
11255 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
11256 info
.pgid
.pool(), oid
.get_namespace());
11258 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
11259 dout(20) << __func__
<< " " << soid
<< " missing, try again later"
11263 put_snapset_context(ssc
);
11267 ObjectContextRef obc
= get_object_context(soid
, false);
11268 if (!obc
|| !obc
->obs
.exists
) {
11271 put_snapset_context(ssc
);
11272 if (is_primary()) {
11273 if (is_degraded_or_backfilling_object(soid
)) {
11274 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
11276 } else if (is_degraded_on_async_recovery_target(soid
)) {
11277 dout(20) << __func__
<< " clone is recovering " << soid
<< dendl
;
11280 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
11284 dout(20) << __func__
<< " replica missing clone" << soid
<< dendl
;
11292 ceph_assert(obc
->ssc
== ssc
);
11293 put_snapset_context(ssc
);
11298 dout(20) << __func__
<< " " << soid
11299 << " snapset " << obc
->ssc
->snapset
11301 snapid_t first
, last
;
11302 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
11303 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
11304 if (p
->second
.empty()) {
11305 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
11306 ceph_assert(!cct
->_conf
->osd_debug_verify_snaps
);
11309 if (std::find(p
->second
.begin(), p
->second
.end(), oid
.snap
) ==
11311 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
11312 << " does not contain " << oid
.snap
<< " -- DNE" << dendl
;
11315 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), oid
.snap
)) {
11316 dout(20) << __func__
<< " " << soid
<< " snap " << oid
.snap
11317 << " in removed_snaps_queue" << " -- DNE" << dendl
;
11320 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
11321 << " contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
11326 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
11329 put_snapset_context(obc
->ssc
);
11332 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
11334 object_info_t
& oi
= obc
->obs
.oi
;
11336 dout(10) << __func__
<< " " << oi
.soid
<< dendl
;
11337 ceph_assert(!oi
.soid
.is_snapdir());
11339 object_stat_sum_t stat
;
11340 stat
.num_objects
++;
11342 stat
.num_objects_dirty
++;
11343 if (oi
.is_whiteout())
11344 stat
.num_whiteouts
++;
11346 stat
.num_objects_omap
++;
11347 if (oi
.is_cache_pinned())
11348 stat
.num_objects_pinned
++;
11349 if (oi
.has_manifest())
11350 stat
.num_objects_manifest
++;
11352 if (oi
.soid
.is_snap()) {
11353 stat
.num_object_clones
++;
11356 obc
->ssc
= get_snapset_context(oi
.soid
, false);
11357 ceph_assert(obc
->ssc
);
11358 stat
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(oi
.soid
.snap
);
11360 stat
.num_bytes
+= oi
.size
;
11364 pgstat
->stats
.sum
.add(stat
);
11367 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
11369 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
11370 if (obc
->is_blocked()) {
11371 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
11375 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
11376 if (p
!= waiting_for_blocked_object
.end()) {
11377 list
<OpRequestRef
>& ls
= p
->second
;
11378 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
11380 waiting_for_blocked_object
.erase(p
);
11383 map
<hobject_t
, ObjectContextRef
>::iterator i
=
11384 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
11385 if (i
!= objects_blocked_on_snap_promotion
.end()) {
11386 ceph_assert(i
->second
== obc
);
11387 objects_blocked_on_snap_promotion
.erase(i
);
11390 if (obc
->requeue_scrub_on_unblock
) {
11391 obc
->requeue_scrub_on_unblock
= false;
11392 // only requeue if we are still active: we may be unblocking
11393 // because we are resetting for a new peering interval
11400 SnapSetContext
*PrimaryLogPG::get_snapset_context(
11401 const hobject_t
& oid
,
11403 const map
<string
, bufferlist
> *attrs
,
11406 std::lock_guard
l(snapset_contexts_lock
);
11407 SnapSetContext
*ssc
;
11408 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
11409 oid
.get_snapdir());
11410 if (p
!= snapset_contexts
.end()) {
11411 if (can_create
|| p
->second
->exists
) {
11420 if (!(oid
.is_head() && !oid_existed
)) {
11421 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
11423 if (r
< 0 && !can_create
)
11426 auto it_ss
= attrs
->find(SS_ATTR
);
11427 ceph_assert(it_ss
!= attrs
->end());
11428 bv
= it_ss
->second
;
11430 ssc
= new SnapSetContext(oid
.get_snapdir());
11431 _register_snapset_context(ssc
);
11433 bufferlist::const_iterator bvp
= bv
.begin();
11435 ssc
->snapset
.decode(bvp
);
11436 } catch (buffer::error
& e
) {
11437 dout(0) << __func__
<< " Can't decode snapset: " << e
<< dendl
;
11440 ssc
->exists
= true;
11442 ssc
->exists
= false;
11450 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
11452 std::lock_guard
l(snapset_contexts_lock
);
11454 if (ssc
->ref
== 0) {
11455 if (ssc
->registered
)
11456 snapset_contexts
.erase(ssc
->oid
);
/*
 * recover_missing() return values:
 *  PULL_NONE - didn't pull anything
 *  PULL_YES  - pulled what the caller wanted
 *  PULL_HEAD - needed to pull head first
 */
enum { PULL_NONE, PULL_HEAD, PULL_YES };
11469 int PrimaryLogPG::recover_missing(
11470 const hobject_t
&soid
, eversion_t v
,
11472 PGBackend::RecoveryHandle
*h
)
11474 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
11475 dout(7) << __func__
<< " " << soid
11477 << " but it is unfound" << dendl
;
11481 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
11482 start_recovery_op(soid
);
11483 ceph_assert(!recovering
.count(soid
));
11484 recovering
.insert(make_pair(soid
, ObjectContextRef()));
11485 epoch_t cur_epoch
= get_osdmap_epoch();
11486 remove_missing_object(soid
, v
, new LambdaContext(
11488 std::scoped_lock locker
{*this};
11489 if (!pg_has_reset_since(cur_epoch
)) {
11490 bool object_missing
= false;
11491 for (const auto& shard
: get_acting_recovery_backfill()) {
11492 if (shard
== pg_whoami
)
11494 if (recovery_state
.get_peer_missing(shard
).is_missing(soid
)) {
11495 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
11496 object_missing
= true;
11500 if (!object_missing
) {
11501 object_stat_sum_t stat_diff
;
11502 stat_diff
.num_objects_recovered
= 1;
11503 if (scrub_after_recovery
)
11504 stat_diff
.num_objects_repaired
= 1;
11505 on_global_recover(soid
, stat_diff
, true);
11507 auto recovery_handle
= pgbackend
->open_recovery_op();
11508 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
11509 pgbackend
->run_recovery_op(recovery_handle
, priority
);
11516 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
11517 ObjectContextRef obc
;
11518 ObjectContextRef head_obc
;
11519 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
11520 // do we have the head?
11521 hobject_t head
= soid
.get_head();
11522 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
11523 if (recovering
.count(head
)) {
11524 dout(10) << " missing but already recovering head " << head
<< dendl
;
11527 int r
= recover_missing(
11528 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
, priority
,
11530 if (r
!= PULL_NONE
)
11535 head_obc
= get_object_context(
11539 ceph_assert(head_obc
);
11541 start_recovery_op(soid
);
11542 ceph_assert(!recovering
.count(soid
));
11543 recovering
.insert(make_pair(soid
, obc
));
11544 int r
= pgbackend
->recover_object(
11550 // This is only a pull which shouldn't return an error
11551 ceph_assert(r
>= 0);
11555 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
11556 eversion_t v
, Context
*on_complete
)
11558 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
11559 ceph_assert(on_complete
!= nullptr);
11561 ObjectStore::Transaction t
;
11562 remove_snap_mapped_object(t
, soid
);
11564 ObjectRecoveryInfo recovery_info
;
11565 recovery_info
.soid
= soid
;
11566 recovery_info
.version
= v
;
11568 epoch_t cur_epoch
= get_osdmap_epoch();
11569 t
.register_on_complete(new LambdaContext(
11571 std::unique_lock locker
{*this};
11572 if (!pg_has_reset_since(cur_epoch
)) {
11573 ObjectStore::Transaction t2
;
11574 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
11575 t2
.register_on_complete(on_complete
);
11576 int r
= osd
->store
->queue_transaction(ch
, std::move(t2
), nullptr);
11577 ceph_assert(r
== 0);
11581 on_complete
->complete(-EAGAIN
);
11584 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
11585 ceph_assert(r
== 0);
11588 void PrimaryLogPG::finish_degraded_object(const hobject_t oid
)
11590 dout(10) << __func__
<< " " << oid
<< dendl
;
11591 if (callbacks_for_degraded_object
.count(oid
)) {
11592 list
<Context
*> contexts
;
11593 contexts
.swap(callbacks_for_degraded_object
[oid
]);
11594 callbacks_for_degraded_object
.erase(oid
);
11595 for (list
<Context
*>::iterator i
= contexts
.begin();
11596 i
!= contexts
.end();
11601 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
11603 if (i
!= objects_blocked_on_degraded_snap
.end() &&
11604 i
->second
== oid
.snap
)
11605 objects_blocked_on_degraded_snap
.erase(i
);
11608 void PrimaryLogPG::_committed_pushed_object(
11609 epoch_t epoch
, eversion_t last_complete
)
11611 std::scoped_lock locker
{*this};
11612 if (!pg_has_reset_since(epoch
)) {
11613 recovery_state
.recovery_committed_to(last_complete
);
11615 dout(10) << __func__
11616 << " pg has changed, not touching last_complete_ondisk" << dendl
;
11620 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
11622 dout(20) << __func__
<< dendl
;
11624 dout(20) << "obc = " << *obc
<< dendl
;
11626 ceph_assert(active_pushes
>= 1);
11629 // requeue an active chunky scrub waiting on recovery ops
11630 if (!recovery_state
.is_deleting() && active_pushes
== 0
11631 && scrubber
.is_chunky_scrub_active()) {
11632 requeue_scrub(ops_blocked_by_scrub());
// Replica-side analogue of _applied_recovered_object: account for a completed
// push and, if a chunky replica scrub was blocked behind recovery, requeue
// its MOSDRepScrub through the op scheduler.
// NOTE(review): this listing is corrupted -- the original's lines 11637,
// 11640-11641 and 11647-11648 (braces, the active_pushes decrement and the
// start of the enqueue call) are missing here; consult the full source.
11636 void PrimaryLogPG::_applied_recovered_object_replica()
11638 dout(20) << __func__
<< dendl
;
// there must be at least the push we are completing
11639 ceph_assert(active_pushes
>= 1);
11642 // requeue an active chunky scrub waiting on recovery ops
11643 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
11644 scrubber
.active_rep_scrub
&& static_cast<const MOSDRepScrub
*>(
11645 scrubber
.active_rep_scrub
->get_req())->chunky
) {
11646 auto& op
= scrubber
.active_rep_scrub
;
// re-enqueue the rep-scrub request preserving its original cost, priority,
// receive stamp and source so scheduling fairness is unaffected
11649 unique_ptr
<OpSchedulerItem::OpQueueable
>(new PGOpItem(info
.pgid
, op
)),
11650 op
->get_req()->get_cost(),
11651 op
->get_req()->get_priority(),
11652 op
->get_req()->get_recv_stamp(),
11653 op
->get_req()->get_source().num(),
11654 get_osdmap_epoch()));
// the scrubber no longer owns the request once it is requeued
11655 scrubber
.active_rep_scrub
.reset();
11659 void PrimaryLogPG::on_failed_pull(
11660 const set
<pg_shard_t
> &from
,
11661 const hobject_t
&soid
,
11662 const eversion_t
&v
)
11664 dout(20) << __func__
<< ": " << soid
<< dendl
;
11665 ceph_assert(recovering
.count(soid
));
11666 auto obc
= recovering
[soid
];
11668 list
<OpRequestRef
> blocked_ops
;
11669 obc
->drop_recovery_read(&blocked_ops
);
11670 requeue_ops(blocked_ops
);
11672 recovering
.erase(soid
);
11673 for (auto&& i
: from
) {
11674 if (i
!= pg_whoami
) { // we'll get it below in primary_error
11675 recovery_state
.force_object_missing(i
, soid
, v
);
11679 dout(0) << __func__
<< " " << soid
<< " from shard " << from
11680 << ", reps on " << recovery_state
.get_missing_loc().get_locations(soid
)
11681 << " unfound? " << recovery_state
.get_missing_loc().is_unfound(soid
)
11683 finish_recovery_op(soid
); // close out this attempt,
11684 finish_degraded_object(soid
);
11686 if (from
.count(pg_whoami
)) {
11687 dout(0) << " primary missing oid " << soid
<< " version " << v
<< dendl
;
11688 primary_error(soid
, v
);
11689 backfills_in_flight
.erase(soid
);
11693 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
11696 pg_missing_item pmi
;
11697 bool is_missing
= recovery_state
.get_pg_log().get_missing().is_missing(oid
, &pmi
);
11698 ceph_assert(is_missing
);
11700 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
11702 ceph_assert(!get_acting_recovery_backfill().empty());
11703 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
11704 i
!= get_acting_recovery_backfill().end();
11706 if (*i
== get_primary()) continue;
11707 pg_shard_t peer
= *i
;
11708 if (!recovery_state
.get_peer_missing(peer
).is_missing(oid
)) {
11711 eversion_t h
= recovery_state
.get_peer_missing(peer
).get_items().at(oid
).have
;
11712 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
11717 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
11721 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
11723 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
11725 ceph_assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
11726 ObjectStore::Transaction t
;
11727 std::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
11728 if (m
->pg_trim_to
!= eversion_t())
11729 op_trim_to
= m
->pg_trim_to
;
11730 if (m
->pg_roll_forward_to
!= eversion_t())
11731 op_roll_forward_to
= m
->pg_roll_forward_to
;
11733 dout(20) << __func__
11734 << " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
11736 recovery_state
.append_log_entries_update_missing(
11737 m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
11738 eversion_t new_lcod
= info
.last_complete
;
11740 Context
*complete
= new LambdaContext(
11742 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
11744 std::scoped_lock locker
{*this};
11745 if (!pg_has_reset_since(msg
->get_epoch())) {
11746 update_last_complete_ondisk(new_lcod
);
11747 MOSDPGUpdateLogMissingReply
*reply
=
11748 new MOSDPGUpdateLogMissingReply(
11749 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
11755 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
11756 msg
->get_connection()->send_message(reply
);
11760 if (get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
11761 t
.register_on_commit(complete
);
11763 /* Hack to work around the fact that ReplicatedBackend sends
11764 * ack+commit if commit happens first
11766 * This behavior is no longer necessary, but we preserve it so old
11767 * primaries can keep their repops in order */
11768 if (pool
.info
.is_erasure()) {
11769 t
.register_on_complete(complete
);
11771 t
.register_on_commit(complete
);
11774 int tr
= osd
->store
->queue_transaction(
11778 ceph_assert(tr
== 0);
11779 op_applied(info
.last_update
);
11782 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
11784 const MOSDPGUpdateLogMissingReply
*m
=
11785 static_cast<const MOSDPGUpdateLogMissingReply
*>(
11787 dout(20) << __func__
<< " got reply from "
11788 << m
->get_from() << dendl
;
11790 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
11791 if (it
!= log_entry_update_waiting_on
.end()) {
11792 if (it
->second
.waiting_on
.count(m
->get_from())) {
11793 it
->second
.waiting_on
.erase(m
->get_from());
11794 if (m
->last_complete_ondisk
!= eversion_t()) {
11795 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
11799 << info
.pgid
<< " got reply "
11800 << *m
<< " from shard we are not waiting for "
11804 if (it
->second
.waiting_on
.empty()) {
11805 repop_all_committed(it
->second
.repop
.get());
11806 log_entry_update_waiting_on
.erase(it
);
11810 << info
.pgid
<< " got reply "
11811 << *m
<< " on unknown tid " << m
->get_tid();
11815 /* Mark all unfound objects as lost.
11817 void PrimaryLogPG::mark_all_unfound_lost(
11819 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
11821 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
11822 list
<hobject_t
> oids
;
11824 dout(30) << __func__
<< ": log before:\n";
11825 recovery_state
.get_pg_log().get_log().print(*_dout
);
11828 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
11830 utime_t mtime
= ceph_clock_now();
11831 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
11832 recovery_state
.get_missing_loc().get_needs_recovery().begin();
11833 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
11834 recovery_state
.get_missing_loc().get_needs_recovery().end();
11836 ObcLockManager manager
;
11837 eversion_t v
= get_next_version();
11838 v
.epoch
= get_osdmap_epoch();
11839 uint64_t num_unfound
= recovery_state
.get_missing_loc().num_unfound();
11840 while (m
!= mend
) {
11841 const hobject_t
&oid(m
->first
);
11842 if (!recovery_state
.get_missing_loc().is_unfound(oid
)) {
11843 // We only care about unfound objects
11848 ObjectContextRef obc
;
11852 case pg_log_entry_t::LOST_MARK
:
11853 ceph_abort_msg("actually, not implemented yet!");
11856 case pg_log_entry_t::LOST_REVERT
:
11857 prev
= pick_newest_available(oid
);
11858 if (prev
> eversion_t()) {
11861 pg_log_entry_t::LOST_REVERT
, oid
, v
,
11862 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
11863 e
.reverting_to
= prev
;
11864 e
.mark_unrollbackable();
11865 log_entries
.push_back(e
);
11866 dout(10) << e
<< dendl
;
11868 // we are now missing the new version; recovery code will sort it out.
11874 case pg_log_entry_t::LOST_DELETE
:
11876 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
11877 0, osd_reqid_t(), mtime
, 0);
11878 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11879 if (pool
.info
.require_rollback()) {
11880 e
.mod_desc
.try_rmobject(v
.version
);
11882 e
.mark_unrollbackable();
11884 } // otherwise, just do what we used to do
11885 dout(10) << e
<< dendl
;
11886 log_entries
.push_back(e
);
11887 oids
.push_back(oid
);
11889 // If context found mark object as deleted in case
11890 // of racing with new creation. This can happen if
11891 // object lost and EIO at primary.
11892 obc
= object_contexts
.lookup(oid
);
11894 obc
->obs
.exists
= false;
11906 recovery_state
.update_stats(
11907 [](auto &history
, auto &stats
) {
11908 stats
.stats_invalid
= true;
11912 submit_log_entries(
11914 std::move(manager
),
11915 std::optional
<std::function
<void(void)> >(
11916 [this, oids
, num_unfound
, on_finish
]() {
11917 if (recovery_state
.perform_deletes_during_peering()) {
11918 for (auto oid
: oids
) {
11919 // clear old locations - merge_new_log_entries will have
11920 // handled rebuilding missing_loc for each of these
11921 // objects if we have the RECOVERY_DELETES flag
11922 recovery_state
.object_recovered(oid
, object_stat_sum_t());
11926 if (is_recovery_unfound()) {
11927 queue_peering_event(
11929 std::make_shared
<PGPeeringEvent
>(
11930 get_osdmap_epoch(),
11931 get_osdmap_epoch(),
11932 PeeringState::DoRecovery())));
11933 } else if (is_backfill_unfound()) {
11934 queue_peering_event(
11936 std::make_shared
<PGPeeringEvent
>(
11937 get_osdmap_epoch(),
11938 get_osdmap_epoch(),
11939 PeeringState::RequestBackfill())));
11945 ss
<< "pg has " << num_unfound
11946 << " objects unfound and apparently lost marking";
11947 string rs
= ss
.str();
11948 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
11949 osd
->clog
->info() << rs
;
11951 on_finish(0, rs
, empty
);
11956 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
11958 ceph_assert(repop_queue
.empty());
11962 * pg status change notification
11965 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
11967 list
<OpRequestRef
> rq
;
11969 // apply all repops
11970 while (!repop_queue
.empty()) {
11971 RepGather
*repop
= repop_queue
.front();
11972 repop_queue
.pop_front();
11973 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
11974 repop
->rep_aborted
= true;
11975 repop
->on_committed
.clear();
11976 repop
->on_success
.clear();
11980 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
11981 rq
.push_back(repop
->op
);
11982 repop
->op
= OpRequestRef();
11985 // also requeue any dups, interleaved into position
11986 auto p
= waiting_for_ondisk
.find(repop
->v
);
11987 if (p
!= waiting_for_ondisk
.end()) {
11988 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
11989 for (auto& i
: p
->second
) {
11990 rq
.push_back(std::get
<0>(i
));
11992 waiting_for_ondisk
.erase(p
);
11996 remove_repop(repop
);
11999 ceph_assert(repop_queue
.empty());
12003 if (!waiting_for_ondisk
.empty()) {
12004 for (auto& i
: waiting_for_ondisk
) {
12005 for (auto& j
: i
.second
) {
12006 derr
<< __func__
<< ": op " << *(std::get
<0>(j
)->get_req())
12007 << " waiting on " << i
.first
<< dendl
;
12010 ceph_assert(waiting_for_ondisk
.empty());
12014 waiting_for_ondisk
.clear();
12017 void PrimaryLogPG::on_flushed()
12019 requeue_ops(waiting_for_flush
);
12020 if (!is_peered() || !is_primary()) {
12021 pair
<hobject_t
, ObjectContextRef
> i
;
12022 while (object_contexts
.get_next(i
.first
, &i
)) {
12023 derr
<< __func__
<< ": object " << i
.first
<< " obc still alive" << dendl
;
12025 ceph_assert(object_contexts
.empty());
// PG-removal hook: arranges for the next chunk of deletion work
// (C_DeleteMore) to be queued once the removal transaction commits.
// NOTE(review): this listing is corrupted -- the original's lines 12030 and
// 12032-12034 (the opening brace plus any shutdown/cleanup performed before
// registering the commit callback) are missing here; consult the full source.
12029 void PrimaryLogPG::on_removal(ObjectStore::Transaction
&t
)
12031 dout(10) << __func__
<< dendl
;
12035 t
.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12038 void PrimaryLogPG::clear_async_reads()
12040 dout(10) << __func__
<< dendl
;
12041 for(auto& i
: in_progress_async_reads
) {
12042 dout(10) << "clear ctx: "
12043 << "OpRequestRef " << i
.first
12044 << " OpContext " << i
.second
12046 close_op_ctx(i
.second
);
12050 void PrimaryLogPG::clear_cache()
12052 object_contexts
.clear();
12055 void PrimaryLogPG::on_shutdown()
12057 dout(10) << __func__
<< dendl
;
12059 if (recovery_queued
) {
12060 recovery_queued
= false;
12061 osd
->clear_queued_recovery(this);
12064 clear_scrub_reserved();
12065 scrub_clear_state();
12067 unreg_next_scrub();
12069 vector
<ceph_tid_t
> tids
;
12070 cancel_copy_ops(false, &tids
);
12071 cancel_flush_ops(false, &tids
);
12072 cancel_proxy_ops(false, &tids
);
12073 cancel_manifest_ops(false, &tids
);
12074 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12076 apply_and_flush_repops(false);
12077 cancel_log_updates();
12078 // we must remove PGRefs, so do this this prior to release_backoffs() callers
12080 // clean up snap trim references
12081 snap_trimmer_machine
.process_event(Reset());
12083 pgbackend
->on_change();
12085 context_registry_on_change();
12086 object_contexts
.clear();
12088 clear_async_reads();
12090 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
12091 osd
->local_reserver
.cancel_reservation(info
.pgid
);
12093 clear_primary_state();
12096 if (is_primary()) {
12097 osd
->clear_ready_to_merge(this);
12101 void PrimaryLogPG::on_activate_complete()
12105 if (!recovery_state
.needs_flush()) {
12106 requeue_ops(waiting_for_peered
);
12107 } else if (!waiting_for_peered
.empty()) {
12108 dout(10) << __func__
<< " flushes in progress, moving "
12109 << waiting_for_peered
.size()
12110 << " items to waiting_for_flush"
12112 ceph_assert(waiting_for_flush
.empty());
12113 waiting_for_flush
.swap(waiting_for_peered
);
12118 if (needs_recovery()) {
12119 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
12120 queue_peering_event(
12122 std::make_shared
<PGPeeringEvent
>(
12123 get_osdmap_epoch(),
12124 get_osdmap_epoch(),
12125 PeeringState::DoRecovery())));
12126 } else if (needs_backfill()) {
12127 dout(10) << "activate queueing backfill" << dendl
;
12128 queue_peering_event(
12130 std::make_shared
<PGPeeringEvent
>(
12131 get_osdmap_epoch(),
12132 get_osdmap_epoch(),
12133 PeeringState::RequestBackfill())));
12135 dout(10) << "activate all replicas clean, no recovery" << dendl
;
12136 eio_errors_to_process
= false;
12137 queue_peering_event(
12139 std::make_shared
<PGPeeringEvent
>(
12140 get_osdmap_epoch(),
12141 get_osdmap_epoch(),
12142 PeeringState::AllReplicasRecovered())));
12145 publish_stats_to_osd();
12147 if (get_backfill_targets().size()) {
12148 last_backfill_started
= earliest_backfill();
12149 new_backfill
= true;
12150 ceph_assert(!last_backfill_started
.is_max());
12151 dout(5) << __func__
<< ": bft=" << get_backfill_targets()
12152 << " from " << last_backfill_started
<< dendl
;
12153 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12154 i
!= get_backfill_targets().end();
12156 dout(5) << "target shard " << *i
12157 << " from " << recovery_state
.get_peer_info(*i
).last_backfill
12166 void PrimaryLogPG::on_change(ObjectStore::Transaction
&t
)
12168 dout(10) << __func__
<< dendl
;
12170 if (hit_set
&& hit_set
->insert_count() == 0) {
12171 dout(20) << " discarding empty hit_set" << dendl
;
12175 if (recovery_queued
) {
12176 recovery_queued
= false;
12177 osd
->clear_queued_recovery(this);
12180 // requeue everything in the reverse order they should be
12182 requeue_ops(waiting_for_peered
);
12183 requeue_ops(waiting_for_flush
);
12184 requeue_ops(waiting_for_active
);
12185 requeue_ops(waiting_for_readable
);
12187 clear_scrub_reserved();
12189 vector
<ceph_tid_t
> tids
;
12190 cancel_copy_ops(is_primary(), &tids
);
12191 cancel_flush_ops(is_primary(), &tids
);
12192 cancel_proxy_ops(is_primary(), &tids
);
12193 cancel_manifest_ops(is_primary(), &tids
);
12194 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12196 // requeue object waiters
12197 for (auto& p
: waiting_for_unreadable_object
) {
12198 release_backoffs(p
.first
);
12200 if (is_primary()) {
12201 requeue_object_waiters(waiting_for_unreadable_object
);
12203 waiting_for_unreadable_object
.clear();
12205 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
12206 p
!= waiting_for_degraded_object
.end();
12207 waiting_for_degraded_object
.erase(p
++)) {
12208 release_backoffs(p
->first
);
12210 requeue_ops(p
->second
);
12213 finish_degraded_object(p
->first
);
12216 // requeues waiting_for_scrub
12217 scrub_clear_state();
12219 for (auto p
= waiting_for_blocked_object
.begin();
12220 p
!= waiting_for_blocked_object
.end();
12221 waiting_for_blocked_object
.erase(p
++)) {
12223 requeue_ops(p
->second
);
12227 for (auto i
= callbacks_for_degraded_object
.begin();
12228 i
!= callbacks_for_degraded_object
.end();
12230 finish_degraded_object((i
++)->first
);
12232 ceph_assert(callbacks_for_degraded_object
.empty());
12234 if (is_primary()) {
12235 requeue_ops(waiting_for_cache_not_full
);
12237 waiting_for_cache_not_full
.clear();
12239 objects_blocked_on_cache_full
.clear();
12241 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
12242 in_progress_async_reads
.begin();
12243 i
!= in_progress_async_reads
.end();
12244 in_progress_async_reads
.erase(i
++)) {
12245 close_op_ctx(i
->second
);
12247 requeue_op(i
->first
);
12250 // this will requeue ops we were working on but didn't finish, and
12252 apply_and_flush_repops(is_primary());
12253 cancel_log_updates();
12255 // do this *after* apply_and_flush_repops so that we catch any newly
12256 // registered watches.
12257 context_registry_on_change();
12259 pgbackend
->on_change_cleanup(&t
);
12260 scrubber
.cleanup_store(&t
);
12261 pgbackend
->on_change();
12263 // clear snap_trimmer state
12264 snap_trimmer_machine
.process_event(Reset());
12266 debug_op_order
.clear();
12267 unstable_stats
.clear();
12269 // we don't want to cache object_contexts through the interval change
12270 // NOTE: we actually assert that all currently live references are dead
12271 // by the time the flush for the next interval completes.
12272 object_contexts
.clear();
12274 // should have been cleared above by finishing all of the degraded objects
12275 ceph_assert(objects_blocked_on_degraded_snap
.empty());
// Role-change hook: a PG that is no longer primary (role != 0) must not keep
// a HitSet in memory.
// NOTE(review): this listing is corrupted -- the original's lines 12279 and
// 12283+ (braces and, presumably, the hit-set teardown call inside the `if`)
// are missing here; consult the full source.
12278 void PrimaryLogPG::plpg_on_role_change()
12280 dout(10) << __func__
<< dendl
;
12281 if (get_role() != 0 && hit_set
) {
12282 dout(10) << " clearing hit set" << dendl
;
12287 void PrimaryLogPG::plpg_on_pool_change()
12289 dout(10) << __func__
<< dendl
;
12290 // requeue cache full waiters just in case the cache_mode is
12291 // changing away from writeback mode. note that if we are not
12292 // active the normal requeuing machinery is sufficient (and properly
12295 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12296 !waiting_for_cache_not_full
.empty()) {
12297 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
12299 requeue_ops(waiting_for_cache_not_full
);
12300 objects_blocked_on_cache_full
.clear();
12306 // clear state. called on recovery completion AND cancellation.
12307 void PrimaryLogPG::_clear_recovery_state()
12309 #ifdef DEBUG_RECOVERY_OIDS
12310 recovering_oids
.clear();
12312 last_backfill_started
= hobject_t();
12313 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
12314 while (i
!= backfills_in_flight
.end()) {
12315 ceph_assert(recovering
.count(*i
));
12316 backfills_in_flight
.erase(i
++);
12319 list
<OpRequestRef
> blocked_ops
;
12320 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
12321 i
!= recovering
.end();
12322 recovering
.erase(i
++)) {
12324 i
->second
->drop_recovery_read(&blocked_ops
);
12325 requeue_ops(blocked_ops
);
12328 ceph_assert(backfills_in_flight
.empty());
12329 pending_backfill_updates
.clear();
12330 ceph_assert(recovering
.empty());
12331 pgbackend
->clear_recovery_state();
12334 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
12336 dout(20) << __func__
<< ": " << soid
<< dendl
;
12337 ceph_assert(recovering
.count(soid
));
12338 ObjectContextRef obc
= recovering
[soid
];
12340 list
<OpRequestRef
> blocked_ops
;
12341 obc
->drop_recovery_read(&blocked_ops
);
12342 requeue_ops(blocked_ops
);
12344 recovering
.erase(soid
);
12345 finish_recovery_op(soid
);
12346 release_backoffs(soid
);
12347 if (waiting_for_degraded_object
.count(soid
)) {
12348 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
12349 requeue_ops(waiting_for_degraded_object
[soid
]);
12350 waiting_for_degraded_object
.erase(soid
);
12352 if (waiting_for_unreadable_object
.count(soid
)) {
12353 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
12354 requeue_ops(waiting_for_unreadable_object
[soid
]);
12355 waiting_for_unreadable_object
.erase(soid
);
12357 if (is_missing_object(soid
))
12358 recovery_state
.set_last_requested(0);
12359 finish_degraded_object(soid
);
12362 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
12364 pgbackend
->check_recovery_sources(osdmap
);
12367 bool PrimaryLogPG::start_recovery_ops(
12369 ThreadPool::TPHandle
&handle
,
12370 uint64_t *ops_started
)
12372 uint64_t& started
= *ops_started
;
12374 bool work_in_progress
= false;
12375 bool recovery_started
= false;
12376 ceph_assert(is_primary());
12377 ceph_assert(is_peered());
12378 ceph_assert(!recovery_state
.is_deleting());
12380 ceph_assert(recovery_queued
);
12381 recovery_queued
= false;
12383 if (!state_test(PG_STATE_RECOVERING
) &&
12384 !state_test(PG_STATE_BACKFILLING
)) {
12385 /* TODO: I think this case is broken and will make do_recovery()
12386 * unhappy since we're returning false */
12387 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
12388 return have_unfound();
12391 const auto &missing
= recovery_state
.get_pg_log().get_missing();
12393 uint64_t num_unfound
= get_num_unfound();
12395 if (!recovery_state
.have_missing()) {
12396 recovery_state
.local_recovery_complete();
12399 if (!missing
.have_missing() || // Primary does not have missing
12400 // or all of the missing objects are unfound.
12401 recovery_state
.all_missing_unfound()) {
12402 // Recover the replicas.
12403 started
= recover_replicas(max
, handle
, &recovery_started
);
12406 // We still have missing objects that we should grab from replicas.
12407 started
+= recover_primary(max
, handle
);
12409 if (!started
&& num_unfound
!= get_num_unfound()) {
12410 // second chance to recovery replicas
12411 started
= recover_replicas(max
, handle
, &recovery_started
);
12414 if (started
|| recovery_started
)
12415 work_in_progress
= true;
12417 bool deferred_backfill
= false;
12418 if (recovering
.empty() &&
12419 state_test(PG_STATE_BACKFILLING
) &&
12420 !get_backfill_targets().empty() && started
< max
&&
12421 missing
.num_missing() == 0 &&
12422 waiting_on_backfill
.empty()) {
12423 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
12424 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
12425 deferred_backfill
= true;
12426 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
12428 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
12429 deferred_backfill
= true;
12430 } else if (!recovery_state
.is_backfill_reserved()) {
12431 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
12432 if (!backfill_reserving
) {
12433 dout(10) << "queueing RequestBackfill" << dendl
;
12434 backfill_reserving
= true;
12435 queue_peering_event(
12437 std::make_shared
<PGPeeringEvent
>(
12438 get_osdmap_epoch(),
12439 get_osdmap_epoch(),
12440 PeeringState::RequestBackfill())));
12442 deferred_backfill
= true;
12444 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
12448 dout(10) << " started " << started
<< dendl
;
12449 osd
->logger
->inc(l_osd_rop
, started
);
12451 if (!recovering
.empty() ||
12452 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
12453 return !work_in_progress
&& have_unfound();
12455 ceph_assert(recovering
.empty());
12456 ceph_assert(recovery_ops_active
== 0);
12458 dout(10) << __func__
<< " needs_recovery: "
12459 << recovery_state
.get_missing_loc().get_needs_recovery()
12461 dout(10) << __func__
<< " missing_loc: "
12462 << recovery_state
.get_missing_loc().get_missing_locs()
12464 int unfound
= get_num_unfound();
12466 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
12470 if (missing
.num_missing() > 0) {
12471 // this shouldn't happen!
12472 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
12473 << missing
.num_missing() << ": " << missing
.get_items();
12477 if (needs_recovery()) {
12478 // this shouldn't happen!
12479 // We already checked num_missing() so we must have missing replicas
12480 osd
->clog
->error() << info
.pgid
12481 << " Unexpected Error: recovery ending with missing replicas";
12485 if (state_test(PG_STATE_RECOVERING
)) {
12486 state_clear(PG_STATE_RECOVERING
);
12487 state_clear(PG_STATE_FORCED_RECOVERY
);
12488 if (needs_backfill()) {
12489 dout(10) << "recovery done, queuing backfill" << dendl
;
12490 queue_peering_event(
12492 std::make_shared
<PGPeeringEvent
>(
12493 get_osdmap_epoch(),
12494 get_osdmap_epoch(),
12495 PeeringState::RequestBackfill())));
12497 dout(10) << "recovery done, no backfill" << dendl
;
12498 eio_errors_to_process
= false;
12499 state_clear(PG_STATE_FORCED_BACKFILL
);
12500 queue_peering_event(
12502 std::make_shared
<PGPeeringEvent
>(
12503 get_osdmap_epoch(),
12504 get_osdmap_epoch(),
12505 PeeringState::AllReplicasRecovered())));
12507 } else { // backfilling
12508 state_clear(PG_STATE_BACKFILLING
);
12509 state_clear(PG_STATE_FORCED_BACKFILL
);
12510 state_clear(PG_STATE_FORCED_RECOVERY
);
12511 dout(10) << "recovery done, backfill done" << dendl
;
12512 eio_errors_to_process
= false;
12513 queue_peering_event(
12515 std::make_shared
<PGPeeringEvent
>(
12516 get_osdmap_epoch(),
12517 get_osdmap_epoch(),
12518 PeeringState::Backfilled())));
12525 * do one recovery op.
12526 * return true if done, false if nothing left to do.
12528 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
12530 ceph_assert(is_primary());
12532 const auto &missing
= recovery_state
.get_pg_log().get_missing();
12534 dout(10) << __func__
<< " recovering " << recovering
.size()
12536 << " missing " << missing
<< dendl
;
12538 dout(25) << __func__
<< " " << missing
.get_items() << dendl
;
12541 pg_log_entry_t
*latest
= 0;
12542 unsigned started
= 0;
12545 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
12546 map
<version_t
, hobject_t
>::const_iterator p
=
12547 missing
.get_rmissing().lower_bound(recovery_state
.get_pg_log().get_log().last_requested
);
12548 while (p
!= missing
.get_rmissing().end()) {
12549 handle
.reset_tp_timeout();
12551 version_t v
= p
->first
;
12553 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(p
->second
);
12554 if (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end()) {
12555 latest
= it_objects
->second
;
12556 ceph_assert(latest
->is_update() || latest
->is_delete());
12557 soid
= latest
->soid
;
12562 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
12565 hobject_t head
= soid
.get_head();
12567 eversion_t need
= item
.need
;
12569 dout(10) << __func__
<< " "
12570 << soid
<< " " << item
.need
12571 << (missing
.is_missing(soid
) ? " (missing)":"")
12572 << (missing
.is_missing(head
) ? " (missing head)":"")
12573 << (recovering
.count(soid
) ? " (recovering)":"")
12574 << (recovering
.count(head
) ? " (recovering head)":"")
12578 switch (latest
->op
) {
12579 case pg_log_entry_t::CLONE
:
12581 * Handling for this special case removed for now, until we
12582 * can correctly construct an accurate SnapSet from the old
12587 case pg_log_entry_t::LOST_REVERT
:
12589 if (item
.have
== latest
->reverting_to
) {
12590 ObjectContextRef obc
= get_object_context(soid
, true);
12592 if (obc
->obs
.oi
.version
== latest
->version
) {
12593 // I'm already reverting
12594 dout(10) << " already reverting " << soid
<< dendl
;
12596 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
12597 obc
->obs
.oi
.version
= latest
->version
;
12599 ObjectStore::Transaction t
;
12601 obc
->obs
.oi
.encode(
12603 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
12604 ceph_assert(!pool
.info
.require_rollback());
12605 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
12607 recovery_state
.recover_got(
12615 t
.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
12616 t
.register_on_commit(new C_OSD_CommittedPushedObject(
12618 get_osdmap_epoch(),
12619 info
.last_complete
));
12620 osd
->store
->queue_transaction(ch
, std::move(t
));
12625 * Pull the old version of the object. Update missing_loc here to have the location
12626 * of the version we want.
12628 * This doesn't use the usual missing_loc paths, but that's okay:
12629 * - if we have it locally, we hit the case above, and go from there.
12630 * - if we don't, we always pass through this case during recovery and set up the location
12632 * - this way we don't need to mangle the missing code to be general about needing an old
12635 eversion_t alternate_need
= latest
->reverting_to
;
12636 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
12638 set
<pg_shard_t
> good_peers
;
12639 for (auto p
= recovery_state
.get_peer_missing().begin();
12640 p
!= recovery_state
.get_peer_missing().end();
12642 if (p
->second
.is_missing(soid
, need
) &&
12643 p
->second
.get_items().at(soid
).have
== alternate_need
) {
12644 good_peers
.insert(p
->first
);
12647 recovery_state
.set_revert_with_targets(
12650 dout(10) << " will pull " << alternate_need
<< " or " << need
12652 << recovery_state
.get_missing_loc().get_locations(soid
)
12660 if (!recovering
.count(soid
)) {
12661 if (recovering
.count(head
)) {
12664 int r
= recover_missing(
12665 soid
, need
, get_recovery_op_priority(), h
);
12678 if (started
>= max
)
12683 // only advance last_requested if we haven't skipped anything
12685 recovery_state
.set_last_requested(v
);
12688 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
12692 bool PrimaryLogPG::primary_error(
12693 const hobject_t
& soid
, eversion_t v
)
12695 recovery_state
.force_object_missing(pg_whoami
, soid
, v
);
12696 bool uhoh
= recovery_state
.get_missing_loc().is_unfound(soid
);
12698 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
12699 << soid
<< ", unfound";
12701 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
12703 << ", will try copies on "
12704 << recovery_state
.get_missing_loc().get_locations(soid
);
12708 int PrimaryLogPG::prep_object_replica_deletes(
12709 const hobject_t
& soid
, eversion_t v
,
12710 PGBackend::RecoveryHandle
*h
,
12711 bool *work_started
)
12713 ceph_assert(is_primary());
12714 dout(10) << __func__
<< ": on " << soid
<< dendl
;
12716 ObjectContextRef obc
= get_object_context(soid
, false);
12718 if (!obc
->get_recovery_read()) {
12719 dout(20) << "replica delete delayed on " << soid
12720 << "; could not get rw_manager lock" << dendl
;
12721 *work_started
= true;
12724 dout(20) << "replica delete got recovery read lock on " << soid
12729 start_recovery_op(soid
);
12730 ceph_assert(!recovering
.count(soid
));
12732 recovering
.insert(make_pair(soid
, ObjectContextRef()));
12734 recovering
.insert(make_pair(soid
, obc
));
12736 pgbackend
->recover_delete_object(soid
, v
, h
);
12740 int PrimaryLogPG::prep_object_replica_pushes(
12741 const hobject_t
& soid
, eversion_t v
,
12742 PGBackend::RecoveryHandle
*h
,
12743 bool *work_started
)
12745 ceph_assert(is_primary());
12746 dout(10) << __func__
<< ": on " << soid
<< dendl
;
12748 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
12749 // do we have the head and/or snapdir?
12750 hobject_t head
= soid
.get_head();
12751 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
12752 if (recovering
.count(head
)) {
12753 dout(10) << " missing but already recovering head " << head
<< dendl
;
12756 int r
= recover_missing(
12757 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
,
12758 get_recovery_op_priority(), h
);
12759 if (r
!= PULL_NONE
)
12766 // NOTE: we know we will get a valid oloc off of disk here.
12767 ObjectContextRef obc
= get_object_context(soid
, false);
12769 primary_error(soid
, v
);
12773 if (!obc
->get_recovery_read()) {
12774 dout(20) << "recovery delayed on " << soid
12775 << "; could not get rw_manager lock" << dendl
;
12776 *work_started
= true;
12779 dout(20) << "recovery got recovery read lock on " << soid
12783 start_recovery_op(soid
);
12784 ceph_assert(!recovering
.count(soid
));
12785 recovering
.insert(make_pair(soid
, obc
));
12787 int r
= pgbackend
->recover_object(
12790 ObjectContextRef(),
12791 obc
, // has snapset context
12794 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
12795 on_failed_pull({ pg_whoami
}, soid
, v
);
12801 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
,
12802 bool *work_started
)
12804 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
12805 uint64_t started
= 0;
12807 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
12809 // this is FAR from an optimal recovery order. pretty lame, really.
12810 ceph_assert(!get_acting_recovery_backfill().empty());
12811 // choose replicas to recover, replica has the shortest missing list first
12812 // so we can bring it back to normal ASAP
12813 std::vector
<std::pair
<unsigned int, pg_shard_t
>> replicas_by_num_missing
,
12814 async_by_num_missing
;
12815 replicas_by_num_missing
.reserve(get_acting_recovery_backfill().size() - 1);
12816 for (auto &p
: get_acting_recovery_backfill()) {
12817 if (p
== get_primary()) {
12820 auto pm
= recovery_state
.get_peer_missing().find(p
);
12821 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
12822 auto nm
= pm
->second
.num_missing();
12824 if (is_async_recovery_target(p
)) {
12825 async_by_num_missing
.push_back(make_pair(nm
, p
));
12827 replicas_by_num_missing
.push_back(make_pair(nm
, p
));
12831 // sort by number of missing objects, in ascending order.
12832 auto func
= [](const std::pair
<unsigned int, pg_shard_t
> &lhs
,
12833 const std::pair
<unsigned int, pg_shard_t
> &rhs
) {
12834 return lhs
.first
< rhs
.first
;
12836 // acting goes first
12837 std::sort(replicas_by_num_missing
.begin(), replicas_by_num_missing
.end(), func
);
12838 // then async_recovery_targets
12839 std::sort(async_by_num_missing
.begin(), async_by_num_missing
.end(), func
);
12840 replicas_by_num_missing
.insert(replicas_by_num_missing
.end(),
12841 async_by_num_missing
.begin(), async_by_num_missing
.end());
12842 for (auto &replica
: replicas_by_num_missing
) {
12843 pg_shard_t
&peer
= replica
.second
;
12844 ceph_assert(peer
!= get_primary());
12845 auto pm
= recovery_state
.get_peer_missing().find(peer
);
12846 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
12847 size_t m_sz
= pm
->second
.num_missing();
12849 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
12850 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
12853 const pg_missing_t
&m(pm
->second
);
12854 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
12855 p
!= m
.get_rmissing().end() && started
< max
;
12857 handle
.reset_tp_timeout();
12858 const hobject_t
soid(p
->second
);
12860 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
12861 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
12865 const pg_info_t
&pi
= recovery_state
.get_peer_info(peer
);
12866 if (soid
> pi
.last_backfill
) {
12867 if (!recovering
.count(soid
)) {
12868 derr
<< __func__
<< ": object " << soid
<< " last_backfill "
12869 << pi
.last_backfill
<< dendl
;
12870 derr
<< __func__
<< ": object added to missing set for backfill, but "
12871 << "is not in recovering, error!" << dendl
;
12877 if (recovering
.count(soid
)) {
12878 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
12882 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
12883 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
12884 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
12885 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
, work_started
);
12889 if (soid
.is_snap() &&
12890 recovery_state
.get_pg_log().get_missing().is_missing(
12891 soid
.get_head())) {
12892 dout(10) << __func__
<< ": " << soid
.get_head()
12893 << " still missing on primary" << dendl
;
12897 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
12898 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
12902 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
12903 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
12904 started
+= prep_object_replica_pushes(soid
, r
->second
.need
, h
, work_started
);
12908 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
12912 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
12914 hobject_t e
= hobject_t::get_max();
12915 for (const pg_shard_t
& peer
: get_backfill_targets()) {
12916 const auto iter
= peer_backfill_info
.find(peer
);
12917 ceph_assert(iter
!= peer_backfill_info
.end());
12918 e
= std::min(e
, iter
->second
.begin
);
12923 bool PrimaryLogPG::all_peer_done() const
12925 // Primary hasn't got any more objects
12926 ceph_assert(backfill_info
.empty());
12928 for (const pg_shard_t
& bt
: get_backfill_targets()) {
12929 const auto piter
= peer_backfill_info
.find(bt
);
12930 ceph_assert(piter
!= peer_backfill_info
.end());
12931 const BackfillInterval
& pbi
= piter
->second
;
12932 // See if peer has more to process
12933 if (!pbi
.extends_to_end() || !pbi
.empty())
12944 * backfilled: fully pushed to replica or present in replica's missing set (both
12945 * our copy and theirs).
12947 * All objects on a backfill_target in
12948 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12949 * objects have been actually deleted and all logically-valid objects are replicated.
12950 * There may be PG objects in this interval yet to be backfilled.
12952 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12953 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12955 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
12956 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12957 * interval remain on the backfill target.
12959 * For a backfill target, all objects <= peer_info[target].last_backfill
12960 * have been backfilled to target
12962 * There *MAY* be missing/outdated objects between last_backfill_started and
12963 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12964 * io created objects since the last scan. For this reason, we call
12965 * update_range() again before continuing backfill.
12967 uint64_t PrimaryLogPG::recover_backfill(
12969 ThreadPool::TPHandle
&handle
, bool *work_started
)
12971 dout(10) << __func__
<< " (" << max
<< ")"
12972 << " bft=" << get_backfill_targets()
12973 << " last_backfill_started " << last_backfill_started
12974 << (new_backfill
? " new_backfill":"")
12976 ceph_assert(!get_backfill_targets().empty());
12978 // Initialize from prior backfill state
12979 if (new_backfill
) {
12980 // on_activate() was called prior to getting here
12981 ceph_assert(last_backfill_started
== earliest_backfill());
12982 new_backfill
= false;
12984 // initialize BackfillIntervals
12985 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12986 i
!= get_backfill_targets().end();
12988 peer_backfill_info
[*i
].reset(
12989 recovery_state
.get_peer_info(*i
).last_backfill
);
12991 backfill_info
.reset(last_backfill_started
);
12993 backfills_in_flight
.clear();
12994 pending_backfill_updates
.clear();
12997 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12998 i
!= get_backfill_targets().end();
13000 dout(10) << "peer osd." << *i
13001 << " info " << recovery_state
.get_peer_info(*i
)
13002 << " interval " << peer_backfill_info
[*i
].begin
13003 << "-" << peer_backfill_info
[*i
].end
13004 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
13008 // update our local interval to cope with recent changes
13009 backfill_info
.begin
= last_backfill_started
;
13010 update_range(&backfill_info
, handle
);
13013 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
13014 set
<hobject_t
> add_to_stat
;
13016 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13017 i
!= get_backfill_targets().end();
13019 peer_backfill_info
[*i
].trim_to(
13021 recovery_state
.get_peer_info(*i
).last_backfill
,
13022 last_backfill_started
));
13024 backfill_info
.trim_to(last_backfill_started
);
13026 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13027 while (ops
< max
) {
13028 if (backfill_info
.begin
<= earliest_peer_backfill() &&
13029 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
13030 hobject_t next
= backfill_info
.end
;
13031 backfill_info
.reset(next
);
13032 backfill_info
.end
= hobject_t::get_max();
13033 update_range(&backfill_info
, handle
);
13034 backfill_info
.trim();
13037 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
13039 bool sent_scan
= false;
13040 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13041 i
!= get_backfill_targets().end();
13043 pg_shard_t bt
= *i
;
13044 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13046 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
13047 if (pbi
.begin
<= backfill_info
.begin
&&
13048 !pbi
.extends_to_end() && pbi
.empty()) {
13049 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
13050 epoch_t e
= get_osdmap_epoch();
13051 MOSDPGScan
*m
= new MOSDPGScan(
13052 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, get_last_peering_reset(),
13053 spg_t(info
.pgid
.pgid
, bt
.shard
),
13054 pbi
.end
, hobject_t());
13055 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13056 ceph_assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
13057 waiting_on_backfill
.insert(bt
);
13062 // Count simultaneous scans as a single op and let those complete
13065 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13069 if (backfill_info
.empty() && all_peer_done()) {
13070 dout(10) << " reached end for both local and all peers" << dendl
;
13074 // Get object within set of peers to operate on and
13075 // the set of targets for which that object applies.
13076 hobject_t check
= earliest_peer_backfill();
13078 if (check
< backfill_info
.begin
) {
13080 set
<pg_shard_t
> check_targets
;
13081 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13082 i
!= get_backfill_targets().end();
13084 pg_shard_t bt
= *i
;
13085 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13086 if (pbi
.begin
== check
)
13087 check_targets
.insert(bt
);
13089 ceph_assert(!check_targets
.empty());
13091 dout(20) << " BACKFILL removing " << check
13092 << " from peers " << check_targets
<< dendl
;
13093 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
13094 i
!= check_targets
.end();
13096 pg_shard_t bt
= *i
;
13097 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13098 ceph_assert(pbi
.begin
== check
);
13100 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
13104 last_backfill_started
= check
;
13106 // Don't increment ops here because deletions
13107 // are cheap and not replied to unlike real recovery_ops,
13108 // and we can't increment ops without requeueing ourself
13111 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
13113 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
13114 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13115 i
!= get_backfill_targets().end();
13117 pg_shard_t bt
= *i
;
13118 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13119 // Find all check peers that have the wrong version
13120 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
13121 if (pbi
.objects
.begin()->second
!= obj_v
) {
13122 need_ver_targs
.push_back(bt
);
13124 keep_ver_targs
.push_back(bt
);
13127 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13129 // Only include peers that we've caught up to their backfill line
13130 // otherwise, they only appear to be missing this object
13131 // because their pbi.begin > backfill_info.begin.
13132 if (backfill_info
.begin
> pinfo
.last_backfill
)
13133 missing_targs
.push_back(bt
);
13135 skip_targs
.push_back(bt
);
13139 if (!keep_ver_targs
.empty()) {
13140 // These peers have version obj_v
13141 dout(20) << " BACKFILL keeping " << check
13142 << " with ver " << obj_v
13143 << " on peers " << keep_ver_targs
<< dendl
;
13144 //assert(!waiting_for_degraded_object.count(check));
13146 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
13147 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
13149 if (obc
->get_recovery_read()) {
13150 if (!need_ver_targs
.empty()) {
13151 dout(20) << " BACKFILL replacing " << check
13152 << " with ver " << obj_v
13153 << " to peers " << need_ver_targs
<< dendl
;
13155 if (!missing_targs
.empty()) {
13156 dout(20) << " BACKFILL pushing " << backfill_info
.begin
13157 << " with ver " << obj_v
13158 << " to peers " << missing_targs
<< dendl
;
13160 vector
<pg_shard_t
> all_push
= need_ver_targs
;
13161 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
13163 handle
.reset_tp_timeout();
13164 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
13166 *work_started
= true;
13167 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
13172 *work_started
= true;
13173 dout(20) << "backfill blocking on " << backfill_info
.begin
13174 << "; could not get rw_manager lock" << dendl
;
13178 dout(20) << "need_ver_targs=" << need_ver_targs
13179 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
13180 dout(20) << "backfill_targets=" << get_backfill_targets()
13181 << " missing_targs=" << missing_targs
13182 << " skip_targs=" << skip_targs
<< dendl
;
13184 last_backfill_started
= backfill_info
.begin
;
13185 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
13186 backfill_info
.pop_front();
13187 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
13188 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
13189 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
13190 i
!= check_targets
.end();
13192 pg_shard_t bt
= *i
;
13193 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13199 hobject_t backfill_pos
=
13200 std::min(backfill_info
.begin
, earliest_peer_backfill());
13202 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
13203 i
!= add_to_stat
.end();
13205 ObjectContextRef obc
= get_object_context(*i
, false);
13208 add_object_context_to_pg_stat(obc
, &stat
);
13209 pending_backfill_updates
[*i
] = stat
;
13211 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
13212 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
13213 handle
.reset_tp_timeout();
13214 const hobject_t
& oid
= to_remove
[i
].get
<0>();
13215 eversion_t v
= to_remove
[i
].get
<1>();
13216 pg_shard_t peer
= to_remove
[i
].get
<2>();
13217 MOSDPGBackfillRemove
*m
;
13218 auto it
= reqs
.find(peer
);
13219 if (it
!= reqs
.end()) {
13222 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
13223 spg_t(info
.pgid
.pgid
, peer
.shard
),
13224 get_osdmap_epoch());
13226 m
->ls
.push_back(make_pair(oid
, v
));
13228 if (oid
<= last_backfill_started
)
13229 pending_backfill_updates
[oid
]; // add empty stat!
13231 for (auto p
: reqs
) {
13232 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
13233 get_osdmap_epoch());
13236 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
13238 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
13239 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
13240 i
!= backfills_in_flight
.end();
13242 dout(20) << *i
<< " is still in flight" << dendl
;
13245 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
13246 backfill_pos
: *(backfills_in_flight
.begin());
13247 hobject_t new_last_backfill
= earliest_backfill();
13248 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
13249 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
13250 pending_backfill_updates
.begin();
13251 i
!= pending_backfill_updates
.end() &&
13252 i
->first
< next_backfill_to_complete
;
13253 pending_backfill_updates
.erase(i
++)) {
13254 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
13255 ceph_assert(i
->first
> new_last_backfill
);
13256 recovery_state
.update_complete_backfill_object_stats(
13259 new_last_backfill
= i
->first
;
13261 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
13263 ceph_assert(!pending_backfill_updates
.empty() ||
13264 new_last_backfill
== last_backfill_started
);
13265 if (pending_backfill_updates
.empty() &&
13266 backfill_pos
.is_max()) {
13267 ceph_assert(backfills_in_flight
.empty());
13268 new_last_backfill
= backfill_pos
;
13269 last_backfill_started
= backfill_pos
;
13271 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
13273 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13274 // all the backfill targets. Otherwise, we will move last_backfill up on
13275 // those targets need it and send OP_BACKFILL_PROGRESS to them.
13276 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13277 i
!= get_backfill_targets().end();
13279 pg_shard_t bt
= *i
;
13280 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13282 if (new_last_backfill
> pinfo
.last_backfill
) {
13283 recovery_state
.update_peer_last_backfill(bt
, new_last_backfill
);
13284 epoch_t e
= get_osdmap_epoch();
13285 MOSDPGBackfill
*m
= NULL
;
13286 if (pinfo
.last_backfill
.is_max()) {
13287 m
= new MOSDPGBackfill(
13288 MOSDPGBackfill::OP_BACKFILL_FINISH
,
13290 get_last_peering_reset(),
13291 spg_t(info
.pgid
.pgid
, bt
.shard
));
13292 // Use default priority here, must match sub_op priority
13293 start_recovery_op(hobject_t::get_max());
13295 m
= new MOSDPGBackfill(
13296 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
13298 get_last_peering_reset(),
13299 spg_t(info
.pgid
.pgid
, bt
.shard
));
13300 // Use default priority here, must match sub_op priority
13302 m
->last_backfill
= pinfo
.last_backfill
;
13303 m
->stats
= pinfo
.stats
;
13304 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13305 dout(10) << " peer " << bt
13306 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
13307 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
13312 *work_started
= true;
13316 int PrimaryLogPG::prep_backfill_object_push(
13317 hobject_t oid
, eversion_t v
,
13318 ObjectContextRef obc
,
13319 vector
<pg_shard_t
> peers
,
13320 PGBackend::RecoveryHandle
*h
)
13322 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
13323 ceph_assert(!peers
.empty());
13325 backfills_in_flight
.insert(oid
);
13326 recovery_state
.prepare_backfill_for_missing(oid
, v
, peers
);
13328 ceph_assert(!recovering
.count(oid
));
13330 start_recovery_op(oid
);
13331 recovering
.insert(make_pair(oid
, obc
));
13333 int r
= pgbackend
->recover_object(
13336 ObjectContextRef(),
13340 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
13341 on_failed_pull({ pg_whoami
}, oid
, v
);
13346 void PrimaryLogPG::update_range(
13347 BackfillInterval
*bi
,
13348 ThreadPool::TPHandle
&handle
)
13350 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
13351 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
13353 if (bi
->version
< info
.log_tail
) {
13354 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
13356 bi
->version
= info
.last_update
;
13357 scan_range(local_min
, local_max
, bi
, handle
);
13360 if (bi
->version
>= projected_last_update
) {
13361 dout(10) << __func__
<< ": bi is current " << dendl
;
13362 ceph_assert(bi
->version
== projected_last_update
);
13363 } else if (bi
->version
>= info
.log_tail
) {
13364 if (recovery_state
.get_pg_log().get_log().empty() && projected_log
.empty()) {
13365 /* Because we don't move log_tail on split, the log might be
13366 * empty even if log_tail != last_update. However, the only
13367 * way to get here with an empty log is if log_tail is actually
13368 * eversion_t(), because otherwise the entry which changed
13369 * last_update since the last scan would have to be present.
13371 ceph_assert(bi
->version
== eversion_t());
13375 dout(10) << __func__
<< ": bi is old, (" << bi
->version
13376 << ") can be updated with log to projected_last_update "
13377 << projected_last_update
<< dendl
;
13379 auto func
= [&](const pg_log_entry_t
&e
) {
13380 dout(10) << __func__
<< ": updating from version " << e
.version
13382 const hobject_t
&soid
= e
.soid
;
13383 if (soid
>= bi
->begin
&&
13385 if (e
.is_update()) {
13386 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
13387 << e
.version
<< dendl
;
13388 bi
->objects
.erase(e
.soid
);
13389 bi
->objects
.insert(
13393 } else if (e
.is_delete()) {
13394 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
13395 bi
->objects
.erase(e
.soid
);
13399 dout(10) << "scanning pg log first" << dendl
;
13400 recovery_state
.get_pg_log().get_log().scan_log_after(bi
->version
, func
);
13401 dout(10) << "scanning projected log" << dendl
;
13402 projected_log
.scan_log_after(bi
->version
, func
);
13403 bi
->version
= projected_last_update
;
13405 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13409 void PrimaryLogPG::scan_range(
13410 int min
, int max
, BackfillInterval
*bi
,
13411 ThreadPool::TPHandle
&handle
)
13413 ceph_assert(is_locked());
13414 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
13415 bi
->clear_objects();
13417 vector
<hobject_t
> ls
;
13419 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
13420 ceph_assert(r
>= 0);
13421 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
13422 dout(20) << ls
<< dendl
;
13424 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
13425 handle
.reset_tp_timeout();
13426 ObjectContextRef obc
;
13428 obc
= object_contexts
.lookup(*p
);
13430 if (!obc
->obs
.exists
) {
13431 /* If the object does not exist here, it must have been removed
13432 * between the collection_list_partial and here. This can happen
13433 * for the first item in the range, which is usually last_backfill.
13437 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
13438 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
13441 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
13442 /* If the object does not exist here, it must have been removed
13443 * between the collection_list_partial and here. This can happen
13444 * for the first item in the range, which is usually last_backfill.
13449 ceph_assert(r
>= 0);
13450 object_info_t
oi(bl
);
13451 bi
->objects
[*p
] = oi
.version
;
13452 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
13460 * verifies that stray objects have been deleted
13462 void PrimaryLogPG::check_local()
13464 dout(10) << __func__
<< dendl
;
13467 info
.last_update
>=
13468 recovery_state
.get_pg_log().get_tail()); // otherwise we need some help!
13470 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
13473 // just scan the log.
13474 set
<hobject_t
> did
;
13475 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= recovery_state
.get_pg_log().get_log().log
.rbegin();
13476 p
!= recovery_state
.get_pg_log().get_log().log
.rend();
13478 if (did
.count(p
->soid
))
13480 did
.insert(p
->soid
);
13482 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
13483 dout(10) << " checking " << p
->soid
13484 << " at " << p
->version
<< dendl
;
13486 int r
= osd
->store
->stat(
13488 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
13490 if (r
!= -ENOENT
) {
13491 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
13492 << "deleted" << dendl
;
13493 ceph_abort_msg("erroneously present object");
13496 // ignore old(+missing) objects
13503 // ===========================
13506 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
13509 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
13510 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
13511 info
.pgid
.ps(), info
.pgid
.pool(),
13512 cct
->_conf
->osd_hit_set_namespace
);
13513 dout(20) << __func__
<< " " << hoid
<< dendl
;
13517 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
13522 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
13524 start
.gmtime(ss
, true /* legacy pre-octopus form */) << "_";
13525 end
.gmtime(ss
, true /* legacy pre-octopus form */);
13527 start
.localtime(ss
, true /* legacy pre-octopus form */) << "_";
13528 end
.localtime(ss
, true /* legacy pre-octopus form */);
13530 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
13531 info
.pgid
.ps(), info
.pgid
.pool(),
13532 cct
->_conf
->osd_hit_set_namespace
);
13533 dout(20) << __func__
<< " " << hoid
<< dendl
;
13537 void PrimaryLogPG::hit_set_clear()
13539 dout(20) << __func__
<< dendl
;
13541 hit_set_start_stamp
= utime_t();
13544 void PrimaryLogPG::hit_set_setup()
13546 if (!is_active() ||
13552 if (is_active() && is_primary() &&
13553 (!pool
.info
.hit_set_count
||
13554 !pool
.info
.hit_set_period
||
13555 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
13558 // only primary is allowed to remove all the hit set objects
13559 hit_set_remove_all();
13563 // FIXME: discard any previous data for now
13566 // include any writes we know about from the pg log. this doesn't
13567 // capture reads, but it is better than nothing!
13568 hit_set_apply_log();
13571 void PrimaryLogPG::hit_set_remove_all()
13573 // If any archives are degraded we skip this
13574 for (auto p
= info
.hit_set
.history
.begin();
13575 p
!= info
.hit_set
.history
.end();
13577 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13579 // Once we hit a degraded object just skip
13580 if (is_degraded_or_backfilling_object(aoid
))
13582 if (write_blocked_by_scrub(aoid
))
13586 if (!info
.hit_set
.history
.empty()) {
13587 auto p
= info
.hit_set
.history
.rbegin();
13588 ceph_assert(p
!= info
.hit_set
.history
.rend());
13589 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13590 ceph_assert(!is_degraded_or_backfilling_object(oid
));
13591 ObjectContextRef obc
= get_object_context(oid
, false);
13594 OpContextUPtr ctx
= simple_opc_create(obc
);
13595 ctx
->at_version
= get_next_version();
13596 ctx
->updated_hset_history
= info
.hit_set
;
13597 utime_t now
= ceph_clock_now();
13599 hit_set_trim(ctx
, 0);
13600 simple_opc_submit(std::move(ctx
));
13603 recovery_state
.update_hset(pg_hit_set_history_t());
13605 agent_state
->discard_hit_sets();
13609 void PrimaryLogPG::hit_set_create()
13611 utime_t now
= ceph_clock_now();
13612 // make a copy of the params to modify
13613 HitSet::Params
params(pool
.info
.hit_set_params
);
13615 dout(20) << __func__
<< " " << params
<< dendl
;
13616 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
13617 BloomHitSet::Params
*p
=
13618 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
13620 // convert false positive rate so it holds up across the full period
13621 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
13622 if (p
->get_fpp() <= 0.0)
13623 p
->set_fpp(.01); // fpp cannot be zero!
13625 // if we don't have specified size, estimate target size based on the
13627 if (p
->target_size
== 0 && hit_set
) {
13628 utime_t dur
= now
- hit_set_start_stamp
;
13629 unsigned unique
= hit_set
->approx_unique_insert_count();
13630 dout(20) << __func__
<< " previous set had approx " << unique
13631 << " unique items over " << dur
<< " seconds" << dendl
;
13632 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
13635 if (p
->target_size
<
13636 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
13637 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
13640 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
13641 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
13643 p
->seed
= now
.sec();
13645 dout(10) << __func__
<< " target_size " << p
->target_size
13646 << " fpp " << p
->get_fpp() << dendl
;
13648 hit_set
.reset(new HitSet(params
));
13649 hit_set_start_stamp
= now
;
13653 * apply log entries to set
13655 * this would only happen after peering, to at least capture writes
13656 * during an interval that was potentially lost.
13658 bool PrimaryLogPG::hit_set_apply_log()
13663 eversion_t to
= info
.last_update
;
13664 eversion_t from
= info
.hit_set
.current_last_update
;
13666 dout(20) << __func__
<< " no update" << dendl
;
13670 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
13671 list
<pg_log_entry_t
>::const_reverse_iterator p
=
13672 recovery_state
.get_pg_log().get_log().log
.rbegin();
13673 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> to
)
13675 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> from
) {
13676 hit_set
->insert(p
->soid
);
13683 void PrimaryLogPG::hit_set_persist()
13685 dout(10) << __func__
<< dendl
;
13687 unsigned max
= pool
.info
.hit_set_count
;
13689 utime_t now
= ceph_clock_now();
13692 // If any archives are degraded we skip this persist request
13693 // account for the additional entry being added below
13694 for (auto p
= info
.hit_set
.history
.begin();
13695 p
!= info
.hit_set
.history
.end();
13697 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13699 // Once we hit a degraded object just skip further trim
13700 if (is_degraded_or_backfilling_object(aoid
))
13702 if (write_blocked_by_scrub(aoid
))
13706 // If backfill is in progress and we could possibly overlap with the
13707 // hit_set_* objects, back off. Since these all have
13708 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13709 // look just at that. This is necessary because our transactions
13710 // may include a modify of the new hit_set *and* a delete of the
13711 // old one, and this may span the backfill boundary.
13712 for (set
<pg_shard_t
>::const_iterator p
= get_backfill_targets().begin();
13713 p
!= get_backfill_targets().end();
13715 const pg_info_t
& pi
= recovery_state
.get_peer_info(*p
);
13716 if (pi
.last_backfill
== hobject_t() ||
13717 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
13718 dout(10) << __func__
<< " backfill target osd." << *p
13719 << " last_backfill has not progressed past pgid ps"
13726 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
13727 new_hset
.begin
= hit_set_start_stamp
;
13728 new_hset
.end
= now
;
13729 oid
= get_hit_set_archive_object(
13732 new_hset
.using_gmt
);
13734 // If the current object is degraded we skip this persist request
13735 if (write_blocked_by_scrub(oid
))
13739 encode(*hit_set
, bl
);
13740 dout(20) << __func__
<< " archive " << oid
<< dendl
;
13743 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
13744 uint32_t size
= agent_state
->hit_set_map
.size();
13745 if (size
>= pool
.info
.hit_set_count
) {
13746 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
13748 hit_set_in_memory_trim(size
);
13751 ObjectContextRef obc
= get_object_context(oid
, true);
13752 OpContextUPtr ctx
= simple_opc_create(obc
);
13754 ctx
->at_version
= get_next_version();
13755 ctx
->updated_hset_history
= info
.hit_set
;
13756 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
13758 updated_hit_set_hist
.current_last_update
= info
.last_update
;
13759 new_hset
.version
= ctx
->at_version
;
13761 updated_hit_set_hist
.history
.push_back(new_hset
);
13764 // fabricate an object_info_t and SnapSet
13765 obc
->obs
.oi
.version
= ctx
->at_version
;
13766 obc
->obs
.oi
.mtime
= now
;
13767 obc
->obs
.oi
.size
= bl
.length();
13768 obc
->obs
.exists
= true;
13769 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
13771 ctx
->new_obs
= obc
->obs
;
13773 ctx
->new_snapset
= obc
->ssc
->snapset
;
13775 ctx
->delta_stats
.num_objects
++;
13776 ctx
->delta_stats
.num_objects_hit_set_archive
++;
13778 ctx
->delta_stats
.num_bytes
+= bl
.length();
13779 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
13782 encode(ctx
->new_snapset
, bss
);
13783 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
13784 encode(ctx
->new_obs
.oi
, boi
,
13785 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
13787 ctx
->op_t
->create(oid
);
13789 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
13790 write_update_size_and_usage(ctx
->delta_stats
, obc
->obs
.oi
, ctx
->modified_ranges
,
13792 ctx
->clean_regions
.mark_data_region_dirty(0, bl
.length());
13794 map
<string
, bufferlist
> attrs
;
13795 attrs
[OI_ATTR
].claim(boi
);
13796 attrs
[SS_ATTR
].claim(bss
);
13797 setattrs_maybe_cache(ctx
->obc
, ctx
->op_t
.get(), attrs
);
13798 ctx
->log
.push_back(
13800 pg_log_entry_t::MODIFY
,
13809 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
13811 hit_set_trim(ctx
, max
);
13813 simple_opc_submit(std::move(ctx
));
13816 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
13818 ceph_assert(ctx
->updated_hset_history
);
13819 pg_hit_set_history_t
&updated_hit_set_hist
=
13820 *(ctx
->updated_hset_history
);
13821 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
13822 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
13823 ceph_assert(p
!= updated_hit_set_hist
.history
.end());
13824 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13826 ceph_assert(!is_degraded_or_backfilling_object(oid
));
13828 dout(20) << __func__
<< " removing " << oid
<< dendl
;
13829 ++ctx
->at_version
.version
;
13830 ctx
->log
.push_back(
13831 pg_log_entry_t(pg_log_entry_t::DELETE
,
13840 ctx
->op_t
->remove(oid
);
13841 updated_hit_set_hist
.history
.pop_front();
13843 ObjectContextRef obc
= get_object_context(oid
, false);
13845 --ctx
->delta_stats
.num_objects
;
13846 --ctx
->delta_stats
.num_objects_hit_set_archive
;
13847 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
13848 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
13852 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
13854 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
13855 agent_state
->remove_oldest_hit_set();
13860 // =======================================
13863 void PrimaryLogPG::agent_setup()
13865 ceph_assert(is_locked());
13866 if (!is_active() ||
13868 state_test(PG_STATE_PREMERGE
) ||
13869 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
13870 pool
.info
.tier_of
< 0 ||
13871 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
13875 if (!agent_state
) {
13876 agent_state
.reset(new TierAgentState
);
13878 // choose random starting position
13879 agent_state
->position
= hobject_t();
13880 agent_state
->position
.pool
= info
.pgid
.pool();
13881 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
13884 agent_state
->start
= agent_state
->position
;
13886 dout(10) << __func__
<< " allocated new state, position "
13887 << agent_state
->position
<< dendl
;
13889 dout(10) << __func__
<< " keeping existing state" << dendl
;
13892 if (info
.stats
.stats_invalid
) {
13893 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
13896 agent_choose_mode();
13899 void PrimaryLogPG::agent_clear()
13902 agent_state
.reset(NULL
);
13905 // Return false if no objects operated on since start of object hash space
13906 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
13908 std::scoped_lock locker
{*this};
13909 if (!agent_state
) {
13910 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
13914 ceph_assert(!recovery_state
.is_deleting());
13916 if (agent_state
->is_idle()) {
13917 dout(10) << __func__
<< " idle, stopping" << dendl
;
13921 osd
->logger
->inc(l_osd_agent_wake
);
13923 dout(10) << __func__
13924 << " max " << start_max
13925 << ", flush " << agent_state
->get_flush_mode_name()
13926 << ", evict " << agent_state
->get_evict_mode_name()
13927 << ", pos " << agent_state
->position
13929 ceph_assert(is_primary());
13930 ceph_assert(is_active());
13932 agent_load_hit_sets();
13934 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
13935 ceph_assert(base_pool
);
13938 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
13940 // list some objects. this conveniently lists clones (oldest to
13941 // newest) before heads... the same order we want to flush in.
13943 // NOTE: do not flush the Sequencer. we will assume that the
13944 // listing we get back is imprecise.
13945 vector
<hobject_t
> ls
;
13947 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
13949 ceph_assert(r
>= 0);
13950 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
13952 for (vector
<hobject_t
>::iterator p
= ls
.begin();
13955 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
13956 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
13957 osd
->logger
->inc(l_osd_agent_skip
);
13960 if (is_degraded_or_backfilling_object(*p
)) {
13961 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
13962 osd
->logger
->inc(l_osd_agent_skip
);
13965 if (is_missing_object(p
->get_head())) {
13966 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
13967 osd
->logger
->inc(l_osd_agent_skip
);
13970 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
13972 // we didn't flush; we may miss something here.
13973 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
13974 osd
->logger
->inc(l_osd_agent_skip
);
13977 if (!obc
->obs
.exists
) {
13978 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
13979 osd
->logger
->inc(l_osd_agent_skip
);
13982 if (range_intersects_scrub(obc
->obs
.oi
.soid
,
13983 obc
->obs
.oi
.soid
.get_head())) {
13984 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
13985 osd
->logger
->inc(l_osd_agent_skip
);
13988 if (obc
->is_blocked()) {
13989 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
13990 osd
->logger
->inc(l_osd_agent_skip
);
13993 if (obc
->is_request_pending()) {
13994 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
13995 osd
->logger
->inc(l_osd_agent_skip
);
13999 // be careful flushing omap to an EC pool.
14000 if (!base_pool
->supports_omap() &&
14001 obc
->obs
.oi
.is_omap()) {
14002 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
14003 osd
->logger
->inc(l_osd_agent_skip
);
14007 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
14008 agent_maybe_evict(obc
, false))
14010 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
14011 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
14013 --agent_flush_quota
;
14015 if (started
>= start_max
) {
14016 // If finishing early, set "next" to the next object
14017 if (++p
!= ls
.end())
14023 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
14024 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
14025 agent_state
->hist_age
= 0;
14026 agent_state
->temp_hist
.decay();
14029 // Total objects operated on so far
14030 int total_started
= agent_state
->started
+ started
;
14031 bool need_delay
= false;
14033 dout(20) << __func__
<< " start pos " << agent_state
->position
14034 << " next start pos " << next
14035 << " started " << total_started
<< dendl
;
14037 // See if we've made a full pass over the object hash space
14038 // This might check at most ls_max objects a second time to notice that
14039 // we've checked every objects at least once.
14040 if (agent_state
->position
< agent_state
->start
&&
14041 next
>= agent_state
->start
) {
14042 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
14043 if (total_started
== 0)
14047 agent_state
->start
= next
;
14049 agent_state
->started
= total_started
;
14051 // See if we are starting from beginning
14053 agent_state
->position
= hobject_t();
14055 agent_state
->position
= next
;
14057 // Discard old in memory HitSets
14058 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
14061 ceph_assert(agent_state
->delaying
== false);
14065 agent_choose_mode();
14069 void PrimaryLogPG::agent_load_hit_sets()
14071 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
14075 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
14076 dout(10) << __func__
<< dendl
;
14077 for (auto p
= info
.hit_set
.history
.begin();
14078 p
!= info
.hit_set
.history
.end(); ++p
) {
14079 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
14080 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
14081 << p
->end
<< dendl
;
14082 if (!pool
.info
.is_replicated()) {
14083 // FIXME: EC not supported here yet
14084 derr
<< __func__
<< " on non-replicated pool" << dendl
;
14088 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14089 if (is_unreadable_object(oid
)) {
14090 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
14094 ObjectContextRef obc
= get_object_context(oid
, false);
14096 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
14102 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
14103 ceph_assert(r
>= 0);
14105 HitSetRef
hs(new HitSet
);
14106 bufferlist::const_iterator pbl
= bl
.begin();
14108 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
14114 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
14116 if (!obc
->obs
.oi
.is_dirty()) {
14117 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
14118 osd
->logger
->inc(l_osd_agent_skip
);
14121 if (obc
->obs
.oi
.is_cache_pinned()) {
14122 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14123 osd
->logger
->inc(l_osd_agent_skip
);
14127 utime_t now
= ceph_clock_now();
14128 utime_t ob_local_mtime
;
14129 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14130 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14132 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14134 bool evict_mode_full
=
14135 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
14136 if (!evict_mode_full
&&
14137 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
14138 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
14139 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14140 osd
->logger
->inc(l_osd_agent_skip
);
14144 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
14145 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
14146 osd
->logger
->inc(l_osd_agent_skip
);
14150 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
14152 // FIXME: flush anything dirty, regardless of what distribution of
14155 hobject_t oid
= obc
->obs
.oi
.soid
;
14156 osd
->agent_start_op(oid
);
14157 // no need to capture a pg ref, can't outlive fop or ctx
14158 std::function
<void()> on_flush
= [this, oid
]() {
14159 osd
->agent_finish_op(oid
);
14162 int result
= start_flush(
14163 OpRequestRef(), obc
, false, NULL
,
14165 if (result
!= -EINPROGRESS
) {
14167 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
14168 << " with " << result
<< dendl
;
14169 osd
->logger
->inc(l_osd_agent_skip
);
14173 osd
->logger
->inc(l_osd_agent_flush
);
14177 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
14179 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
14180 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
14181 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
14184 // This is already checked by agent_work() which passes after_flush = false
14185 if (after_flush
&& range_intersects_scrub(soid
, soid
.get_head())) {
14186 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14189 if (!obc
->obs
.oi
.watchers
.empty()) {
14190 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
14193 if (obc
->is_blocked()) {
14194 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14197 if (obc
->obs
.oi
.is_cache_pinned()) {
14198 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14202 if (soid
.snap
== CEPH_NOSNAP
) {
14203 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
14205 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
14210 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
14211 // is this object old than cache_min_evict_age?
14212 utime_t now
= ceph_clock_now();
14213 utime_t ob_local_mtime
;
14214 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14215 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14217 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14219 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
14220 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14221 osd
->logger
->inc(l_osd_agent_skip
);
14224 // is this object old and/or cold enough?
14226 uint64_t temp_upper
= 0, temp_lower
= 0;
14228 agent_estimate_temp(soid
, &temp
);
14229 agent_state
->temp_hist
.add(temp
);
14230 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
14232 dout(20) << __func__
14233 << " temp " << temp
14234 << " pos " << temp_lower
<< "-" << temp_upper
14235 << ", evict_effort " << agent_state
->evict_effort
14237 dout(30) << "agent_state:\n";
14238 Formatter
*f
= Formatter::create("");
14239 f
->open_object_section("agent_state");
14240 agent_state
->dump(f
);
14241 f
->close_section();
14246 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
14250 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
14251 OpContextUPtr ctx
= simple_opc_create(obc
);
14253 auto null_op_req
= OpRequestRef();
14254 if (!ctx
->lock_manager
.get_lock_type(
14259 close_op_ctx(ctx
.release());
14260 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
14264 osd
->agent_start_evict_op();
14265 ctx
->register_on_finish(
14267 osd
->agent_finish_evict_op();
14270 ctx
->at_version
= get_next_version();
14271 ceph_assert(ctx
->new_obs
.exists
);
14272 int r
= _delete_oid(ctx
.get(), true, false);
14273 if (obc
->obs
.oi
.is_omap())
14274 ctx
->delta_stats
.num_objects_omap
--;
14275 ctx
->delta_stats
.num_evict
++;
14276 ctx
->delta_stats
.num_evict_kb
+= shift_round_up(obc
->obs
.oi
.size
, 10);
14277 if (obc
->obs
.oi
.is_dirty())
14278 --ctx
->delta_stats
.num_objects_dirty
;
14279 ceph_assert(r
== 0);
14280 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
);
14281 simple_opc_submit(std::move(ctx
));
14282 osd
->logger
->inc(l_osd_tier_evict
);
14283 osd
->logger
->inc(l_osd_agent_evict
);
14287 void PrimaryLogPG::agent_stop()
14289 dout(20) << __func__
<< dendl
;
14290 if (agent_state
&& !agent_state
->is_idle()) {
14291 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
14292 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
14293 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
14297 void PrimaryLogPG::agent_delay()
14299 dout(20) << __func__
<< dendl
;
14300 if (agent_state
&& !agent_state
->is_idle()) {
14301 ceph_assert(agent_state
->delaying
== false);
14302 agent_state
->delaying
= true;
14303 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
14307 void PrimaryLogPG::agent_choose_mode_restart()
14309 dout(20) << __func__
<< dendl
;
14310 std::scoped_lock locker
{*this};
14311 if (agent_state
&& agent_state
->delaying
) {
14312 agent_state
->delaying
= false;
14313 agent_choose_mode(true);
14317 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
14319 bool requeued
= false;
14320 // Let delay play out
14321 if (agent_state
->delaying
) {
14322 dout(20) << __func__
<< " " << this << " delaying, ignored" << dendl
;
14326 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
14327 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
14328 unsigned evict_effort
= 0;
14330 if (info
.stats
.stats_invalid
) {
14331 // idle; stats can't be trusted until we scrub.
14332 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
14337 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
14338 ceph_assert(divisor
> 0);
14340 // adjust (effective) user objects down based on the number
14341 // of HitSet objects, which should not count toward our total since
14342 // they cannot be flushed.
14343 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
14345 // also exclude omap objects if ec backing pool
14346 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14347 ceph_assert(base_pool
);
14348 if (!base_pool
->supports_omap())
14349 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
14351 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
14352 if (num_user_objects
> unflushable
)
14353 num_user_objects
-= unflushable
;
14355 num_user_objects
= 0;
14357 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
14358 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
14359 num_user_bytes
-= unflushable_bytes
;
14360 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
14361 num_user_bytes
+= num_overhead_bytes
;
14363 // also reduce the num_dirty by num_objects_omap
14364 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
14365 if (!base_pool
->supports_omap()) {
14366 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
14367 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
14372 dout(10) << __func__
14374 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
14376 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
14377 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
14378 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
14379 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
14380 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
14381 << " num_dirty: " << num_dirty
14382 << " num_user_objects: " << num_user_objects
14383 << " num_user_bytes: " << num_user_bytes
14384 << " num_overhead_bytes: " << num_overhead_bytes
14385 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
14386 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
14389 // get dirty, full ratios
14390 uint64_t dirty_micro
= 0;
14391 uint64_t full_micro
= 0;
14392 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
14393 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
14395 num_dirty
* avg_size
* 1000000 /
14396 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
14398 num_user_objects
* avg_size
* 1000000 /
14399 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
14401 if (pool
.info
.target_max_objects
> 0) {
14402 uint64_t dirty_objects_micro
=
14403 num_dirty
* 1000000 /
14404 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
14405 if (dirty_objects_micro
> dirty_micro
)
14406 dirty_micro
= dirty_objects_micro
;
14407 uint64_t full_objects_micro
=
14408 num_user_objects
* 1000000 /
14409 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
14410 if (full_objects_micro
> full_micro
)
14411 full_micro
= full_objects_micro
;
14413 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
14414 << " full " << ((float)full_micro
/ 1000000.0)
14418 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
14419 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
14420 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
14421 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
14422 flush_target
+= flush_slop
;
14423 flush_high_target
+= flush_slop
;
14425 flush_target
-= std::min(flush_target
, flush_slop
);
14426 flush_high_target
-= std::min(flush_high_target
, flush_slop
);
14429 if (dirty_micro
> flush_high_target
) {
14430 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
14431 } else if (dirty_micro
> flush_target
|| (!flush_target
&& num_dirty
> 0)) {
14432 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
14436 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
14437 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
14438 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
14439 evict_target
+= evict_slop
;
14441 evict_target
-= std::min(evict_target
, evict_slop
);
14443 if (full_micro
> 1000000) {
14444 // evict anything clean
14445 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
14446 evict_effort
= 1000000;
14447 } else if (full_micro
> evict_target
) {
14448 // set effort in [0..1] range based on where we are between
14449 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
14450 uint64_t over
= full_micro
- evict_target
;
14451 uint64_t span
= 1000000 - evict_target
;
14452 evict_effort
= std::max(over
* 1000000 / span
,
14453 uint64_t(1000000.0 *
14454 cct
->_conf
->osd_agent_min_evict_effort
));
14456 // quantize effort to avoid too much reordering in the agent_queue.
14457 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
14458 ceph_assert(inc
> 0);
14459 uint64_t was
= evict_effort
;
14460 evict_effort
-= evict_effort
% inc
;
14461 if (evict_effort
< inc
)
14462 evict_effort
= inc
;
14463 ceph_assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
14464 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
14469 bool old_idle
= agent_state
->is_idle();
14470 if (flush_mode
!= agent_state
->flush_mode
) {
14471 dout(5) << __func__
<< " flush_mode "
14472 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
14474 << TierAgentState::get_flush_mode_name(flush_mode
)
14476 recovery_state
.update_stats(
14477 [=](auto &history
, auto &stats
) {
14478 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
14479 osd
->agent_inc_high_count();
14480 stats
.stats
.sum
.num_flush_mode_high
= 1;
14481 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
14482 stats
.stats
.sum
.num_flush_mode_low
= 1;
14484 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
14485 osd
->agent_dec_high_count();
14486 stats
.stats
.sum
.num_flush_mode_high
= 0;
14487 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
14488 stats
.stats
.sum
.num_flush_mode_low
= 0;
14492 agent_state
->flush_mode
= flush_mode
;
14494 if (evict_mode
!= agent_state
->evict_mode
) {
14495 dout(5) << __func__
<< " evict_mode "
14496 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
14498 << TierAgentState::get_evict_mode_name(evict_mode
)
14500 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
14504 requeue_ops(waiting_for_flush
);
14505 requeue_ops(waiting_for_active
);
14506 requeue_ops(waiting_for_readable
);
14507 requeue_ops(waiting_for_scrub
);
14508 requeue_ops(waiting_for_cache_not_full
);
14509 objects_blocked_on_cache_full
.clear();
14512 recovery_state
.update_stats(
14513 [=](auto &history
, auto &stats
) {
14514 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
14515 stats
.stats
.sum
.num_evict_mode_some
= 1;
14516 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
14517 stats
.stats
.sum
.num_evict_mode_full
= 1;
14519 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
14520 stats
.stats
.sum
.num_evict_mode_some
= 0;
14521 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
14522 stats
.stats
.sum
.num_evict_mode_full
= 0;
14526 agent_state
->evict_mode
= evict_mode
;
14528 uint64_t old_effort
= agent_state
->evict_effort
;
14529 if (evict_effort
!= agent_state
->evict_effort
) {
14530 dout(5) << __func__
<< " evict_effort "
14531 << ((float)agent_state
->evict_effort
/ 1000000.0)
14533 << ((float)evict_effort
/ 1000000.0)
14535 agent_state
->evict_effort
= evict_effort
;
14538 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14539 // (including flush). This is probably fine (they should be
14540 // correlated) but it is not precisely correct.
14541 if (agent_state
->is_idle()) {
14542 if (!restart
&& !old_idle
) {
14543 osd
->agent_disable_pg(this, old_effort
);
14546 if (restart
|| old_idle
) {
14547 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
14548 } else if (old_effort
!= agent_state
->evict_effort
) {
14549 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
14555 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
14557 ceph_assert(hit_set
);
14560 if (hit_set
->contains(oid
))
14563 int last_n
= pool
.info
.hit_set_search_last_n
;
14564 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
14565 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
14566 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
14567 if (p
->second
->contains(oid
)) {
14568 *temp
+= pool
.info
.get_grade(i
);
14574 // Dup op detection
14576 bool PrimaryLogPG::already_complete(eversion_t v
)
14578 dout(20) << __func__
<< ": " << v
<< dendl
;
14579 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
14582 dout(20) << __func__
<< ": " << **i
<< dendl
;
14583 // skip copy from temp object ops
14584 if ((*i
)->v
== eversion_t()) {
14585 dout(20) << __func__
<< ": " << **i
14586 << " version is empty" << dendl
;
14590 dout(20) << __func__
<< ": " << **i
14591 << " (*i)->v past v" << dendl
;
14594 if (!(*i
)->all_committed
) {
14595 dout(20) << __func__
<< ": " << **i
14596 << " not committed, returning false"
14601 dout(20) << __func__
<< ": returning true" << dendl
;
14606 // ==========================================================================================
14610 bool PrimaryLogPG::_range_available_for_scrub(
14611 const hobject_t
&begin
, const hobject_t
&end
)
14613 pair
<hobject_t
, ObjectContextRef
> next
;
14614 next
.second
= object_contexts
.lookup(begin
);
14615 next
.first
= begin
;
14617 while (more
&& next
.first
< end
) {
14618 if (next
.second
&& next
.second
->is_blocked()) {
14619 next
.second
->requeue_scrub_on_unblock
= true;
14620 dout(10) << __func__
<< ": scrub delayed, "
14621 << next
.first
<< " is blocked"
14625 more
= object_contexts
.get_next(next
.first
, &next
);
14630 static bool doing_clones(const std::optional
<SnapSet
> &snapset
,
14631 const vector
<snapid_t
>::reverse_iterator
&curclone
) {
14632 return snapset
&& curclone
!= snapset
->clones
.rend();
14635 void PrimaryLogPG::log_missing(unsigned missing
,
14636 const std::optional
<hobject_t
> &head
,
14637 LogChannelRef clog
,
14641 bool allow_incomplete_clones
)
14644 if (allow_incomplete_clones
) {
14645 dout(20) << func
<< " " << mode
<< " " << pgid
<< " " << *head
14646 << " skipped " << missing
<< " clone(s) in cache tier" << dendl
;
14648 clog
->info() << mode
<< " " << pgid
<< " " << *head
14649 << " : " << missing
<< " missing clone(s)";
14653 unsigned PrimaryLogPG::process_clones_to(const std::optional
<hobject_t
> &head
,
14654 const std::optional
<SnapSet
> &snapset
,
14655 LogChannelRef clog
,
14658 bool allow_incomplete_clones
,
14659 std::optional
<snapid_t
> target
,
14660 vector
<snapid_t
>::reverse_iterator
*curclone
,
14661 inconsistent_snapset_wrapper
&e
)
14664 ceph_assert(snapset
);
14665 unsigned missing
= 0;
14667 // NOTE: clones are in descending order, thus **curclone > target test here
14668 hobject_t
next_clone(*head
);
14669 while(doing_clones(snapset
, *curclone
) && (!target
|| **curclone
> *target
)) {
14671 // it is okay to be missing one or more clones in a cache tier.
14672 // skip higher-numbered clones in the list.
14673 if (!allow_incomplete_clones
) {
14674 next_clone
.snap
= **curclone
;
14675 clog
->error() << mode
<< " " << pgid
<< " " << *head
14676 << " : expected clone " << next_clone
<< " " << missing
14678 ++scrubber
.shallow_errors
;
14679 e
.set_clone_missing(next_clone
.snap
);
14681 // Clones are descending
14688 * Validate consistency of the object info and snap sets.
14690 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
14691 * the comparison of the objects is against multiple snapset.clones. There are
14692 * multiple clone lists and in between lists we expect head.
14698 * obj1 snap 1 head, unexpected obj1 snap 1
14699 * obj2 head head, match
14700 * [SnapSet clones 6 4 2 1]
14701 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14702 * obj2 snap 6 obj2 snap 6, match
14703 * obj2 snap 4 obj2 snap 4, match
14704 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
14705 * [Snapset clones 3 1]
14706 * obj3 snap 3 obj3 snap 3 match
14707 * obj3 snap 1 obj3 snap 1 match
14708 * obj4 head head, match
14709 * [Snapset clones 4]
14710 * EOL obj4 snap 4, (expected)
14712 void PrimaryLogPG::scrub_snapshot_metadata(
14713 ScrubMap
&scrubmap
,
14714 const map
<hobject_t
,
14715 pair
<std::optional
<uint32_t>,
14716 std::optional
<uint32_t>>> &missing_digest
)
14718 dout(10) << __func__
<< dendl
;
14720 bool repair
= state_test(PG_STATE_REPAIR
);
14721 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
14722 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
14723 std::optional
<snapid_t
> all_clones
; // Unspecified snapid_t or std::nullopt
14725 // traverse in reverse order.
14726 std::optional
<hobject_t
> head
;
14727 std::optional
<SnapSet
> snapset
; // If initialized so will head (above)
14728 vector
<snapid_t
>::reverse_iterator curclone
; // Defined only if snapset initialized
14729 unsigned missing
= 0;
14730 inconsistent_snapset_wrapper soid_error
, head_error
;
14731 unsigned soid_error_count
= 0;
14733 for (map
<hobject_t
,ScrubMap::object
>::reverse_iterator
14734 p
= scrubmap
.objects
.rbegin(); p
!= scrubmap
.objects
.rend(); ++p
) {
14735 const hobject_t
& soid
= p
->first
;
14736 ceph_assert(!soid
.is_snapdir());
14737 soid_error
= inconsistent_snapset_wrapper
{soid
};
14738 object_stat_sum_t stat
;
14739 std::optional
<object_info_t
> oi
;
14741 stat
.num_objects
++;
14743 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
14744 stat
.num_objects_hit_set_archive
++;
14746 if (soid
.is_snap()) {
14748 stat
.num_object_clones
++;
14752 if (p
->second
.attrs
.count(OI_ATTR
) == 0) {
14754 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14755 << " : no '" << OI_ATTR
<< "' attr";
14756 ++scrubber
.shallow_errors
;
14757 soid_error
.set_info_missing();
14760 bv
.push_back(p
->second
.attrs
[OI_ATTR
]);
14762 oi
= object_info_t(); // Initialize optional<> before decode into it
14764 } catch (buffer::error
& e
) {
14766 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14767 << " : can't decode '" << OI_ATTR
<< "' attr " << e
.what();
14768 ++scrubber
.shallow_errors
;
14769 soid_error
.set_info_corrupted();
14770 soid_error
.set_info_missing(); // Not available too
14775 if (pgbackend
->be_get_ondisk_size(oi
->size
) != p
->second
.size
) {
14776 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14777 << " : on disk size (" << p
->second
.size
14778 << ") does not match object info size ("
14779 << oi
->size
<< ") adjusted for ondisk to ("
14780 << pgbackend
->be_get_ondisk_size(oi
->size
)
14782 soid_error
.set_size_mismatch();
14783 ++scrubber
.shallow_errors
;
14786 dout(20) << mode
<< " " << soid
<< " " << *oi
<< dendl
;
14788 // A clone num_bytes will be added later when we have snapset
14789 if (!soid
.is_snap()) {
14790 stat
.num_bytes
+= oi
->size
;
14792 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
14793 stat
.num_bytes_hit_set_archive
+= oi
->size
;
14795 if (oi
->is_dirty())
14796 ++stat
.num_objects_dirty
;
14797 if (oi
->is_whiteout())
14798 ++stat
.num_whiteouts
;
14800 ++stat
.num_objects_omap
;
14801 if (oi
->is_cache_pinned())
14802 ++stat
.num_objects_pinned
;
14803 if (oi
->has_manifest())
14804 ++stat
.num_objects_manifest
;
14807 // Check for any problems while processing clones
14808 if (doing_clones(snapset
, curclone
)) {
14809 std::optional
<snapid_t
> target
;
14810 // Expecting an object with snap for current head
14811 if (soid
.has_snapset() || soid
.get_head() != head
->get_head()) {
14813 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
<< " new object "
14814 << soid
<< " while processing " << *head
<< dendl
;
14816 target
= all_clones
;
14818 ceph_assert(soid
.is_snap());
14819 target
= soid
.snap
;
14822 // Log any clones we were expecting to be there up to target
14823 // This will set missing, but will be a no-op if snap.soid == *curclone.
14824 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
14825 pool
.info
.allow_incomplete_clones(), target
, &curclone
,
14829 // Check doing_clones() again in case we ran process_clones_to()
14830 if (doing_clones(snapset
, curclone
)) {
14831 // A head would have processed all clones above
14832 // or all greater than *curclone.
14833 ceph_assert(soid
.is_snap() && *curclone
<= soid
.snap
);
14835 // After processing above clone snap should match the expected curclone
14836 expected
= (*curclone
== soid
.snap
);
14838 // If we aren't doing clones any longer, then expecting head
14839 expected
= soid
.has_snapset();
14842 // If we couldn't read the head's snapset, just ignore clones
14843 if (head
&& !snapset
) {
14844 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14845 << " : clone ignored due to missing snapset";
14847 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14848 << " : is an unexpected clone";
14850 ++scrubber
.shallow_errors
;
14851 soid_error
.set_headless();
14852 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
14853 ++soid_error_count
;
14854 if (head
&& soid
.get_head() == head
->get_head())
14855 head_error
.set_clone(soid
.snap
);
14860 if (soid
.has_snapset()) {
14863 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
, mode
,
14864 pool
.info
.allow_incomplete_clones());
14867 // Save previous head error information
14868 if (head
&& (head_error
.errors
|| soid_error_count
))
14869 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
14870 // Set this as a new head object
14873 head_error
= soid_error
;
14874 soid_error_count
= 0;
14876 dout(20) << __func__
<< " " << mode
<< " new head " << head
<< dendl
;
14878 if (p
->second
.attrs
.count(SS_ATTR
) == 0) {
14879 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14880 << " : no '" << SS_ATTR
<< "' attr";
14881 ++scrubber
.shallow_errors
;
14882 snapset
= std::nullopt
;
14883 head_error
.set_snapset_missing();
14886 bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
14887 auto blp
= bl
.cbegin();
14889 snapset
= SnapSet(); // Initialize optional<> before decoding into it
14890 decode(*snapset
, blp
);
14891 head_error
.ss_bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
14892 } catch (buffer::error
& e
) {
14893 snapset
= std::nullopt
;
14894 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14895 << " : can't decode '" << SS_ATTR
<< "' attr " << e
.what();
14896 ++scrubber
.shallow_errors
;
14897 head_error
.set_snapset_corrupted();
14902 // what will be next?
14903 curclone
= snapset
->clones
.rbegin();
14905 if (!snapset
->clones
.empty()) {
14906 dout(20) << " snapset " << *snapset
<< dendl
;
14907 if (snapset
->seq
== 0) {
14908 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14909 << " : snaps.seq not set";
14910 ++scrubber
.shallow_errors
;
14911 head_error
.set_snapset_error();
14916 ceph_assert(soid
.is_snap());
14918 ceph_assert(snapset
);
14919 ceph_assert(soid
.snap
== *curclone
);
14921 dout(20) << __func__
<< " " << mode
<< " matched clone " << soid
<< dendl
;
14923 if (snapset
->clone_size
.count(soid
.snap
) == 0) {
14924 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14925 << " : is missing in clone_size";
14926 ++scrubber
.shallow_errors
;
14927 soid_error
.set_size_mismatch();
14929 if (oi
&& oi
->size
!= snapset
->clone_size
[soid
.snap
]) {
14930 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14931 << " : size " << oi
->size
<< " != clone_size "
14932 << snapset
->clone_size
[*curclone
];
14933 ++scrubber
.shallow_errors
;
14934 soid_error
.set_size_mismatch();
14937 if (snapset
->clone_overlap
.count(soid
.snap
) == 0) {
14938 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14939 << " : is missing in clone_overlap";
14940 ++scrubber
.shallow_errors
;
14941 soid_error
.set_size_mismatch();
14943 // This checking is based on get_clone_bytes(). The first 2 asserts
14944 // can't happen because we know we have a clone_size and
14945 // a clone_overlap. Now we check that the interval_set won't
14946 // cause the last assert.
14947 uint64_t size
= snapset
->clone_size
.find(soid
.snap
)->second
;
14948 const interval_set
<uint64_t> &overlap
=
14949 snapset
->clone_overlap
.find(soid
.snap
)->second
;
14950 bool bad_interval_set
= false;
14951 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
14952 i
!= overlap
.end(); ++i
) {
14953 if (size
< i
.get_len()) {
14954 bad_interval_set
= true;
14957 size
-= i
.get_len();
14960 if (bad_interval_set
) {
14961 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14962 << " : bad interval_set in clone_overlap";
14963 ++scrubber
.shallow_errors
;
14964 soid_error
.set_size_mismatch();
14966 stat
.num_bytes
+= snapset
->get_clone_bytes(soid
.snap
);
14973 if (soid_error
.errors
) {
14974 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
14975 ++soid_error_count
;
14979 scrub_cstat
.add(stat
);
14982 if (doing_clones(snapset
, curclone
)) {
14983 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
14984 << " No more objects while processing " << *head
<< dendl
;
14986 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
14987 pool
.info
.allow_incomplete_clones(), all_clones
, &curclone
,
14990 // There could be missing found by the test above or even
14991 // before dropping out of the loop for the last head.
14993 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
,
14994 mode
, pool
.info
.allow_incomplete_clones());
14996 if (head
&& (head_error
.errors
|| soid_error_count
))
14997 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
14999 for (auto p
= missing_digest
.begin(); p
!= missing_digest
.end(); ++p
) {
15000 ceph_assert(!p
->first
.is_snapdir());
15001 dout(10) << __func__
<< " recording digests for " << p
->first
<< dendl
;
15002 ObjectContextRef obc
= get_object_context(p
->first
, false);
15004 osd
->clog
->error() << info
.pgid
<< " " << mode
15005 << " cannot get object context for object "
15008 } else if (obc
->obs
.oi
.soid
!= p
->first
) {
15009 osd
->clog
->error() << info
.pgid
<< " " << mode
15011 << " : object has a valid oi attr with a mismatched name, "
15012 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
15015 OpContextUPtr ctx
= simple_opc_create(obc
);
15016 ctx
->at_version
= get_next_version();
15017 ctx
->mtime
= utime_t(); // do not update mtime
15018 if (p
->second
.first
) {
15019 ctx
->new_obs
.oi
.set_data_digest(*p
->second
.first
);
15021 ctx
->new_obs
.oi
.clear_data_digest();
15023 if (p
->second
.second
) {
15024 ctx
->new_obs
.oi
.set_omap_digest(*p
->second
.second
);
15026 ctx
->new_obs
.oi
.clear_omap_digest();
15028 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
15030 ctx
->register_on_success(
15032 dout(20) << "updating scrub digest" << dendl
;
15033 if (--scrubber
.num_digest_updates_pending
== 0) {
15038 simple_opc_submit(std::move(ctx
));
15039 ++scrubber
.num_digest_updates_pending
;
15042 dout(10) << __func__
<< " (" << mode
<< ") finish" << dendl
;
15045 void PrimaryLogPG::_scrub_clear_state()
15047 scrub_cstat
= object_stat_collection_t();
15050 void PrimaryLogPG::_scrub_finish()
15052 bool repair
= state_test(PG_STATE_REPAIR
);
15053 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
15054 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
15056 if (info
.stats
.stats_invalid
) {
15057 recovery_state
.update_stats(
15058 [=](auto &history
, auto &stats
) {
15059 stats
.stats
= scrub_cstat
;
15060 stats
.stats_invalid
= false;
15065 agent_choose_mode();
15068 dout(10) << mode
<< " got "
15069 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
15070 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
15071 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
15072 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
15073 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
15074 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
15075 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
15076 << scrub_cstat
.sum
.num_objects_manifest
<< "/" << info
.stats
.stats
.sum
.num_objects_manifest
<< " manifest objects, "
15077 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes."
15080 if (scrub_cstat
.sum
.num_objects
!= info
.stats
.stats
.sum
.num_objects
||
15081 scrub_cstat
.sum
.num_object_clones
!= info
.stats
.stats
.sum
.num_object_clones
||
15082 (scrub_cstat
.sum
.num_objects_dirty
!= info
.stats
.stats
.sum
.num_objects_dirty
&&
15083 !info
.stats
.dirty_stats_invalid
) ||
15084 (scrub_cstat
.sum
.num_objects_omap
!= info
.stats
.stats
.sum
.num_objects_omap
&&
15085 !info
.stats
.omap_stats_invalid
) ||
15086 (scrub_cstat
.sum
.num_objects_pinned
!= info
.stats
.stats
.sum
.num_objects_pinned
&&
15087 !info
.stats
.pin_stats_invalid
) ||
15088 (scrub_cstat
.sum
.num_objects_hit_set_archive
!= info
.stats
.stats
.sum
.num_objects_hit_set_archive
&&
15089 !info
.stats
.hitset_stats_invalid
) ||
15090 (scrub_cstat
.sum
.num_bytes_hit_set_archive
!= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
&&
15091 !info
.stats
.hitset_bytes_stats_invalid
) ||
15092 (scrub_cstat
.sum
.num_objects_manifest
!= info
.stats
.stats
.sum
.num_objects_manifest
&&
15093 !info
.stats
.manifest_stats_invalid
) ||
15094 scrub_cstat
.sum
.num_whiteouts
!= info
.stats
.stats
.sum
.num_whiteouts
||
15095 scrub_cstat
.sum
.num_bytes
!= info
.stats
.stats
.sum
.num_bytes
) {
15096 osd
->clog
->error() << info
.pgid
<< " " << mode
15097 << " : stat mismatch, got "
15098 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
15099 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
15100 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
15101 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
15102 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
15103 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
15104 << scrub_cstat
.sum
.num_whiteouts
<< "/" << info
.stats
.stats
.sum
.num_whiteouts
<< " whiteouts, "
15105 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
15106 << scrub_cstat
.sum
.num_objects_manifest
<< "/" << info
.stats
.stats
.sum
.num_objects_manifest
<< " manifest objects, "
15107 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes.";
15108 ++scrubber
.shallow_errors
;
15112 recovery_state
.update_stats(
15113 [this](auto &history
, auto &stats
) {
15114 stats
.stats
= scrub_cstat
;
15115 stats
.dirty_stats_invalid
= false;
15116 stats
.omap_stats_invalid
= false;
15117 stats
.hitset_stats_invalid
= false;
15118 stats
.hitset_bytes_stats_invalid
= false;
15119 stats
.pin_stats_invalid
= false;
15120 stats
.manifest_stats_invalid
= false;
15123 publish_stats_to_osd();
15124 recovery_state
.share_pg_info();
15127 // Clear object context cache to get repair information
15129 object_contexts
.clear();
15132 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpContext
*ctx
)
15134 OpRequestRef op
= ctx
->op
;
15135 // Only supports replicated pools
15136 ceph_assert(!pool
.info
.is_erasure());
15137 ceph_assert(is_primary());
15139 dout(10) << __func__
<< " " << soid
15140 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl
;
15143 block_for_clean(soid
, op
);
15147 ceph_assert(!recovery_state
.get_pg_log().get_missing().is_missing(soid
));
15148 auto& oi
= ctx
->new_obs
.oi
;
15149 eversion_t v
= oi
.version
;
15151 if (primary_error(soid
, v
)) {
15152 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
15153 // XXX: If we knew that there is no down osd which could include this
15154 // object, it would be nice if we could return EIO here.
15155 // If a "never fail" flag was available, that could be used
15156 // for rbd to NOT return EIO until object marked lost.
15158 // Drop through to save this op in case an osd comes up with the object.
15161 // Restart the op after object becomes readable again
15162 waiting_for_unreadable_object
[soid
].push_back(op
);
15163 op
->mark_delayed("waiting for missing object");
15165 if (!eio_errors_to_process
) {
15166 eio_errors_to_process
= true;
15167 ceph_assert(is_clean());
15168 state_set(PG_STATE_REPAIR
);
15169 state_clear(PG_STATE_CLEAN
);
15170 queue_peering_event(
15172 std::make_shared
<PGPeeringEvent
>(
15173 get_osdmap_epoch(),
15174 get_osdmap_epoch(),
15175 PeeringState::DoRecovery())));
15177 // A prior error must have already cleared clean state and queued recovery
15178 // or a map change has triggered re-peering.
15179 // Not inlining the recovery by calling maybe_kick_recovery(soid);
15180 dout(5) << __func__
<< ": Read error on " << soid
<< ", but already seen errors" << dendl
;
15186 /*---SnapTrimmer Logging---*/
15188 #define dout_prefix pg->gen_prefix(*_dout)
15190 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
15192 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
15195 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
15197 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
15200 /*---SnapTrimmer states---*/
15202 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15203 << "SnapTrimmer state<" << get_state_name() << ">: ")
15206 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
15208 NamedState(nullptr, "NotTrimming")
15210 context
< SnapTrimmer
>().log_enter(state_name
);
15213 void PrimaryLogPG::NotTrimming::exit()
15215 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
15218 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
15220 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15221 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
15223 if (!(pg
->is_primary() && pg
->is_active())) {
15224 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
15225 return discard_event();
15227 if (!pg
->is_clean() ||
15228 pg
->snap_trimq
.empty()) {
15229 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
15230 return discard_event();
15232 if (pg
->scrubber
.active
) {
15233 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
15234 return transit
< WaitScrub
>();
15236 return transit
< Trimming
>();
15240 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
15242 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15243 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
15246 if (!context
< SnapTrimmer
>().can_trim()) {
15247 post_event(KickTrim());
15248 return transit
< NotTrimming
>();
15251 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
15252 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
15253 << pg
->snap_trimq
.range_start()
15255 return transit
< AwaitAsyncWork
>();
15258 /* AwaitAsyncWork */
15259 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
15261 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15263 auto *pg
= context
< SnapTrimmer
>().pg
;
15264 context
< SnapTrimmer
>().log_enter(state_name
);
15265 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
15266 pg
->state_set(PG_STATE_SNAPTRIM
);
15267 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
15268 pg
->publish_stats_to_osd();
15271 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
15273 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
15274 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
15275 auto &in_flight
= context
<Trimming
>().in_flight
;
15276 ceph_assert(in_flight
.empty());
15278 ceph_assert(pg
->is_primary() && pg
->is_active());
15279 if (!context
< SnapTrimmer
>().can_trim()) {
15280 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
15281 post_event(KickTrim());
15282 return transit
< NotTrimming
>();
15285 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
15287 vector
<hobject_t
> to_trim
;
15288 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
15289 to_trim
.reserve(max
);
15290 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
15294 if (r
!= 0 && r
!= -ENOENT
) {
15295 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
15296 << cpp_strerror(r
) << dendl
;
15297 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15298 } else if (r
== -ENOENT
) {
15300 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
15302 pg
->snap_trimq
.erase(snap_to_trim
);
15304 if (pg
->snap_trimq_repeat
.count(snap_to_trim
)) {
15305 ldout(pg
->cct
, 10) << " removing from snap_trimq_repeat" << dendl
;
15306 pg
->snap_trimq_repeat
.erase(snap_to_trim
);
15308 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
15309 << " to purged_snaps"
15311 ObjectStore::Transaction t
;
15312 pg
->recovery_state
.adjust_purged_snaps(
15313 [snap_to_trim
](auto &purged_snaps
) {
15314 purged_snaps
.insert(snap_to_trim
);
15316 pg
->write_if_dirty(t
);
15318 ldout(pg
->cct
, 10) << "purged_snaps now "
15319 << pg
->info
.purged_snaps
<< ", snap_trimq now "
15320 << pg
->snap_trimq
<< dendl
;
15322 int tr
= pg
->osd
->store
->queue_transaction(pg
->ch
, std::move(t
), NULL
);
15323 ceph_assert(tr
== 0);
15325 pg
->recovery_state
.share_pg_info();
15327 post_event(KickTrim());
15328 return transit
< NotTrimming
>();
15330 ceph_assert(!to_trim
.empty());
15332 for (auto &&object
: to_trim
) {
15334 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
15336 int error
= pg
->trim_object(in_flight
.empty(), object
, snap_to_trim
, &ctx
);
15338 if (error
== -ENOLCK
) {
15339 ldout(pg
->cct
, 10) << "could not get write lock on obj "
15340 << object
<< dendl
;
15342 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
15343 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
15345 if (!in_flight
.empty()) {
15346 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
15347 return transit
< WaitRepops
>();
15349 if (error
== -ENOLCK
) {
15350 ldout(pg
->cct
, 10) << "waiting for it to clear"
15352 return transit
< WaitRWLock
>();
15354 return transit
< NotTrimming
>();
15358 in_flight
.insert(object
);
15359 ctx
->register_on_success(
15360 [pg
, object
, &in_flight
]() {
15361 ceph_assert(in_flight
.find(object
) != in_flight
.end());
15362 in_flight
.erase(object
);
15363 if (in_flight
.empty()) {
15364 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
15365 pg
->snap_trimmer_machine
.process_event(Reset());
15367 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
15372 pg
->simple_opc_submit(std::move(ctx
));
15375 return transit
< WaitRepops
>();
15378 void PrimaryLogPG::setattr_maybe_cache(
15379 ObjectContextRef obc
,
15384 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
15387 void PrimaryLogPG::setattrs_maybe_cache(
15388 ObjectContextRef obc
,
15390 map
<string
, bufferlist
> &attrs
)
15392 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
15395 void PrimaryLogPG::rmattr_maybe_cache(
15396 ObjectContextRef obc
,
15400 t
->rmattr(obc
->obs
.oi
.soid
, key
);
15403 int PrimaryLogPG::getattr_maybe_cache(
15404 ObjectContextRef obc
,
15408 if (pool
.info
.is_erasure()) {
15409 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
15410 if (i
!= obc
->attr_cache
.end()) {
15418 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
15421 int PrimaryLogPG::getattrs_maybe_cache(
15422 ObjectContextRef obc
,
15423 map
<string
, bufferlist
> *out
)
15427 if (pool
.info
.is_erasure()) {
15428 *out
= obc
->attr_cache
;
15430 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
15432 map
<string
, bufferlist
> tmp
;
15433 for (map
<string
, bufferlist
>::iterator i
= out
->begin();
15436 if (i
->first
.size() > 1 && i
->first
[0] == '_')
15437 tmp
[i
->first
.substr(1, i
->first
.size())].claim(i
->second
);
15443 bool PrimaryLogPG::check_failsafe_full() {
15444 return osd
->check_failsafe_full(get_dpp());
15447 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
15448 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
15450 #ifdef PG_DEBUG_REFS
15451 uint64_t get_with_id(PrimaryLogPG
*pg
) { return pg
->get_with_id(); }
15452 void put_with_id(PrimaryLogPG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
15455 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
15456 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }