// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
18 #include "boost/tuple/tuple.hpp"
19 #include "boost/intrusive_ptr.hpp"
21 #include "PrimaryLogPG.h"
23 #include "OpRequest.h"
24 #include "ScrubStore.h"
26 #include "objclass/objclass.h"
28 #include "common/errno.h"
29 #include "common/scrub_types.h"
30 #include "common/perf_counters.h"
32 #include "messages/MOSDOp.h"
33 #include "messages/MOSDBackoff.h"
34 #include "messages/MOSDPGTrim.h"
35 #include "messages/MOSDPGScan.h"
36 #include "messages/MOSDRepScrub.h"
37 #include "messages/MOSDPGBackfill.h"
38 #include "messages/MOSDPGBackfillRemove.h"
39 #include "messages/MOSDPGUpdateLogMissing.h"
40 #include "messages/MOSDPGUpdateLogMissingReply.h"
41 #include "messages/MCommandReply.h"
42 #include "messages/MOSDScrubReserve.h"
43 #include "mds/inode_backtrace.h" // Ugh
44 #include "common/EventTrace.h"
46 #include "common/config.h"
47 #include "include/compat.h"
48 #include "mon/MonClient.h"
49 #include "osdc/Objecter.h"
50 #include "json_spirit/json_spirit_value.h"
51 #include "json_spirit/json_spirit_reader.h"
52 #include "include/ceph_assert.h" // json_spirit clobbers it
53 #include "include/rados/rados_types.hpp"
56 #include "tracing/osd.h"
58 #define tracepoint(...)
61 #define dout_context cct
62 #define dout_subsys ceph_subsys_osd
63 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
65 #define dout_prefix _prefix(_dout, this)
67 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
68 return pg
->gen_prefix(*_dout
);
77 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
79 PGLSFilter::PGLSFilter() : cct(nullptr)
83 PGLSFilter::~PGLSFilter()
88 * The CopyCallback class defines an interface for completions to the
89 * copy_start code. Users of the copy infrastructure must implement
90 * one and give an instance of the class to start_copy.
92 * The implementer is responsible for making sure that the CopyCallback
93 * can associate itself with the correct copy operation.
95 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
99 * results.get<0>() is the return code: 0 for success; -ECANCELED if
100 * the operation was cancelled by the local OSD; -errno for other issues.
101 * results.get<1>() is a pointer to a CopyResults object, which you are
102 * responsible for deleting.
104 void finish(CopyCallbackResults results_
) override
= 0;
107 /// Provide the final size of the copied object to the CopyCallback
108 ~CopyCallback() override
{}
111 template <typename T
>
112 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
114 unique_ptr
<GenContext
<T
>> c
;
117 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
118 : pg(pg
), c(c
), e(e
) {}
119 void finish(T t
) override
{
121 if (pg
->pg_has_reset_since(e
))
124 c
.release()->complete(t
);
127 bool sync_finish(T t
) {
128 // we assume here all blessed/wrapped Contexts can complete synchronously.
129 c
.release()->complete(t
);
134 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
135 GenContext
<ThreadPool::TPHandle
&> *c
) {
136 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
137 this, c
, get_osdmap_epoch());
140 template <typename T
>
141 class PrimaryLogPG::UnlockedBlessedGenContext
: public GenContext
<T
> {
143 unique_ptr
<GenContext
<T
>> c
;
146 UnlockedBlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
147 : pg(pg
), c(c
), e(e
) {}
148 void finish(T t
) override
{
149 if (pg
->pg_has_reset_since(e
))
152 c
.release()->complete(t
);
154 bool sync_finish(T t
) {
155 // we assume here all blessed/wrapped Contexts can complete synchronously.
156 c
.release()->complete(t
);
161 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_unlocked_gencontext(
162 GenContext
<ThreadPool::TPHandle
&> *c
) {
163 return new UnlockedBlessedGenContext
<ThreadPool::TPHandle
&>(
164 this, c
, get_osdmap_epoch());
167 class PrimaryLogPG::BlessedContext
: public Context
{
169 unique_ptr
<Context
> c
;
172 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
173 : pg(pg
), c(c
), e(e
) {}
174 void finish(int r
) override
{
176 if (pg
->pg_has_reset_since(e
))
179 c
.release()->complete(r
);
182 bool sync_finish(int r
) {
183 // we assume here all blessed/wrapped Contexts can complete synchronously.
184 c
.release()->complete(r
);
189 Context
*PrimaryLogPG::bless_context(Context
*c
) {
190 return new BlessedContext(this, c
, get_osdmap_epoch());
193 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
197 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
199 void finish(int r
) override
{
200 pg
->object_context_destructor_callback(obc
);
204 struct OnReadComplete
: public Context
{
206 PrimaryLogPG::OpContext
*opcontext
;
209 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
210 void finish(int r
) override
{
211 opcontext
->finish_read(pg
);
213 ~OnReadComplete() override
{}
216 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
218 ObjectContextRef obc
;
220 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
222 bool sync_finish(int r
) override
{
223 pg
->_applied_recovered_object(obc
);
226 void finish(int r
) override
{
228 pg
->_applied_recovered_object(obc
);
233 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
236 eversion_t last_complete
;
238 C_OSD_CommittedPushedObject(
239 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
240 pg(p
), epoch(epoch
), last_complete(lc
) {
242 void finish(int r
) override
{
243 pg
->_committed_pushed_object(epoch
, last_complete
);
247 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
250 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
252 bool sync_finish(int r
) override
{
253 pg
->_applied_recovered_object_replica();
256 void finish(int r
) override
{
258 pg
->_applied_recovered_object_replica();
264 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
267 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
268 pair
<bufferlist
*, Context
*> > > in
;
269 in
.swap(pending_async_reads
);
270 pg
->pgbackend
->objects_read_async(
273 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
275 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
277 ceph_assert(inflightreads
> 0);
279 if (async_reads_complete()) {
280 ceph_assert(pg
->in_progress_async_reads
.size());
281 ceph_assert(pg
->in_progress_async_reads
.front().second
== this);
282 pg
->in_progress_async_reads
.pop_front();
284 // Restart the op context now that all reads have been
285 // completed. Read failures will be handled by the op finisher
286 pg
->execute_ctx(this);
290 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
292 PrimaryLogPG::CopyResults
*results
= nullptr;
293 PrimaryLogPG::OpContext
*ctx
;
296 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
297 : ctx(ctx
), osd_op(osd_op
) {
299 ~CopyFromCallback() override
{}
301 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
302 results
= results_
.get
<1>();
303 int r
= results_
.get
<0>();
305 // for finish_copyfrom
306 ctx
->user_at_version
= results
->user_version
;
309 ctx
->pg
->execute_ctx(ctx
);
311 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
313 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
314 } else if (results
->should_requeue
) {
316 ctx
->pg
->requeue_op(ctx
->op
);
318 ctx
->pg
->close_op_ctx(ctx
);
322 bool is_temp_obj_used() {
323 return results
->started_temp_obj
;
325 uint64_t get_data_size() {
326 return results
->object_size
;
330 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
331 CopyFromCallback
*copy_from_callback
;
333 explicit CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
334 : copy_from_callback(copy_from_callback
) {
337 int execute() override
{
338 // instance will be destructed after this method completes
339 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
344 // ======================
345 // PGBackend::Listener
347 void PrimaryLogPG::on_local_recover(
348 const hobject_t
&hoid
,
349 const ObjectRecoveryInfo
&_recovery_info
,
350 ObjectContextRef obc
,
352 ObjectStore::Transaction
*t
355 dout(10) << __func__
<< ": " << hoid
<< dendl
;
357 ObjectRecoveryInfo
recovery_info(_recovery_info
);
358 clear_object_snap_mapping(t
, hoid
);
359 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
360 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
362 dout(20) << " snapset " << recovery_info
.ss
<< dendl
;
363 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
364 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
365 snaps
.insert(p
->second
.begin(), p
->second
.end());
366 dout(20) << " snaps " << snaps
<< dendl
;
372 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
375 if (!is_delete
&& pg_log
.get_missing().is_missing(recovery_info
.soid
) &&
376 pg_log
.get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
377 ceph_assert(is_primary());
378 const pg_log_entry_t
*latest
= pg_log
.get_log().objects
.find(recovery_info
.soid
)->second
;
379 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
380 latest
->reverting_to
== recovery_info
.version
) {
381 dout(10) << " got old revert version " << recovery_info
.version
382 << " for " << *latest
<< dendl
;
383 recovery_info
.version
= latest
->version
;
384 // update the attr to the revert event version
385 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
386 recovery_info
.oi
.version
= latest
->version
;
388 encode(recovery_info
.oi
, bl
,
389 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
390 ceph_assert(!pool
.info
.is_erasure());
391 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
393 obc
->attr_cache
[OI_ATTR
] = bl
;
397 // keep track of active pushes for scrub
400 if (recovery_info
.version
> pg_log
.get_can_rollback_to()) {
401 /* This can only happen during a repair, and even then, it would
402 * be one heck of a race. If we are repairing the object, the
403 * write in question must be fully committed, so it's not valid
404 * to roll it back anyway (and we'll be rolled forward shortly
406 PGLogEntryHandler h
{this, t
};
407 pg_log
.roll_forward_to(recovery_info
.version
, &h
);
409 recover_got(recovery_info
.soid
, recovery_info
.version
);
413 obc
->obs
.exists
= true;
415 bool got
= obc
->get_recovery_read();
418 ceph_assert(recovering
.count(obc
->obs
.oi
.soid
));
419 recovering
[obc
->obs
.oi
.soid
] = obc
;
420 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
423 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
425 publish_stats_to_osd();
426 ceph_assert(missing_loc
.needs_recovery(hoid
));
428 missing_loc
.add_location(hoid
, pg_whoami
);
429 release_backoffs(hoid
);
430 if (!is_unreadable_object(hoid
)) {
431 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
432 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
433 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
434 requeue_ops(unreadable_object_entry
->second
);
435 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
439 t
->register_on_applied(
440 new C_OSD_AppliedRecoveredObjectReplica(this));
444 t
->register_on_commit(
445 new C_OSD_CommittedPushedObject(
448 info
.last_complete
));
455 void PrimaryLogPG::on_global_recover(
456 const hobject_t
&soid
,
457 const object_stat_sum_t
&stat_diff
,
460 info
.stats
.stats
.sum
.add(stat_diff
);
461 missing_loc
.recovered(soid
);
462 publish_stats_to_osd();
463 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
464 map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.find(soid
);
465 ceph_assert(i
!= recovering
.end());
467 if (i
->second
&& i
->second
->rwstate
.recovery_read_marker
) {
468 // recover missing won't have had an obc, but it gets filled in
469 // during on_local_recover
470 ceph_assert(i
->second
);
471 list
<OpRequestRef
> requeue_list
;
472 i
->second
->drop_recovery_read(&requeue_list
);
473 requeue_ops(requeue_list
);
476 backfills_in_flight
.erase(soid
);
479 finish_recovery_op(soid
);
480 release_backoffs(soid
);
481 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
482 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
483 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
484 requeue_ops(degraded_object_entry
->second
);
485 waiting_for_degraded_object
.erase(degraded_object_entry
);
487 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
488 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
489 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
490 requeue_ops(unreadable_object_entry
->second
);
491 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
493 finish_degraded_object(soid
);
496 void PrimaryLogPG::on_peer_recover(
498 const hobject_t
&soid
,
499 const ObjectRecoveryInfo
&recovery_info
)
501 publish_stats_to_osd();
503 peer_missing
[peer
].got(soid
, recovery_info
.version
);
504 missing_loc
.add_location(soid
, peer
);
507 void PrimaryLogPG::begin_peer_recover(
509 const hobject_t soid
)
511 peer_missing
[peer
].revise_have(soid
, eversion_t());
514 void PrimaryLogPG::schedule_recovery_work(
515 GenContext
<ThreadPool::TPHandle
&> *c
)
517 osd
->queue_recovery_context(this, c
);
520 void PrimaryLogPG::send_message_osd_cluster(
521 int peer
, Message
*m
, epoch_t from_epoch
)
523 osd
->send_message_osd_cluster(peer
, m
, from_epoch
);
526 void PrimaryLogPG::send_message_osd_cluster(
527 Message
*m
, Connection
*con
)
529 osd
->send_message_osd_cluster(m
, con
);
532 void PrimaryLogPG::send_message_osd_cluster(
533 Message
*m
, const ConnectionRef
& con
)
535 osd
->send_message_osd_cluster(m
, con
);
538 void PrimaryLogPG::on_primary_error(
539 const hobject_t
&oid
,
542 dout(0) << __func__
<< ": oid " << oid
<< " version " << v
<< dendl
;
544 primary_error(oid
, v
);
545 backfill_add_missing(oid
, v
);
548 void PrimaryLogPG::backfill_add_missing(
549 const hobject_t
&oid
,
552 dout(0) << __func__
<< ": oid " << oid
<< " version " << v
<< dendl
;
553 backfills_in_flight
.erase(oid
);
554 missing_loc
.add_missing(oid
, v
, eversion_t());
557 bool PrimaryLogPG::should_send_op(
559 const hobject_t
&hoid
) {
560 if (peer
== get_primary())
562 ceph_assert(peer_info
.count(peer
));
564 hoid
.pool
!= (int64_t)info
.pgid
.pool() ||
565 hoid
<= last_backfill_started
||
566 hoid
<= peer_info
[peer
].last_backfill
;
568 ceph_assert(is_backfill_targets(peer
));
569 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
570 << ", object " << hoid
571 << " beyond std::max(last_backfill_started "
572 << ", peer_info[peer].last_backfill "
573 << peer_info
[peer
].last_backfill
<< ")" << dendl
;
576 if (async_recovery_targets
.count(peer
) && peer_missing
[peer
].is_missing(hoid
)) {
578 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
579 << ", object " << hoid
580 << " which is pending recovery in async_recovery_targets" << dendl
;
586 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
587 int peer
, epoch_t from_epoch
)
589 return osd
->get_con_osd_cluster(peer
, from_epoch
);
592 PerfCounters
*PrimaryLogPG::get_logger()
598 // ====================
601 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
603 return pg_log
.get_missing().get_items().count(soid
);
606 void PrimaryLogPG::maybe_kick_recovery(
607 const hobject_t
&soid
)
610 bool work_started
= false;
611 if (!missing_loc
.needs_recovery(soid
, &v
))
614 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
615 if (p
!= recovering
.end()) {
616 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
617 } else if (missing_loc
.is_unfound(soid
)) {
618 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
620 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
621 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
622 if (is_missing_object(soid
)) {
623 recover_missing(soid
, v
, cct
->_conf
->osd_client_op_priority
, h
);
624 } else if (missing_loc
.is_deleted(soid
)) {
625 prep_object_replica_deletes(soid
, v
, h
, &work_started
);
627 prep_object_replica_pushes(soid
, v
, h
, &work_started
);
629 pgbackend
->run_recovery_op(h
, cct
->_conf
->osd_client_op_priority
);
633 void PrimaryLogPG::wait_for_unreadable_object(
634 const hobject_t
& soid
, OpRequestRef op
)
636 ceph_assert(is_unreadable_object(soid
));
637 maybe_kick_recovery(soid
);
638 waiting_for_unreadable_object
[soid
].push_back(op
);
639 op
->mark_delayed("waiting for missing object");
642 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
644 /* The conditions below may clear (on_local_recover, before we queue
645 * the transaction) before we actually requeue the degraded waiters
646 * in on_global_recover after the transaction completes.
648 if (waiting_for_degraded_object
.count(soid
))
650 if (pg_log
.get_missing().get_items().count(soid
))
652 ceph_assert(!acting_recovery_backfill
.empty());
653 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
654 i
!= acting_recovery_backfill
.end();
656 if (*i
== get_primary()) continue;
657 pg_shard_t peer
= *i
;
658 auto peer_missing_entry
= peer_missing
.find(peer
);
659 // If an object is missing on an async_recovery_target, return false.
660 // This will not block the op and the object is async recovered later.
661 if (peer_missing_entry
!= peer_missing
.end() &&
662 peer_missing_entry
->second
.get_items().count(soid
)) {
663 if (async_recovery_targets
.count(peer
))
668 // Object is degraded if after last_backfill AND
669 // we are backfilling it
670 if (is_backfill_targets(peer
) &&
671 peer_info
[peer
].last_backfill
<= soid
&&
672 last_backfill_started
>= soid
&&
673 backfills_in_flight
.count(soid
))
679 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t
& soid
)
681 for (auto &i
: async_recovery_targets
) {
682 auto peer_missing_entry
= peer_missing
.find(i
);
683 if (peer_missing_entry
!= peer_missing
.end() &&
684 peer_missing_entry
->second
.get_items().count(soid
)) {
685 dout(30) << __func__
<< " " << soid
<< dendl
;
692 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
694 ceph_assert(is_degraded_or_backfilling_object(soid
) || is_degraded_on_async_recovery_target(soid
));
696 maybe_kick_recovery(soid
);
697 waiting_for_degraded_object
[soid
].push_back(op
);
698 op
->mark_delayed("waiting for degraded object");
701 void PrimaryLogPG::block_write_on_full_cache(
702 const hobject_t
& _oid
, OpRequestRef op
)
704 const hobject_t oid
= _oid
.get_head();
705 dout(20) << __func__
<< ": blocking object " << oid
706 << " on full cache" << dendl
;
707 objects_blocked_on_cache_full
.insert(oid
);
708 waiting_for_cache_not_full
.push_back(op
);
709 op
->mark_delayed("waiting for cache not full");
712 void PrimaryLogPG::block_for_clean(
713 const hobject_t
& oid
, OpRequestRef op
)
715 dout(20) << __func__
<< ": blocking object " << oid
716 << " on primary repair" << dendl
;
717 waiting_for_clean_to_primary_repair
.push_back(op
);
718 op
->mark_delayed("waiting for clean to repair");
721 void PrimaryLogPG::block_write_on_snap_rollback(
722 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
724 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
725 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
726 // otherwise, we'd have blocked in do_op
727 ceph_assert(oid
.is_head());
728 ceph_assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
729 objects_blocked_on_snap_promotion
[oid
] = obc
;
730 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
733 void PrimaryLogPG::block_write_on_degraded_snap(
734 const hobject_t
& snap
, OpRequestRef op
)
736 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
737 << " on degraded snap " << snap
<< dendl
;
738 // otherwise, we'd have blocked in do_op
739 ceph_assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
740 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
741 wait_for_degraded_object(snap
, op
);
744 bool PrimaryLogPG::maybe_await_blocked_head(
745 const hobject_t
&hoid
,
748 ObjectContextRef obc
;
749 obc
= object_contexts
.lookup(hoid
.get_head());
751 if (obc
->is_blocked()) {
752 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
761 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
763 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
764 waiting_for_blocked_object
[soid
].push_back(op
);
765 op
->mark_delayed("waiting for blocked object");
768 void PrimaryLogPG::maybe_force_recovery()
770 // no force if not in degraded/recovery/backfill states
771 if (!is_degraded() &&
772 !state_test(PG_STATE_RECOVERING
|
773 PG_STATE_RECOVERY_WAIT
|
774 PG_STATE_BACKFILLING
|
775 PG_STATE_BACKFILL_WAIT
|
776 PG_STATE_BACKFILL_TOOFULL
))
779 if (pg_log
.get_log().approx_size() <
780 cct
->_conf
->osd_max_pg_log_entries
*
781 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
784 // find the oldest missing object
785 version_t min_version
= pg_log
.get_log().head
.version
;
787 if (!pg_log
.get_missing().get_rmissing().empty()) {
788 min_version
= pg_log
.get_missing().get_rmissing().begin()->first
;
789 soid
= pg_log
.get_missing().get_rmissing().begin()->second
;
791 ceph_assert(!acting_recovery_backfill
.empty());
792 for (set
<pg_shard_t
>::iterator it
= acting_recovery_backfill
.begin();
793 it
!= acting_recovery_backfill
.end();
795 if (*it
== get_primary()) continue;
796 pg_shard_t peer
= *it
;
797 auto it_missing
= peer_missing
.find(peer
);
798 if (it_missing
!= peer_missing
.end() &&
799 !it_missing
->second
.get_rmissing().empty()) {
800 const auto& min_obj
= peer_missing
[peer
].get_rmissing().begin();
801 dout(20) << __func__
<< " peer " << peer
<< " min_version " << min_obj
->first
802 << " oid " << min_obj
->second
<< dendl
;
803 if (min_version
> min_obj
->first
) {
804 min_version
= min_obj
->first
;
805 soid
= min_obj
->second
;
811 if (soid
!= hobject_t())
812 maybe_kick_recovery(soid
);
815 class PGLSPlainFilter
: public PGLSFilter
{
818 int init(bufferlist::const_iterator
¶ms
) override
821 decode(xattr
, params
);
823 } catch (buffer::error
&e
) {
829 ~PGLSPlainFilter() override
{}
830 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
831 bufferlist
& outdata
) override
;
834 class PGLSParentFilter
: public PGLSFilter
{
835 inodeno_t parent_ino
;
838 explicit PGLSParentFilter(CephContext
* cct
) : cct(cct
) {
841 int init(bufferlist::const_iterator
¶ms
) override
844 decode(parent_ino
, params
);
845 } catch (buffer::error
&e
) {
848 generic_dout(0) << "parent_ino=" << parent_ino
<< dendl
;
852 ~PGLSParentFilter() override
{}
853 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
854 bufferlist
& outdata
) override
;
857 bool PGLSParentFilter::filter(const hobject_t
&obj
,
858 bufferlist
& xattr_data
, bufferlist
& outdata
)
860 auto iter
= xattr_data
.cbegin();
861 inode_backtrace_t bt
;
863 generic_dout(0) << "PGLSParentFilter::filter" << dendl
;
867 vector
<inode_backpointer_t
>::iterator vi
;
868 for (vi
= bt
.ancestors
.begin(); vi
!= bt
.ancestors
.end(); ++vi
) {
869 generic_dout(0) << "vi->dirino=" << vi
->dirino
<< " parent_ino=" << parent_ino
<< dendl
;
870 if (vi
->dirino
== parent_ino
) {
871 encode(*vi
, outdata
);
879 bool PGLSPlainFilter::filter(const hobject_t
&obj
,
880 bufferlist
& xattr_data
, bufferlist
& outdata
)
882 if (val
.size() != xattr_data
.length())
885 if (memcmp(val
.c_str(), xattr_data
.c_str(), val
.size()))
891 bool PrimaryLogPG::pgls_filter(PGLSFilter
*filter
, hobject_t
& sobj
, bufferlist
& outdata
)
895 // If filter has expressed an interest in an xattr, load it.
896 if (!filter
->get_xattr().empty()) {
897 int ret
= pgbackend
->objects_get_attr(
901 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
->get_xattr() << ") returned " << ret
<< dendl
;
903 if (ret
!= -ENODATA
|| filter
->reject_empty_xattr()) {
909 return filter
->filter(sobj
, bl
, outdata
);
912 int PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator
& iter
, PGLSFilter
**pfilter
)
920 catch (buffer::error
& e
) {
924 if (type
.compare("parent") == 0) {
925 filter
= new PGLSParentFilter(cct
);
926 } else if (type
.compare("plain") == 0) {
927 filter
= new PGLSPlainFilter();
929 std::size_t dot
= type
.find(".");
930 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
934 const std::string class_name
= type
.substr(0, dot
);
935 const std::string filter_name
= type
.substr(dot
+ 1);
936 ClassHandler::ClassData
*cls
= NULL
;
937 int r
= osd
->class_handler
->open_class(class_name
, &cls
);
939 derr
<< "Error opening class '" << class_name
<< "': "
940 << cpp_strerror(r
) << dendl
;
941 if (r
!= -EPERM
) // propogate permission error
948 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
949 if (class_filter
== NULL
) {
950 derr
<< "Error finding filter '" << filter_name
<< "' in class "
951 << class_name
<< dendl
;
954 filter
= class_filter
->fn();
956 // Object classes are obliged to return us something, but let's
957 // give an error rather than asserting out.
958 derr
<< "Buggy class " << class_name
<< " failed to construct "
959 "filter " << filter_name
<< dendl
;
965 int r
= filter
->init(iter
);
967 derr
<< "Error initializing filter " << type
<< ": "
968 << cpp_strerror(r
) << dendl
;
972 // Successfully constructed and initialized, return it.
979 // ==========================================================
981 int PrimaryLogPG::do_command(
992 cmd_getval(cct
, cmdmap
, "format", format
);
993 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json"));
996 cmd_getval(cct
, cmdmap
, "cmd", command
);
997 if (command
== "query") {
998 f
->open_object_section("pg");
999 f
->dump_string("state", pg_state_string(get_state()));
1000 f
->dump_stream("snap_trimq") << snap_trimq
;
1001 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
1002 f
->dump_unsigned("epoch", get_osdmap_epoch());
1003 f
->open_array_section("up");
1004 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
1005 f
->dump_unsigned("osd", *p
);
1007 f
->open_array_section("acting");
1008 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
1009 f
->dump_unsigned("osd", *p
);
1011 if (!backfill_targets
.empty()) {
1012 f
->open_array_section("backfill_targets");
1013 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
1014 p
!= backfill_targets
.end();
1016 f
->dump_stream("shard") << *p
;
1019 if (!async_recovery_targets
.empty()) {
1020 f
->open_array_section("async_recovery_targets");
1021 for (set
<pg_shard_t
>::iterator p
= async_recovery_targets
.begin();
1022 p
!= async_recovery_targets
.end();
1024 f
->dump_stream("shard") << *p
;
1027 if (!acting_recovery_backfill
.empty()) {
1028 f
->open_array_section("acting_recovery_backfill");
1029 for (set
<pg_shard_t
>::iterator p
= acting_recovery_backfill
.begin();
1030 p
!= acting_recovery_backfill
.end();
1032 f
->dump_stream("shard") << *p
;
1035 f
->open_object_section("info");
1036 _update_calc_stats();
1040 f
->open_array_section("peer_info");
1041 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
1042 p
!= peer_info
.end();
1044 f
->open_object_section("info");
1045 f
->dump_stream("peer") << p
->first
;
1046 p
->second
.dump(f
.get());
1051 f
->open_array_section("recovery_state");
1052 handle_query_state(f
.get());
1055 f
->open_object_section("agent_state");
1057 agent_state
->dump(f
.get());
1064 else if (command
== "mark_unfound_lost") {
1066 cmd_getval(cct
, cmdmap
, "mulcmd", mulcmd
);
1068 if (mulcmd
== "revert") {
1069 if (pool
.info
.is_erasure()) {
1070 ss
<< "mode must be 'delete' for ec pool";
1073 mode
= pg_log_entry_t::LOST_REVERT
;
1074 } else if (mulcmd
== "delete") {
1075 mode
= pg_log_entry_t::LOST_DELETE
;
1077 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1080 ceph_assert(mode
== pg_log_entry_t::LOST_REVERT
||
1081 mode
== pg_log_entry_t::LOST_DELETE
);
1083 if (!is_primary()) {
1084 ss
<< "not primary";
1088 uint64_t unfound
= missing_loc
.num_unfound();
1090 ss
<< "pg has no unfound objects";
1091 return 0; // make command idempotent
1094 if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1095 ss
<< "pg has " << unfound
1096 << " unfound objects but we haven't probed all sources, not marking lost";
1100 mark_all_unfound_lost(mode
, con
, tid
);
1103 else if (command
== "list_unfound") {
1106 bool show_offset
= false;
1107 if (cmd_getval(cct
, cmdmap
, "offset", offset_json
)) {
1108 json_spirit::Value v
;
1110 if (!json_spirit::read(offset_json
, v
))
1111 throw std::runtime_error("bad json");
1113 } catch (std::runtime_error
& e
) {
1114 ss
<< "error parsing offset: " << e
.what();
1119 f
->open_object_section("missing");
1121 f
->open_object_section("offset");
1122 offset
.dump(f
.get());
1125 auto &needs_recovery_map
= missing_loc
.get_needs_recovery();
1126 f
->dump_int("num_missing", needs_recovery_map
.size());
1127 f
->dump_int("num_unfound", get_num_unfound());
1128 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1129 needs_recovery_map
.upper_bound(offset
);
1131 f
->open_array_section("objects");
1133 for (; p
!= needs_recovery_map
.end() && num
< cct
->_conf
->osd_command_max_records
; ++p
) {
1134 if (missing_loc
.is_unfound(p
->first
)) {
1135 f
->open_object_section("object");
1137 f
->open_object_section("oid");
1138 p
->first
.dump(f
.get());
1141 p
->second
.dump(f
.get()); // have, need keys
1143 f
->open_array_section("locations");
1144 for (set
<pg_shard_t
>::iterator r
=
1145 missing_loc
.get_locations(p
->first
).begin();
1146 r
!= missing_loc
.get_locations(p
->first
).end();
1148 f
->dump_stream("shard") << *r
;
1157 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1163 ss
<< "unknown pg command " << prefix
;
1167 // ==========================================================
1169 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1171 // NOTE: this is non-const because we modify the OSDOp.outdata in
1173 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1174 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1175 dout(10) << "do_pg_op " << *m
<< dendl
;
1180 string cname
, mname
;
1181 PGLSFilter
*filter
= NULL
;
1182 bufferlist filter_out
;
1184 snapid_t snapid
= m
->get_snapid();
1186 vector
<OSDOp
> ops
= m
->ops
;
1188 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1190 auto bp
= p
->indata
.cbegin();
1192 case CEPH_OSD_OP_PGNLS_FILTER
:
1197 catch (const buffer::error
& e
) {
1198 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1206 result
= get_pgls_filter(bp
, &filter
);
1210 ceph_assert(filter
);
1214 case CEPH_OSD_OP_PGNLS
:
1215 if (snapid
!= CEPH_NOSNAP
) {
1219 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1220 dout(10) << " pgnls pg=" << m
->get_pg()
1221 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1222 << " != " << info
.pgid
<< dendl
;
1225 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1228 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
1230 // read into a buffer
1231 vector
<hobject_t
> sentries
;
1232 pg_nls_response_t response
;
1234 decode(response
.handle
, bp
);
1236 catch (const buffer::error
& e
) {
1237 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1243 hobject_t lower_bound
= response
.handle
;
1244 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1245 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1246 dout(10) << " pgnls lower_bound " << lower_bound
1247 << " pg_end " << pg_end
<< dendl
;
1248 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1249 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1250 // this should only happen with a buggy client.
1251 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1257 hobject_t current
= lower_bound
;
1258 int r
= pgbackend
->objects_list_partial(
1269 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1270 pg_log
.get_missing().get_items().lower_bound(current
);
1271 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1272 hobject_t _max
= hobject_t::get_max();
1274 const hobject_t
&mcand
=
1275 missing_iter
== pg_log
.get_missing().get_items().end() ?
1277 missing_iter
->first
;
1278 const hobject_t
&lcand
=
1279 ls_iter
== sentries
.end() ?
1283 hobject_t candidate
;
1284 if (mcand
== lcand
) {
1286 if (!mcand
.is_max()) {
1290 } else if (mcand
< lcand
) {
1292 ceph_assert(!mcand
.is_max());
1296 ceph_assert(!lcand
.is_max());
1300 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1301 << " vs lower bound 0x" << lower_bound
.get_hash()
1302 << std::dec
<< dendl
;
1304 if (candidate
>= next
) {
1308 if (response
.entries
.size() == list_size
) {
1313 if (candidate
.snap
!= CEPH_NOSNAP
)
1316 // skip internal namespace
1317 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1320 if (missing_loc
.is_deleted(candidate
))
1323 // skip wrong namespace
1324 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1325 candidate
.get_namespace() != m
->get_hobj().nspace
)
1328 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1331 dout(20) << "pgnls item 0x" << std::hex
1332 << candidate
.get_hash()
1333 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1335 << candidate
.oid
.name
<< dendl
;
1337 librados::ListObjectImpl item
;
1338 item
.nspace
= candidate
.get_namespace();
1339 item
.oid
= candidate
.oid
.name
;
1340 item
.locator
= candidate
.get_key();
1341 response
.entries
.push_back(item
);
1344 if (next
.is_max() &&
1345 missing_iter
== pg_log
.get_missing().get_items().end() &&
1346 ls_iter
== sentries
.end()) {
1349 // Set response.handle to the start of the next PG according
1350 // to the object sort order.
1351 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1353 response
.handle
= next
;
1355 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1356 encode(response
, osd_op
.outdata
);
1358 encode(filter_out
, osd_op
.outdata
);
1359 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1360 << osd_op
.outdata
.length() << dendl
;
1364 case CEPH_OSD_OP_PGLS_FILTER
:
1369 catch (const buffer::error
& e
) {
1370 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1378 result
= get_pgls_filter(bp
, &filter
);
1382 ceph_assert(filter
);
1386 case CEPH_OSD_OP_PGLS
:
1387 if (snapid
!= CEPH_NOSNAP
) {
1391 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1392 dout(10) << " pgls pg=" << m
->get_pg()
1393 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1394 << " != " << info
.pgid
<< dendl
;
1397 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1400 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1401 // read into a buffer
1402 vector
<hobject_t
> sentries
;
1403 pg_ls_response_t response
;
1405 decode(response
.handle
, bp
);
1407 catch (const buffer::error
& e
) {
1408 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1414 hobject_t current
= response
.handle
;
1415 int r
= pgbackend
->objects_list_partial(
1426 ceph_assert(snapid
== CEPH_NOSNAP
|| pg_log
.get_missing().get_items().empty());
1428 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1429 pg_log
.get_missing().get_items().lower_bound(current
);
1430 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1431 hobject_t _max
= hobject_t::get_max();
1433 const hobject_t
&mcand
=
1434 missing_iter
== pg_log
.get_missing().get_items().end() ?
1436 missing_iter
->first
;
1437 const hobject_t
&lcand
=
1438 ls_iter
== sentries
.end() ?
1442 hobject_t candidate
;
1443 if (mcand
== lcand
) {
1445 if (!mcand
.is_max()) {
1449 } else if (mcand
< lcand
) {
1451 ceph_assert(!mcand
.is_max());
1455 ceph_assert(!lcand
.is_max());
1459 if (candidate
>= next
) {
1463 if (response
.entries
.size() == list_size
) {
1468 if (candidate
.snap
!= CEPH_NOSNAP
)
1471 // skip wrong namespace
1472 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1475 if (missing_loc
.is_deleted(candidate
))
1478 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1481 response
.entries
.push_back(make_pair(candidate
.oid
,
1482 candidate
.get_key()));
1484 if (next
.is_max() &&
1485 missing_iter
== pg_log
.get_missing().get_items().end() &&
1486 ls_iter
== sentries
.end()) {
1489 response
.handle
= next
;
1490 encode(response
, osd_op
.outdata
);
1492 encode(filter_out
, osd_op
.outdata
);
1493 dout(10) << " pgls result=" << result
<< " outdata.length()="
1494 << osd_op
.outdata
.length() << dendl
;
1498 case CEPH_OSD_OP_PG_HITSET_LS
:
1500 list
< pair
<utime_t
,utime_t
> > ls
;
1501 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1502 p
!= info
.hit_set
.history
.end();
1504 ls
.push_back(make_pair(p
->begin
, p
->end
));
1506 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1507 encode(ls
, osd_op
.outdata
);
1511 case CEPH_OSD_OP_PG_HITSET_GET
:
1513 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1514 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1515 // read the current in-memory HitSet, not the version we've
1521 encode(*hit_set
, osd_op
.outdata
);
1522 result
= osd_op
.outdata
.length();
1524 // read an archived HitSet.
1526 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1527 p
!= info
.hit_set
.history
.end();
1529 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1530 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1534 if (oid
== hobject_t()) {
1538 if (!pool
.info
.is_replicated()) {
1539 // FIXME: EC not supported yet
1540 result
= -EOPNOTSUPP
;
1543 if (is_unreadable_object(oid
)) {
1544 wait_for_unreadable_object(oid
, op
);
1548 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1553 case CEPH_OSD_OP_SCRUBLS
:
1554 result
= do_scrub_ls(m
, &osd_op
);
1567 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(),
1568 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1570 reply
->claim_op_out_data(ops
);
1571 reply
->set_result(result
);
1572 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1573 osd
->send_message_osd_client(reply
, m
->get_connection());
// Handle a CEPH_OSD_OP_SCRUBLS op: return stored scrub errors (snapset or
// per-object) from the scrubber's error store, encoded into the op's outdata.
// Returns 0 on success, -EINVAL on a misdirected or corrupt request,
// -EAGAIN when the caller's interval is stale, -ENOENT when no scrub
// results are available.
int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  // Reject ops that were not actually addressed to this PG.
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.cbegin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    // Client sent a payload we cannot decode; treat as a bad request.
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  // Seed the result with the interval the answer is valid for, so the
  // client can detect when it raced with a map change.
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    // The client asked about a different (older) interval; tell it to retry.
    r = -EAGAIN;
  } else if (!scrubber.store) {
    // No scrub error store exists (no scrub results recorded yet).
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
						  get_pgid().pool(),
						  arg.start_after,
						  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
						    get_pgid().pool(),
						    arg.start_after,
						    arg.max_return);
  }
  encode(result, osd_op->outdata);
  return r;
}
1612 void PrimaryLogPG::calc_trim_to()
1614 size_t target
= cct
->_conf
->osd_min_pg_log_entries
;
1615 if (is_degraded() ||
1616 state_test(PG_STATE_RECOVERING
|
1617 PG_STATE_RECOVERY_WAIT
|
1618 PG_STATE_BACKFILLING
|
1619 PG_STATE_BACKFILL_WAIT
|
1620 PG_STATE_BACKFILL_TOOFULL
)) {
1621 target
= cct
->_conf
->osd_max_pg_log_entries
;
1624 eversion_t limit
= std::min(
1625 min_last_complete_ondisk
,
1626 pg_log
.get_can_rollback_to());
1627 if (limit
!= eversion_t() &&
1628 limit
!= pg_trim_to
&&
1629 pg_log
.get_log().approx_size() > target
) {
1630 size_t num_to_trim
= std::min(pg_log
.get_log().approx_size() - target
,
1631 cct
->_conf
->osd_pg_log_trim_max
);
1632 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
&&
1633 cct
->_conf
->osd_pg_log_trim_max
>= cct
->_conf
->osd_pg_log_trim_min
) {
1636 list
<pg_log_entry_t
>::const_iterator it
= pg_log
.get_log().log
.begin();
1637 eversion_t new_trim_to
;
1638 for (size_t i
= 0; i
< num_to_trim
; ++i
) {
1639 new_trim_to
= it
->version
;
1641 if (new_trim_to
> limit
) {
1642 new_trim_to
= limit
;
1643 dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl
;
1647 dout(10) << "calc_trim_to " << pg_trim_to
<< " -> " << new_trim_to
<< dendl
;
1648 pg_trim_to
= new_trim_to
;
1649 assert(pg_trim_to
<= pg_log
.get_head());
1650 assert(pg_trim_to
<= min_last_complete_ondisk
);
// Aggressive variant of calc_trim_to(): instead of walking num_to_trim
// entries from the tail, it scans the log once from both ends to find
// (a) the newest version that keeps `target` entries (by_n_to_keep) and
// (b) the version reached after trimming num_to_trim entries (by_n_to_trim),
// then trims to the minimum of those and the safety limit.
void PrimaryLogPG::calc_trim_to_aggressive()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
		 PG_STATE_RECOVERY_WAIT |
		 PG_STATE_BACKFILLING |
		 PG_STATE_BACKFILL_WAIT |
		 PG_STATE_BACKFILL_TOOFULL)) {
    // Keep a longer log while peers may still need it for recovery/backfill.
    target = cct->_conf->osd_max_pg_log_entries;
  }
  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  dout(10) << __func__ << " limit = " << limit << dendl;

  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    dout(10) << __func__ << " approx pg log length =  "
             << pg_log.get_log().approx_size() << dendl;
    uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
                                              cct->_conf->osd_pg_log_trim_max);
    dout(10) << __func__ << " num_to_trim =  " << num_to_trim << dendl;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      // Not enough entries to be worth a trim transaction yet.
      return;
    }
    // Scan once with two cursors: `it` from the oldest entry, `rit` from the
    // newest, advancing in lock-step.
    auto it = pg_log.get_log().log.begin(); // oldest log entry
    auto rit = pg_log.get_log().log.rbegin();
    eversion_t by_n_to_keep; // start from tail
    eversion_t by_n_to_trim = eversion_t::max(); // start from head
    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
      i++;
      // Once we have walked `target` entries back from the head, rit points
      // at the newest entry we are allowed to trim.
      if (i > target && by_n_to_keep == eversion_t()) {
        by_n_to_keep = rit->version;
      }
      // Once we have walked num_to_trim entries forward from the tail, it
      // points at the trim boundary by count.
      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
        by_n_to_trim = it->version;
      }
      if (by_n_to_keep != eversion_t() &&
          by_n_to_trim != eversion_t::max()) {
        break;
      }
    }

    if (by_n_to_keep == eversion_t()) {
      // Log is shorter than target; nothing to trim.
      return;
    }

    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
    dout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
    ceph_assert(pg_trim_to <= pg_log.get_head());
  }
}
// Construct a PrimaryLogPG: build the pool-appropriate backend (replicated
// or EC, chosen from the pool info / EC profile), size the object-context
// cache from configuration, and wire the missing_loc readability/
// recoverability predicates to the backend before starting the snap-trimmer
// state machine.
PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
			   const PGPool &_pool,
			   const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  // missing_loc decides object readability/recoverability differently for
  // replicated vs EC pools, so it delegates to backend-supplied predicates.
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}
1730 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1733 if (oloc
.key
.empty())
1734 src_oloc
.key
= oid
.name
;
// Handle a client's backoff ack (MOSDBackoff): clamp the acked range to this
// PG's hobject range and forward the ack to the session's backoff state.
void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
  if (!session)
    return;  // drop it.
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  // Intersect the client's acked [begin, end) with this PG's range, since
  // the message range may span more than this PG covers.
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
	   << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
1756 void PrimaryLogPG::do_request(
1758 ThreadPool::TPHandle
&handle
)
1760 if (op
->osd_trace
) {
1761 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1762 op
->pg_trace
.event("do request");
1764 // make sure we have a new enough map
1765 auto p
= waiting_for_map
.find(op
->get_source());
1766 if (p
!= waiting_for_map
.end()) {
1767 // preserve ordering
1768 dout(20) << __func__
<< " waiting_for_map "
1769 << p
->first
<< " not empty, queueing" << dendl
;
1770 p
->second
.push_back(op
);
1771 op
->mark_delayed("waiting_for_map not empty");
1774 if (!have_same_or_newer_map(op
->min_epoch
)) {
1775 dout(20) << __func__
<< " min " << op
->min_epoch
1776 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1777 waiting_for_map
[op
->get_source()].push_back(op
);
1778 op
->mark_delayed("op must wait for map");
1779 osd
->request_osdmap_update(op
->min_epoch
);
1783 if (can_discard_request(op
)) {
1788 const Message
*m
= op
->get_req();
1789 int msg_type
= m
->get_type();
1790 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1791 SessionRef session
{static_cast<Session
*>(m
->get_connection()->get_priv().get())};
1795 if (msg_type
== CEPH_MSG_OSD_OP
) {
1796 if (session
->check_backoff(cct
, info
.pgid
,
1797 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1804 (!is_active() && is_peered());
1805 if (g_conf()->osd_backoff_on_peering
&& !backoff
) {
1811 add_pg_backoff(session
);
1815 // pg backoff acks at pg-level
1816 if (msg_type
== CEPH_MSG_OSD_BACKOFF
) {
1817 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1818 if (ba
->begin
!= ba
->end
) {
1826 // Delay unless PGBackend says it's ok
1827 if (pgbackend
->can_handle_while_inactive(op
)) {
1828 bool handled
= pgbackend
->handle_message(op
);
1829 ceph_assert(handled
);
1832 waiting_for_peered
.push_back(op
);
1833 op
->mark_delayed("waiting for peered");
1838 if (flushes_in_progress
> 0) {
1839 dout(20) << flushes_in_progress
1840 << " flushes_in_progress pending "
1841 << "waiting for flush on " << op
<< dendl
;
1842 waiting_for_flush
.push_back(op
);
1843 op
->mark_delayed("waiting for flush");
1847 ceph_assert(is_peered() && flushes_in_progress
== 0);
1848 if (pgbackend
->handle_message(op
))
1852 case CEPH_MSG_OSD_OP
:
1853 case CEPH_MSG_OSD_BACKOFF
:
1855 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1856 waiting_for_active
.push_back(op
);
1857 op
->mark_delayed("waiting for active");
1861 case CEPH_MSG_OSD_OP
:
1862 // verify client features
1863 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1864 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1865 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1870 case CEPH_MSG_OSD_BACKOFF
:
1871 // object-level backoff acks handled in osdop context
1877 case MSG_OSD_PG_SCAN
:
1878 do_scan(op
, handle
);
1881 case MSG_OSD_PG_BACKFILL
:
1885 case MSG_OSD_PG_BACKFILL_REMOVE
:
1886 do_backfill_remove(op
);
1889 case MSG_OSD_SCRUB_RESERVE
:
1891 const MOSDScrubReserve
*m
=
1892 static_cast<const MOSDScrubReserve
*>(op
->get_req());
1894 case MOSDScrubReserve::REQUEST
:
1895 handle_scrub_reserve_request(op
);
1897 case MOSDScrubReserve::GRANT
:
1898 handle_scrub_reserve_grant(op
, m
->from
);
1900 case MOSDScrubReserve::REJECT
:
1901 handle_scrub_reserve_reject(op
, m
->from
);
1903 case MOSDScrubReserve::RELEASE
:
1904 handle_scrub_reserve_release(op
);
1910 case MSG_OSD_REP_SCRUB
:
1911 replica_scrub(op
, handle
);
1914 case MSG_OSD_REP_SCRUBMAP
:
1915 do_replica_scrub_map(op
);
1918 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1919 do_update_log_missing(op
);
1922 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1923 do_update_log_missing_reply(op
);
1927 ceph_abort_msg("bad message type in do_request");
// Return the smallest last_backfill among all backfill targets, i.e. the
// earliest object boundary up to which every target has been backfilled.
// Returns hobject_t::get_max() when there are no backfill targets.
hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    // Every backfill target must have peer info.
    ceph_assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}
1946 /** do_op - do an op
1947 * pg lock will be held (if multithreaded)
1948 * osd_lock NOT held.
1950 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1953 // NOTE: take a non-const pointer here; we must be careful not to
1954 // change anything that will break other reads on m (operator<<).
1955 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1956 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1957 if (m
->finish_decode()) {
1958 op
->reset_desc(); // for TrackedOp
1962 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1964 hobject_t head
= m
->get_hobj();
1965 head
.snap
= CEPH_NOSNAP
;
1967 if (!info
.pgid
.pgid
.contains(
1968 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1969 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1970 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1971 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1972 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1974 ceph_assert(!cct
->_conf
->osd_debug_misdirected_ops
);
1979 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
1982 session
= static_cast<Session
*>(m
->get_connection()->get_priv().get());
1983 if (!session
.get()) {
1984 dout(10) << __func__
<< " no session" << dendl
;
1988 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
1993 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
1995 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
1996 osd
->reply_op_error(op
, -EINVAL
);
2000 if (op
->rmw_flags
== 0) {
2001 int r
= osd
->osd
->init_op_flags(op
);
2003 osd
->reply_op_error(op
, r
);
2008 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
2009 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
2011 !(op
->may_write() || op
->may_cache())) {
2012 // balanced reads; any replica will do
2013 if (!(is_primary() || is_replica())) {
2014 osd
->handle_misdirected_op(this, op
);
2018 // normal case; must be primary
2019 if (!is_primary()) {
2020 osd
->handle_misdirected_op(this, op
);
2025 if (!op_has_sufficient_caps(op
)) {
2026 osd
->reply_op_error(op
, -EPERM
);
2030 if (op
->includes_pg_op()) {
2031 return do_pg_op(op
);
2034 // object name too long?
2035 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
2036 dout(4) << "do_op name is longer than "
2037 << cct
->_conf
->osd_max_object_name_len
2038 << " bytes" << dendl
;
2039 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2042 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
2043 dout(4) << "do_op locator is longer than "
2044 << cct
->_conf
->osd_max_object_name_len
2045 << " bytes" << dendl
;
2046 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2049 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
2050 dout(4) << "do_op namespace is longer than "
2051 << cct
->_conf
->osd_max_object_namespace_len
2052 << " bytes" << dendl
;
2053 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2057 if (int r
= osd
->store
->validate_hobject_key(head
)) {
2058 dout(4) << "do_op object " << head
<< " invalid for backing store: "
2060 osd
->reply_op_error(op
, r
);
2065 if (get_osdmap()->is_blacklisted(m
->get_source_addr())) {
2066 dout(10) << "do_op " << m
->get_source_addr() << " is blacklisted" << dendl
;
2067 osd
->reply_op_error(op
, -EBLACKLISTED
);
2071 // order this op as a write?
2072 bool write_ordered
= op
->rwordered();
2074 // discard due to cluster full transition? (we discard any op that
2075 // originates before the cluster or pool is marked full; the client
2076 // will resend after the full flag is removed or if they expect the
2077 // op to succeed despite being full). The except is FULL_FORCE and
2078 // FULL_TRY ops, which there is no reason to discard because they
2079 // bypass all full checks anyway. If this op isn't write or
2080 // read-ordered, we skip.
2081 // FIXME: we exclude mds writes for now.
2082 if (write_ordered
&& !(m
->get_source().is_mds() ||
2083 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
2084 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
2085 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
2086 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
2090 // mds should have stopped writing before this point.
2091 // We can't allow OSD to become non-startable even if mds
2092 // could be writing as part of file removals.
2093 if (write_ordered
&& osd
->check_failsafe_full(get_dpp()) &&
2094 !m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
2095 dout(10) << __func__
<< " fail-safe full check failed, dropping request." << dendl
;
2098 int64_t poolid
= get_pgid().pool();
2099 if (op
->may_write()) {
2101 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
2107 if (m
->get_snapid() != CEPH_NOSNAP
) {
2108 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
2109 osd
->reply_op_error(op
, -EINVAL
);
2114 if (cct
->_conf
->osd_max_write_size
&&
2115 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
2116 // journal can't hold commit!
2117 derr
<< "do_op msg data len " << m
->get_data_len()
2118 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
2119 << " on " << *m
<< dendl
;
2120 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
2125 dout(10) << "do_op " << *m
2126 << (op
->may_write() ? " may_write" : "")
2127 << (op
->may_read() ? " may_read" : "")
2128 << (op
->may_cache() ? " may_cache" : "")
2129 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2130 << " flags " << ceph_osd_flag_string(m
->get_flags())
2134 if (is_unreadable_object(head
)) {
2135 if (!is_primary()) {
2136 osd
->reply_op_error(op
, -EAGAIN
);
2140 (g_conf()->osd_backoff_on_degraded
||
2141 (g_conf()->osd_backoff_on_unfound
&& missing_loc
.is_unfound(head
)))) {
2142 add_backoff(session
, head
, head
);
2143 maybe_kick_recovery(head
);
2145 wait_for_unreadable_object(head
, op
);
2150 if (write_ordered
) {
2152 if (is_degraded_or_backfilling_object(head
)) {
2153 if (can_backoff
&& g_conf()->osd_backoff_on_degraded
) {
2154 add_backoff(session
, head
, head
);
2155 maybe_kick_recovery(head
);
2157 wait_for_degraded_object(head
, op
);
2162 if (scrubber
.is_chunky_scrub_active() && write_blocked_by_scrub(head
)) {
2163 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2164 waiting_for_scrub
.push_back(op
);
2165 op
->mark_delayed("waiting for scrub");
2170 if (auto blocked_iter
= objects_blocked_on_degraded_snap
.find(head
);
2171 blocked_iter
!= std::end(objects_blocked_on_degraded_snap
)) {
2172 hobject_t
to_wait_on(head
);
2173 to_wait_on
.snap
= blocked_iter
->second
;
2174 wait_for_degraded_object(to_wait_on
, op
);
2177 if (auto blocked_snap_promote_iter
= objects_blocked_on_snap_promotion
.find(head
);
2178 blocked_snap_promote_iter
!= std::end(objects_blocked_on_snap_promotion
)) {
2179 wait_for_blocked_object(blocked_snap_promote_iter
->second
->obs
.oi
.soid
, op
);
2182 if (objects_blocked_on_cache_full
.count(head
)) {
2183 block_write_on_full_cache(head
, op
);
2189 if (op
->may_write() || op
->may_cache()) {
2190 // warning: we will get back *a* request for this reqid, but not
2191 // necessarily the most recent. this happens with flush and
2192 // promote ops, but we can't possible have both in our log where
2193 // the original request is still not stable on disk, so for our
2194 // purposes here it doesn't matter which one we get.
2196 version_t user_version
;
2197 int return_code
= 0;
2198 bool got
= check_in_progress_op(
2199 m
->get_reqid(), &version
, &user_version
, &return_code
);
2201 dout(3) << __func__
<< " dup " << m
->get_reqid()
2202 << " version " << version
<< dendl
;
2203 if (already_complete(version
)) {
2204 osd
->reply_op_error(op
, return_code
, version
, user_version
);
2206 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2207 // always queue ondisk waiters, so that we can requeue if needed
2208 waiting_for_ondisk
[version
].emplace_back(op
, user_version
, return_code
);
2209 op
->mark_delayed("waiting for ondisk");
2215 ObjectContextRef obc
;
2216 bool can_create
= op
->may_write();
2217 hobject_t missing_oid
;
2219 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2220 hobject_t _oid_head
;
2221 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2222 _oid_head
= m
->get_hobj().get_head();
2224 const hobject_t
& oid
=
2225 m
->get_snapid() == CEPH_SNAPDIR
? _oid_head
: m
->get_hobj();
2227 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2228 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2231 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
) {
2232 if (m
->get_snapid() != CEPH_SNAPDIR
) {
2233 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2234 osd
->reply_op_error(op
, -EINVAL
);
2238 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2239 dout(10) << "non-LIST_SNAPS on snapdir" << dendl
;
2240 osd
->reply_op_error(op
, -EINVAL
);
2246 // io blocked on obc?
2247 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2248 maybe_await_blocked_head(oid
, op
)) {
2252 int r
= find_object_context(
2253 oid
, &obc
, can_create
,
2254 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2257 // LIST_SNAPS needs the ssc too
2259 m
->get_snapid() == CEPH_SNAPDIR
&&
2261 obc
->ssc
= get_snapset_context(oid
, true);
2265 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2266 // we have to wait for the object.
2268 // missing the specific snap we need; requeue and wait.
2269 ceph_assert(!op
->may_write()); // only happens on a read/cache
2270 wait_for_unreadable_object(missing_oid
, op
);
2273 } else if (r
== 0) {
2274 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2275 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2276 << " is unreadable, waiting" << dendl
;
2277 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2281 // degraded object? (the check above was for head; this could be a clone)
2282 if (write_ordered
&&
2283 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2284 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2285 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2286 << " is degraded, waiting" << dendl
;
2287 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2292 bool in_hit_set
= false;
2295 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2298 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2301 if (!op
->hitset_inserted
) {
2302 hit_set
->insert(oid
);
2303 op
->hitset_inserted
= true;
2304 if (hit_set
->is_full() ||
2305 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2312 if (agent_choose_mode(false, op
))
2316 if (obc
.get() && obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2317 if (maybe_handle_manifest(op
,
2323 if (maybe_handle_cache(op
,
2332 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2333 // copy the reqids for copy get on ENOENT
2335 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2336 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2339 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2340 if (op
->may_write() &&
2341 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2342 record_write_error(op
, oid
, nullptr, r
);
2344 osd
->reply_op_error(op
, r
);
2349 // make sure locator is consistent
2350 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2351 if (m
->get_object_locator() != oloc
) {
2352 dout(10) << " provided locator " << m
->get_object_locator()
2353 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2354 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2355 << " on object " << oloc
2359 // io blocked on obc?
2360 if (obc
->is_blocked() &&
2361 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2362 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2366 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2368 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2370 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2371 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2372 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2373 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2375 // verify there is in fact a flush in progress
2376 // FIXME: we could make this a stronger test.
2377 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2378 if (p
== flush_ops
.end()) {
2379 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2380 reply_ctx(ctx
, -EINVAL
);
2383 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2384 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2385 op
->mark_delayed("waiting for rw locks");
2389 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2392 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2394 if (op
->may_write() &&
2395 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2396 record_write_error(op
, oid
, nullptr, r
);
2398 osd
->reply_op_error(op
, r
);
2403 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2404 ctx
->ignore_cache
= true;
2407 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2408 // This object is lost. Reading from it returns an error.
2409 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2410 << " is lost" << dendl
;
2411 reply_ctx(ctx
, -ENFILE
);
2414 if (!op
->may_write() &&
2416 (!obc
->obs
.exists
||
2417 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2418 obc
->obs
.oi
.is_whiteout()))) {
2419 // copy the reqids for copy get on ENOENT
2420 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2421 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2425 reply_ctx(ctx
, -ENOENT
);
2432 utime_t prepare_latency
= ceph_clock_now();
2433 prepare_latency
-= op
->get_dequeued_time();
2434 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2435 if (op
->may_read() && op
->may_write()) {
2436 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2437 } else if (op
->may_read()) {
2438 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2439 } else if (op
->may_write() || op
->may_cache()) {
2440 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2443 // force recovery of the oldest missing object if too many logs
2444 maybe_force_recovery();
2447 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2450 ObjectContextRef obc
)
2453 if (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2454 CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2455 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2456 return cache_result_t::NOOP
;
2459 // if it is write-ordered and blocked, stop now
2460 if (obc
->is_blocked() && write_ordered
) {
2461 // we're already doing something with this object
2462 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2463 return cache_result_t::NOOP
;
2466 vector
<OSDOp
> ops
= static_cast<const MOSDOp
*>(op
->get_req())->ops
;
2467 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2469 ceph_osd_op
& op
= osd_op
.op
;
2470 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
||
2471 op
.op
== CEPH_OSD_OP_SET_CHUNK
||
2472 op
.op
== CEPH_OSD_OP_TIER_PROMOTE
||
2473 op
.op
== CEPH_OSD_OP_UNSET_MANIFEST
) {
2474 return cache_result_t::NOOP
;
2478 switch (obc
->obs
.oi
.manifest
.type
) {
2479 case object_manifest_t::TYPE_REDIRECT
:
2480 if (op
->may_write() || write_ordered
) {
2481 do_proxy_write(op
, obc
);
2484 if (obc
->obs
.oi
.size
!= 0) {
2485 return cache_result_t::NOOP
;
2487 do_proxy_read(op
, obc
);
2489 return cache_result_t::HANDLED_PROXY
;
2490 case object_manifest_t::TYPE_CHUNKED
:
2492 if (can_proxy_chunked_read(op
, obc
)) {
2493 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2494 if (p
!= flush_ops
.end()) {
2495 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, true);
2496 return cache_result_t::HANDLED_PROXY
;
2498 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, write_ordered
);
2499 return cache_result_t::HANDLED_PROXY
;
2502 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2503 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
2504 hobject_t head
= m
->get_hobj();
2506 if (is_degraded_or_backfilling_object(head
)) {
2507 dout(20) << __func__
<< ": " << head
<< " is degraded, waiting" << dendl
;
2508 wait_for_degraded_object(head
, op
);
2509 return cache_result_t::BLOCKED_RECOVERY
;
2512 if (write_blocked_by_scrub(head
)) {
2513 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2514 waiting_for_scrub
.push_back(op
);
2515 op
->mark_delayed("waiting for scrub");
2516 return cache_result_t::BLOCKED_RECOVERY
;
2519 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2520 if (p
.second
.is_missing()) {
2521 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2522 const object_locator_t oloc
= m
->get_object_locator();
2523 promote_object(obc
, obc
->obs
.oi
.soid
, oloc
, op
, NULL
);
2524 return cache_result_t::BLOCKED_PROMOTE
;
2528 bool all_dirty
= true;
2529 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2530 if (!p
.second
.is_dirty()) {
2535 start_flush(OpRequestRef(), obc
, true, NULL
, boost::none
);
2537 return cache_result_t::NOOP
;
2540 ceph_abort_msg("unrecognized manifest type");
2543 return cache_result_t::NOOP
;
2546 struct C_ManifestFlush
: public Context
{
2553 uint64_t last_offset
;
2554 C_ManifestFlush(PrimaryLogPG
*p
, hobject_t o
, epoch_t e
)
2555 : pg(p
), oid(o
), lpr(e
),
2556 tid(0), start(ceph_clock_now())
2558 void finish(int r
) override
{
2559 if (r
== -ECANCELED
)
2562 pg
->handle_manifest_flush(oid
, tid
, r
, offset
, last_offset
, lpr
);
2563 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
2568 void PrimaryLogPG::handle_manifest_flush(hobject_t oid
, ceph_tid_t tid
, int r
,
2569 uint64_t offset
, uint64_t last_offset
,
2572 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
2573 if (p
== flush_ops
.end()) {
2574 dout(10) << __func__
<< " no flush_op found" << dendl
;
2577 if (p
->second
->rval
< 0) {
2580 p
->second
->io_results
[offset
] = r
;
2581 for (auto &ior
: p
->second
->io_results
) {
2582 if (ior
.second
< 0) {
2583 finish_manifest_flush(oid
, tid
, r
, p
->second
->obc
, last_offset
);
2584 p
->second
->rval
= r
;
2588 if (p
->second
->chunks
== p
->second
->io_results
.size()) {
2589 if (lpr
== get_last_peering_reset()) {
2590 ceph_assert(p
->second
->obc
);
2591 finish_manifest_flush(oid
, tid
, r
, p
->second
->obc
, last_offset
);
2596 int PrimaryLogPG::start_manifest_flush(OpRequestRef op
, ObjectContextRef obc
, bool blocking
,
2597 boost::optional
<std::function
<void()>> &&on_flush
)
2599 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
2600 FlushOpRef
manifest_fop(std::make_shared
<FlushOp
>());
2601 manifest_fop
->op
= op
;
2602 manifest_fop
->obc
= obc
;
2603 manifest_fop
->flushed_version
= obc
->obs
.oi
.user_version
;
2604 manifest_fop
->blocking
= blocking
;
2605 manifest_fop
->on_flush
= std::move(on_flush
);
2606 int r
= do_manifest_flush(op
, obc
, manifest_fop
, p
->first
, blocking
);
2611 flush_ops
[obc
->obs
.oi
.soid
] = manifest_fop
;
2612 return -EINPROGRESS
;
2615 int PrimaryLogPG::do_manifest_flush(OpRequestRef op
, ObjectContextRef obc
, FlushOpRef manifest_fop
,
2616 uint64_t start_offset
, bool block
)
2618 struct object_manifest_t
&manifest
= obc
->obs
.oi
.manifest
;
2619 hobject_t soid
= obc
->obs
.oi
.soid
;
2622 uint64_t max_copy_size
= 0, last_offset
= 0;
2624 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
.chunk_map
.find(start_offset
);
2625 ceph_assert(iter
!= manifest
.chunk_map
.end());
2626 for (;iter
!= manifest
.chunk_map
.end(); ++iter
) {
2627 if (iter
->second
.is_dirty()) {
2628 last_offset
= iter
->first
;
2629 max_copy_size
+= iter
->second
.length
;
2631 if (get_copy_chunk_size() < max_copy_size
) {
2636 iter
= manifest
.chunk_map
.find(start_offset
);
2637 for (;iter
!= manifest
.chunk_map
.end(); ++iter
) {
2638 if (!iter
->second
.is_dirty()) {
2641 uint64_t tgt_length
= iter
->second
.length
;
2642 uint64_t tgt_offset
= iter
->second
.offset
;
2643 hobject_t tgt_soid
= iter
->second
.oid
;
2644 object_locator_t
oloc(tgt_soid
);
2645 ObjectOperation obj_op
;
2646 bufferlist chunk_data
;
2647 int r
= pgbackend
->objects_read_sync(
2648 soid
, iter
->first
, tgt_length
, 0, &chunk_data
);
2650 dout(0) << __func__
<< " read fail " << " offset: " << tgt_offset
2651 << " len: " << tgt_length
<< " r: " << r
<< dendl
;
2654 if (!chunk_data
.length()) {
2658 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
|
2659 CEPH_OSD_FLAG_RWORDERED
;
2660 tgt_length
= chunk_data
.length();
2661 pg_pool_t::fingerprint_t fp_algo_t
= pool
.info
.get_fingerprint_type();
2662 if (iter
->second
.has_reference() &&
2663 fp_algo_t
!= pg_pool_t::TYPE_FINGERPRINT_NONE
) {
2664 switch (fp_algo_t
) {
2665 case pg_pool_t::TYPE_FINGERPRINT_SHA1
:
2667 sha1_digest_t sha1r
= chunk_data
.sha1();
2668 object_t fp_oid
= sha1r
.to_str();
2670 if (fp_oid
!= tgt_soid
.oid
) {
2671 // decrement old chunk's reference count
2672 ObjectOperation dec_op
;
2673 cls_chunk_refcount_put_op put_call
;
2674 ::encode(put_call
, in
);
2675 dec_op
.call("refcount", "chunk_put", in
);
2676 // we don't care dec_op's completion. scrub for dedup will fix this.
2677 tid
= osd
->objecter
->mutate(
2678 tgt_soid
.oid
, oloc
, dec_op
, snapc
,
2679 ceph::real_clock::from_ceph_timespec(obc
->obs
.oi
.mtime
),
2683 tgt_soid
.oid
= fp_oid
;
2684 iter
->second
.oid
= tgt_soid
;
2687 osd_op
.extent
.offset
= 0;
2688 osd_op
.extent
.length
= chunk_data
.length();
2691 in
.append(chunk_data
);
2692 obj_op
.call("cas", "cas_write_or_get", in
);
2696 assert(0 == "unrecognized fingerprint type");
2700 obj_op
.add_data(CEPH_OSD_OP_WRITE
, tgt_offset
, tgt_length
, chunk_data
);
2703 C_ManifestFlush
*fin
= new C_ManifestFlush(this, soid
, get_last_peering_reset());
2704 fin
->offset
= iter
->first
;
2705 fin
->last_offset
= last_offset
;
2706 manifest_fop
->chunks
++;
2708 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
2709 tid
= osd
->objecter
->mutate(
2710 tgt_soid
.oid
, oloc
, obj_op
, snapc
,
2711 ceph::real_clock::from_ceph_timespec(obc
->obs
.oi
.mtime
),
2712 flags
, new C_OnFinisher(fin
, osd
->objecter_finishers
[n
]));
2714 manifest_fop
->io_tids
[iter
->first
] = tid
;
2716 dout(20) << __func__
<< " offset: " << tgt_offset
<< " len: " << tgt_length
2717 << " oid: " << tgt_soid
.oid
<< " ori oid: " << soid
.oid
.name
2718 << " tid: " << tid
<< dendl
;
2719 if (last_offset
< iter
->first
) {
2727 void PrimaryLogPG::finish_manifest_flush(hobject_t oid
, ceph_tid_t tid
, int r
,
2728 ObjectContextRef obc
, uint64_t last_offset
)
2730 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2731 << " " << cpp_strerror(r
) << " last_offset: " << last_offset
<< dendl
;
2732 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
2733 if (p
== flush_ops
.end()) {
2734 dout(10) << __func__
<< " no flush_op found" << dendl
;
2737 map
<uint64_t, chunk_info_t
>::iterator iter
=
2738 obc
->obs
.oi
.manifest
.chunk_map
.find(last_offset
);
2739 ceph_assert(iter
!= obc
->obs
.oi
.manifest
.chunk_map
.end());
2740 for (;iter
!= obc
->obs
.oi
.manifest
.chunk_map
.end(); ++iter
) {
2741 if (iter
->second
.is_dirty() && last_offset
< iter
->first
) {
2742 do_manifest_flush(p
->second
->op
, obc
, p
->second
, iter
->first
, p
->second
->blocking
);
2746 finish_flush(oid
, tid
, r
);
2749 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2750 MOSDOpReply
*orig_reply
, int r
)
2752 dout(20) << __func__
<< " r=" << r
<< dendl
;
2753 ceph_assert(op
->may_write());
2754 const osd_reqid_t
&reqid
= static_cast<const MOSDOp
*>(op
->get_req())->get_reqid();
2755 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2756 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2757 get_next_version(), eversion_t(), 0,
2758 reqid
, utime_t(), r
));
2763 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2768 MOSDOpReply
*orig_reply
,
2771 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2774 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2775 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2776 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
2777 MOSDOpReply
*reply
= orig_reply
.detach();
2778 if (reply
== nullptr) {
2779 reply
= new MOSDOpReply(m
, r
, pg
->get_osdmap_epoch(),
2782 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2783 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2787 ObcLockManager lock_manager
;
2790 std::move(lock_manager
),
2791 boost::optional
<std::function
<void(void)> >(
2792 OnComplete(this, op
, orig_reply
, r
)),
2797 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2800 ObjectContextRef obc
,
2801 int r
, hobject_t missing_oid
,
2804 ObjectContextRef
*promote_obc
)
2806 // return quickly if caching is not enabled
2807 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2808 return cache_result_t::NOOP
;
2812 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2813 (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2814 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2815 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2816 return cache_result_t::NOOP
;
2819 must_promote
= must_promote
|| op
->need_promote();
2822 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2823 << (obc
->obs
.exists
? "exists" : "DNE")
2824 << " missing_oid " << missing_oid
2825 << " must_promote " << (int)must_promote
2826 << " in_hit_set " << (int)in_hit_set
2829 dout(25) << __func__
<< " (no obc)"
2830 << " missing_oid " << missing_oid
2831 << " must_promote " << (int)must_promote
2832 << " in_hit_set " << (int)in_hit_set
2835 // if it is write-ordered and blocked, stop now
2836 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2837 // we're already doing something with this object
2838 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2839 return cache_result_t::NOOP
;
2842 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2843 // we know this object is logically absent (e.g., an undefined clone)
2844 return cache_result_t::NOOP
;
2847 if (obc
.get() && obc
->obs
.exists
) {
2848 osd
->logger
->inc(l_osd_op_cache_hit
);
2849 return cache_result_t::NOOP
;
2851 if (!is_primary()) {
2852 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2853 osd
->reply_op_error(op
, -EAGAIN
);
2854 return cache_result_t::REPLIED_WITH_EAGAIN
;
2857 if (missing_oid
== hobject_t() && obc
.get()) {
2858 missing_oid
= obc
->obs
.oi
.soid
;
2861 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2862 const object_locator_t oloc
= m
->get_object_locator();
2864 if (op
->need_skip_handle_cache()) {
2865 return cache_result_t::NOOP
;
2868 OpRequestRef promote_op
;
2870 switch (pool
.info
.cache_mode
) {
2871 case pg_pool_t::CACHEMODE_WRITEBACK
:
2873 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2874 if (!op
->may_write() && !op
->may_cache() &&
2875 !write_ordered
&& !must_promote
) {
2876 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2878 return cache_result_t::HANDLED_PROXY
;
2880 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2881 block_write_on_full_cache(missing_oid
, op
);
2882 return cache_result_t::BLOCKED_FULL
;
2885 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2886 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2887 return cache_result_t::BLOCKED_PROMOTE
;
2890 if (op
->may_write() || op
->may_cache()) {
2894 if (!op
->need_skip_promote() &&
2895 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2896 pool
.info
.min_write_recency_for_promote
,
2899 return cache_result_t::BLOCKED_PROMOTE
;
2901 return cache_result_t::HANDLED_PROXY
;
2905 // Avoid duplicate promotion
2906 if (obc
.get() && obc
->is_blocked()) {
2909 return cache_result_t::BLOCKED_PROMOTE
;
2913 if (!op
->need_skip_promote()) {
2914 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2915 pool
.info
.min_read_recency_for_promote
,
2916 promote_op
, promote_obc
);
2919 return cache_result_t::HANDLED_PROXY
;
2921 ceph_abort_msg("unreachable");
2922 return cache_result_t::NOOP
;
2924 case pg_pool_t::CACHEMODE_FORWARD
:
2925 // FIXME: this mode allows requests to be reordered.
2926 do_cache_redirect(op
);
2927 return cache_result_t::HANDLED_REDIRECT
;
2929 case pg_pool_t::CACHEMODE_READONLY
:
2930 // TODO: clean this case up
2931 if (!obc
.get() && r
== -ENOENT
) {
2932 // we don't have the object and op's a read
2933 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2934 return cache_result_t::BLOCKED_PROMOTE
;
2936 if (!r
) { // it must be a write
2937 do_cache_redirect(op
);
2938 return cache_result_t::HANDLED_REDIRECT
;
2940 // crap, there was a failure of some kind
2941 return cache_result_t::NOOP
;
2943 case pg_pool_t::CACHEMODE_READFORWARD
:
2944 // Do writeback to the cache tier for writes
2945 if (op
->may_write() || write_ordered
|| must_promote
) {
2947 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2948 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2949 block_write_on_full_cache(missing_oid
, op
);
2950 return cache_result_t::BLOCKED_FULL
;
2952 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2953 return cache_result_t::BLOCKED_PROMOTE
;
2956 // If it is a read, we can read, we need to forward it
2957 do_cache_redirect(op
);
2958 return cache_result_t::HANDLED_REDIRECT
;
2960 case pg_pool_t::CACHEMODE_PROXY
:
2961 if (!must_promote
) {
2962 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2964 return cache_result_t::HANDLED_PROXY
;
2967 return cache_result_t::HANDLED_PROXY
;
2970 // ugh, we're forced to promote.
2972 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2973 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2974 block_write_on_full_cache(missing_oid
, op
);
2975 return cache_result_t::BLOCKED_FULL
;
2977 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2978 return cache_result_t::BLOCKED_PROMOTE
;
2980 case pg_pool_t::CACHEMODE_READPROXY
:
2981 // Do writeback to the cache tier for writes
2982 if (op
->may_write() || write_ordered
|| must_promote
) {
2984 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2985 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2986 block_write_on_full_cache(missing_oid
, op
);
2987 return cache_result_t::BLOCKED_FULL
;
2989 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2990 return cache_result_t::BLOCKED_PROMOTE
;
2993 // If it is a read, we can read, we need to proxy it
2995 return cache_result_t::HANDLED_PROXY
;
2998 ceph_abort_msg("unrecognized cache_mode");
3000 return cache_result_t::NOOP
;
3003 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
3004 const hobject_t
& missing_oid
,
3005 const object_locator_t
& oloc
,
3008 OpRequestRef promote_op
,
3009 ObjectContextRef
*promote_obc
)
3011 dout(20) << __func__
<< " missing_oid " << missing_oid
3012 << " in_hit_set " << in_hit_set
<< dendl
;
3018 // Check if in the current hit set
3028 unsigned count
= (int)in_hit_set
;
3030 // Check if in other hit sets
3031 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
3032 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
3033 agent_state
->hit_set_map
.rbegin();
3034 itor
!= agent_state
->hit_set_map
.rend();
3036 if (!itor
->second
->contains(oid
)) {
3040 if (count
>= recency
) {
3045 if (count
>= recency
) {
3048 return false; // not promoting
3053 if (osd
->promote_throttle()) {
3054 dout(10) << __func__
<< " promote throttled" << dendl
;
3057 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
3061 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
3063 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3064 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
3065 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
, get_osdmap_epoch(),
3067 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
3068 reply
->set_redirect(redir
);
3069 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
3071 m
->get_connection()->send_message(reply
);
3075 struct C_ProxyRead
: public Context
{
3078 epoch_t last_peering_reset
;
3080 PrimaryLogPG::ProxyReadOpRef prdop
;
3082 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3083 const PrimaryLogPG::ProxyReadOpRef
& prd
)
3084 : pg(p
), oid(o
), last_peering_reset(lpr
),
3085 tid(0), prdop(prd
), start(ceph_clock_now())
3087 void finish(int r
) override
{
3088 if (prdop
->canceled
)
3091 if (prdop
->canceled
) {
3095 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3096 pg
->finish_proxy_read(oid
, tid
, r
);
3097 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
3103 struct C_ProxyChunkRead
: public Context
{
3106 epoch_t last_peering_reset
;
3108 PrimaryLogPG::ProxyReadOpRef prdop
;
3110 ObjectOperation
*obj_op
;
3112 uint64_t req_offset
= 0;
3113 ObjectContextRef obc
;
3114 uint64_t req_total_len
= 0;
3115 C_ProxyChunkRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3116 const PrimaryLogPG::ProxyReadOpRef
& prd
)
3117 : pg(p
), oid(o
), last_peering_reset(lpr
),
3118 tid(0), prdop(prd
), start(ceph_clock_now()), obj_op(NULL
)
3120 void finish(int r
) override
{
3121 if (prdop
->canceled
)
3124 if (prdop
->canceled
) {
3128 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3130 if (!prdop
->ops
[op_index
].outdata
.length()) {
3131 ceph_assert(req_total_len
);
3133 bufferptr
bptr(req_total_len
);
3134 list
.push_back(std::move(bptr
));
3135 prdop
->ops
[op_index
].outdata
.append(list
);
3137 ceph_assert(obj_op
);
3138 uint64_t copy_offset
;
3139 if (req_offset
>= prdop
->ops
[op_index
].op
.extent
.offset
) {
3140 copy_offset
= req_offset
- prdop
->ops
[op_index
].op
.extent
.offset
;
3144 prdop
->ops
[op_index
].outdata
.copy_in(copy_offset
, obj_op
->ops
[0].outdata
.length(),
3145 obj_op
->ops
[0].outdata
.c_str());
3148 pg
->finish_proxy_read(oid
, tid
, r
);
3149 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
3158 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
3160 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3161 // stash the result in the request's OSDOp vector
3162 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3163 object_locator_t oloc
;
3165 /* extensible tier */
3166 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3167 switch (obc
->obs
.oi
.manifest
.type
) {
3168 case object_manifest_t::TYPE_REDIRECT
:
3169 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3170 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3173 ceph_abort_msg("unrecognized manifest type");
3177 soid
= m
->get_hobj();
3178 oloc
= object_locator_t(m
->get_object_locator());
3179 oloc
.pool
= pool
.info
.tier_of
;
3181 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3183 // pass through some original flags that make sense.
3184 // - leave out redirection and balancing flags since we are
3185 // already proxying through the primary
3186 // - leave off read/write/exec flags that are derived from the op
3187 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3188 CEPH_OSD_FLAG_ORDERSNAP
|
3189 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3190 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3192 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
3194 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
3196 ObjectOperation obj_op
;
3197 obj_op
.dup(prdop
->ops
);
3199 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
3200 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
3201 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
3202 ceph_osd_op op
= obj_op
.ops
[i
].op
;
3204 case CEPH_OSD_OP_READ
:
3205 case CEPH_OSD_OP_SYNC_READ
:
3206 case CEPH_OSD_OP_SPARSE_READ
:
3207 case CEPH_OSD_OP_CHECKSUM
:
3208 case CEPH_OSD_OP_CMPEXT
:
3209 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
3210 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
3215 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
3217 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
3218 ceph_tid_t tid
= osd
->objecter
->read(
3219 soid
.oid
, oloc
, obj_op
,
3220 m
->get_snapid(), NULL
,
3221 flags
, new C_OnFinisher(fin
, osd
->objecter_finishers
[n
]),
3222 &prdop
->user_version
,
3223 &prdop
->data_offset
,
3226 prdop
->objecter_tid
= tid
;
3227 proxyread_ops
[tid
] = prdop
;
3228 in_progress_proxy_ops
[soid
].push_back(op
);
3231 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
3233 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3234 << " " << cpp_strerror(r
) << dendl
;
3236 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
3237 if (p
== proxyread_ops
.end()) {
3238 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
3241 ProxyReadOpRef prdop
= p
->second
;
3242 if (tid
!= prdop
->objecter_tid
) {
3243 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
3244 << " tid " << prdop
->objecter_tid
<< dendl
;
3247 if (oid
!= prdop
->soid
) {
3248 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
3249 << " soid " << prdop
->soid
<< dendl
;
3252 proxyread_ops
.erase(tid
);
3254 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
3255 if (q
== in_progress_proxy_ops
.end()) {
3256 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3259 ceph_assert(q
->second
.size());
3260 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
3263 ceph_assert(it
!= q
->second
.end());
3264 OpRequestRef op
= *it
;
3265 q
->second
.erase(it
);
3266 if (q
->second
.size() == 0) {
3267 in_progress_proxy_ops
.erase(oid
);
3268 } else if (std::find(q
->second
.begin(),
3270 prdop
->op
) != q
->second
.end()) {
3271 /* multiple read case */
3272 dout(20) << __func__
<< " " << oid
<< " is not completed " << dendl
;
3276 osd
->logger
->inc(l_osd_tier_proxy_read
);
3278 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3279 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
3280 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3281 ctx
->user_at_version
= prdop
->user_version
;
3282 ctx
->data_off
= prdop
->data_offset
;
3283 ctx
->ignore_log_op_stats
= true;
3284 complete_read_ctx(r
, ctx
);
3287 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
3289 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
3290 if (p
== in_progress_proxy_ops
.end())
3293 list
<OpRequestRef
>& ls
= p
->second
;
3294 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
3296 in_progress_proxy_ops
.erase(p
);
3299 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
3300 vector
<ceph_tid_t
> *tids
)
3302 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
3303 prdop
->canceled
= true;
3305 // cancel objecter op, if we can
3306 if (prdop
->objecter_tid
) {
3307 tids
->push_back(prdop
->objecter_tid
);
3308 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
3309 prdop
->ops
[i
].outdata
.clear();
3311 proxyread_ops
.erase(prdop
->objecter_tid
);
3312 prdop
->objecter_tid
= 0;
3316 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3318 dout(10) << __func__
<< dendl
;
3320 // cancel proxy reads
3321 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
3322 while (p
!= proxyread_ops
.end()) {
3323 cancel_proxy_read((p
++)->second
, tids
);
3326 // cancel proxy writes
3327 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
3328 while (q
!= proxywrite_ops
.end()) {
3329 cancel_proxy_write((q
++)->second
, tids
);
3333 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
3334 in_progress_proxy_ops
.begin();
3335 while (p
!= in_progress_proxy_ops
.end()) {
3336 list
<OpRequestRef
>& ls
= p
->second
;
3337 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
3338 << " requests" << dendl
;
3340 in_progress_proxy_ops
.erase(p
++);
3343 in_progress_proxy_ops
.clear();
3347 struct C_ProxyWrite_Commit
: public Context
{
3350 epoch_t last_peering_reset
;
3352 PrimaryLogPG::ProxyWriteOpRef pwop
;
3353 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3354 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
3355 : pg(p
), oid(o
), last_peering_reset(lpr
),
3358 void finish(int r
) override
{
3362 if (pwop
->canceled
) {
3366 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3367 pg
->finish_proxy_write(oid
, tid
, r
);
3373 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, ObjectContextRef obc
)
3375 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3376 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3377 object_locator_t oloc
;
3378 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
3380 /* extensible tier */
3381 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3382 switch (obc
->obs
.oi
.manifest
.type
) {
3383 case object_manifest_t::TYPE_REDIRECT
:
3384 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3385 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3388 ceph_abort_msg("unrecognized manifest type");
3392 soid
= m
->get_hobj();
3393 oloc
= object_locator_t(m
->get_object_locator());
3394 oloc
.pool
= pool
.info
.tier_of
;
3397 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3398 if (!(op
->may_write() || op
->may_cache())) {
3399 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3401 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3403 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3404 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3405 pwop
->mtime
= m
->get_mtime();
3407 ObjectOperation obj_op
;
3408 obj_op
.dup(pwop
->ops
);
3410 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3411 this, soid
, get_last_peering_reset(), pwop
);
3412 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
3413 ceph_tid_t tid
= osd
->objecter
->mutate(
3414 soid
.oid
, oloc
, obj_op
, snapc
,
3415 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3416 flags
, new C_OnFinisher(fin
, osd
->objecter_finishers
[n
]),
3417 &pwop
->user_version
, pwop
->reqid
);
3419 pwop
->objecter_tid
= tid
;
3420 proxywrite_ops
[tid
] = pwop
;
3421 in_progress_proxy_ops
[soid
].push_back(op
);
3424 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op
, const hobject_t
& missing_oid
,
3425 ObjectContextRef obc
, bool write_ordered
)
3427 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3428 OSDOp
*osd_op
= NULL
;
3429 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3430 osd_op
= &m
->ops
[i
];
3431 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3432 uint64_t op_length
= osd_op
->op
.extent
.offset
+ osd_op
->op
.extent
.length
;
3433 uint64_t chunk_length
= 0, chunk_index
= 0, req_len
= 0;
3434 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3435 map
<uint64_t, map
<uint64_t, uint64_t>> chunk_read
;
3437 while (cursor
< op_length
) {
3440 /* find the right chunk position for cursor */
3441 for (auto &p
: manifest
->chunk_map
) {
3442 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3443 chunk_length
= p
.second
.length
;
3444 chunk_index
= p
.first
;
3449 if (!chunk_index
&& !chunk_length
) {
3450 if (cursor
== osd_op
->op
.extent
.offset
) {
3451 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, this);
3452 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3453 ctx
->data_off
= osd_op
->op
.extent
.offset
;
3454 ctx
->ignore_log_op_stats
= true;
3455 complete_read_ctx(0, ctx
);
3459 uint64_t next_length
= chunk_length
;
3460 /* the size to read -> | op length | */
3462 if (cursor
+ next_length
> op_length
) {
3463 next_length
= op_length
- cursor
;
3465 /* the size to read -> | op length | */
3467 if (cursor
+ next_length
> chunk_index
+ chunk_length
) {
3468 next_length
= chunk_index
+ chunk_length
- cursor
;
3471 chunk_read
[cursor
] = {{chunk_index
, next_length
}};
3472 cursor
+= next_length
;
3475 req_len
= cursor
- osd_op
->op
.extent
.offset
;
3476 for (auto &p
: chunk_read
) {
3477 auto chunks
= p
.second
.begin();
3478 dout(20) << __func__
<< " chunk_index: " << chunks
->first
3479 << " next_length: " << chunks
->second
<< " cursor: "
3480 << p
.first
<< dendl
;
3481 do_proxy_chunked_read(op
, obc
, i
, chunks
->first
, p
.first
, chunks
->second
, req_len
, write_ordered
);
3486 struct RefCountCallback
: public Context
{
3489 PrimaryLogPG::OpContext
*ctx
;
3491 epoch_t last_peering_reset
;
3493 RefCountCallback(PrimaryLogPG
*pg
, PrimaryLogPG::OpContext
*ctx
,
3494 OSDOp
&osd_op
, epoch_t lpr
)
3495 : pg(pg
), ctx(ctx
), osd_op(osd_op
), last_peering_reset(lpr
)
3497 void finish(int r
) override
{
3499 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3502 pg
->execute_ctx(ctx
);
3505 pg
->osd
->reply_op_error(ctx
->op
, r
);
3507 pg
->close_op_ctx(ctx
);
3514 struct SetManifestFinisher
: public PrimaryLogPG::OpFinisher
{
3517 explicit SetManifestFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
3520 int execute() override
{
3525 void PrimaryLogPG::refcount_manifest(ObjectContextRef obc
, object_locator_t oloc
, hobject_t soid
,
3526 SnapContext snapc
, bool get
, Context
*cb
, uint64_t offset
)
3528 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
|
3529 CEPH_OSD_FLAG_RWORDERED
;
3531 dout(10) << __func__
<< " Start refcount for " << soid
<< dendl
;
3533 ObjectOperation obj_op
;
3536 cls_chunk_refcount_get_op call
;
3537 call
.source
= obc
->obs
.oi
.soid
;
3539 obj_op
.call("cas", "chunk_get", in
);
3541 cls_chunk_refcount_put_op call
;
3542 call
.source
= obc
->obs
.oi
.soid
;
3544 obj_op
.call("cas", "chunk_put", in
);
3547 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
3550 c
= new C_OnFinisher(cb
, osd
->objecter_finishers
[n
]);
3555 osd
->objecter
->mutate(
3556 soid
.oid
, oloc
, obj_op
, snapc
,
3557 ceph::real_clock::from_ceph_timespec(obc
->obs
.oi
.mtime
),
3561 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
, int op_index
,
3562 uint64_t chunk_index
, uint64_t req_offset
, uint64_t req_length
,
3563 uint64_t req_total_len
, bool write_ordered
)
3565 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3566 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3567 if (!manifest
->chunk_map
.count(chunk_index
)) {
3570 uint64_t chunk_length
= manifest
->chunk_map
[chunk_index
].length
;
3571 hobject_t soid
= manifest
->chunk_map
[chunk_index
].oid
;
3572 hobject_t ori_soid
= m
->get_hobj();
3573 object_locator_t
oloc(soid
);
3574 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3575 if (write_ordered
) {
3576 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3579 if (!chunk_length
|| soid
== hobject_t()) {
3583 /* same as do_proxy_read() */
3584 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3585 CEPH_OSD_FLAG_ORDERSNAP
|
3586 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3587 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3589 dout(10) << __func__
<< " Start do chunk proxy read for " << *m
3590 << " index: " << op_index
<< " oid: " << soid
.oid
.name
<< " req_offset: " << req_offset
3591 << " req_length: " << req_length
<< dendl
;
3593 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, ori_soid
, m
->ops
));
3595 ObjectOperation
*pobj_op
= new ObjectOperation
;
3596 OSDOp
&osd_op
= pobj_op
->add_op(m
->ops
[op_index
].op
.op
);
3598 if (chunk_index
<= req_offset
) {
3599 osd_op
.op
.extent
.offset
= manifest
->chunk_map
[chunk_index
].offset
+ req_offset
- chunk_index
;
3601 ceph_abort_msg("chunk_index > req_offset");
3603 osd_op
.op
.extent
.length
= req_length
;
3605 ObjectOperation obj_op
;
3606 obj_op
.dup(pobj_op
->ops
);
3608 C_ProxyChunkRead
*fin
= new C_ProxyChunkRead(this, ori_soid
, get_last_peering_reset(),
3610 fin
->obj_op
= pobj_op
;
3611 fin
->op_index
= op_index
;
3612 fin
->req_offset
= req_offset
;
3614 fin
->req_total_len
= req_total_len
;
3616 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
3617 ceph_tid_t tid
= osd
->objecter
->read(
3618 soid
.oid
, oloc
, obj_op
,
3619 m
->get_snapid(), NULL
,
3620 flags
, new C_OnFinisher(fin
, osd
->objecter_finishers
[n
]),
3621 &prdop
->user_version
,
3622 &prdop
->data_offset
,
3625 prdop
->objecter_tid
= tid
;
3626 proxyread_ops
[tid
] = prdop
;
3627 in_progress_proxy_ops
[ori_soid
].push_back(op
);
3630 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op
, ObjectContextRef obc
)
3632 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3633 OSDOp
*osd_op
= NULL
;
3635 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3636 osd_op
= &m
->ops
[i
];
3637 ceph_osd_op op
= osd_op
->op
;
3639 case CEPH_OSD_OP_READ
:
3640 case CEPH_OSD_OP_SYNC_READ
: {
3641 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3642 uint64_t remain
= osd_op
->op
.extent
.length
;
3644 /* requested chunks exist in chunk_map ? */
3645 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3646 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3647 if (!p
.second
.is_missing()) {
3650 if (p
.second
.length
>= remain
) {
3654 remain
= remain
- p
.second
.length
;
3656 cursor
+= p
.second
.length
;
3661 dout(20) << __func__
<< " requested chunks don't exist in chunk_map " << dendl
;
3673 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
3675 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3676 << " " << cpp_strerror(r
) << dendl
;
3678 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
3679 if (p
== proxywrite_ops
.end()) {
3680 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
3683 ProxyWriteOpRef pwop
= p
->second
;
3684 ceph_assert(tid
== pwop
->objecter_tid
);
3685 ceph_assert(oid
== pwop
->soid
);
3687 proxywrite_ops
.erase(tid
);
3689 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3690 if (q
== in_progress_proxy_ops
.end()) {
3691 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3696 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3697 ceph_assert(in_progress_op
.size());
3698 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3699 in_progress_op
.end(),
3701 ceph_assert(it
!= in_progress_op
.end());
3702 in_progress_op
.erase(it
);
3703 if (in_progress_op
.size() == 0) {
3704 in_progress_proxy_ops
.erase(oid
);
3705 } else if (std::find(in_progress_op
.begin(),
3706 in_progress_op
.end(),
3707 pwop
->op
) != in_progress_op
.end()) {
3711 dout(20) << __func__
<< " " << oid
<< " tid " << tid
3712 << " in_progress_op size: "
3713 << in_progress_op
.size() << dendl
;
3717 osd
->logger
->inc(l_osd_tier_proxy_write
);
3719 const MOSDOp
*m
= static_cast<const MOSDOp
*>(pwop
->op
->get_req());
3720 ceph_assert(m
!= NULL
);
3722 if (!pwop
->sent_reply
) {
3724 MOSDOpReply
*reply
= pwop
->ctx
->reply
;
3726 pwop
->ctx
->reply
= NULL
;
3728 reply
= new MOSDOpReply(m
, r
, get_osdmap_epoch(), 0, true);
3729 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3731 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3732 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3733 osd
->send_message_osd_client(reply
, m
->get_connection());
3734 pwop
->sent_reply
= true;
3735 pwop
->ctx
->op
->mark_commit_sent();
3742 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
,
3743 vector
<ceph_tid_t
> *tids
)
3745 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3746 pwop
->canceled
= true;
3748 // cancel objecter op, if we can
3749 if (pwop
->objecter_tid
) {
3750 tids
->push_back(pwop
->objecter_tid
);
3753 proxywrite_ops
.erase(pwop
->objecter_tid
);
3754 pwop
->objecter_tid
= 0;
3758 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3759 ObjectContextRef obc
;
3763 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3766 start(ceph_clock_now()) {}
3768 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3769 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3770 int r
= results
.get
<0>();
3771 pg
->finish_promote(r
, results_data
, obc
);
3772 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3776 class PromoteManifestCallback
: public PrimaryLogPG::CopyCallback
{
3777 ObjectContextRef obc
;
3780 PrimaryLogPG::OpContext
*ctx
;
3781 PrimaryLogPG::CopyCallbackResults promote_results
;
3783 PromoteManifestCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
, PrimaryLogPG::OpContext
*ctx
= NULL
)
3786 start(ceph_clock_now()), ctx(ctx
) {}
3788 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3789 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3790 int r
= results
.get
<0>();
3792 promote_results
= results
;
3793 pg
->execute_ctx(ctx
);
3795 pg
->finish_promote_manifest(r
, results_data
, obc
);
3797 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3799 friend struct PromoteFinisher
;
3802 struct PromoteFinisher
: public PrimaryLogPG::OpFinisher
{
3803 PromoteManifestCallback
*promote_callback
;
3805 explicit PromoteFinisher(PromoteManifestCallback
*promote_callback
)
3806 : promote_callback(promote_callback
) {
3809 int execute() override
{
3810 if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
3811 promote_callback
->ctx
->pg
->finish_promote(promote_callback
->promote_results
.get
<0>(),
3812 promote_callback
->promote_results
.get
<1>(),
3813 promote_callback
->obc
);
3814 } else if (promote_callback
->ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
3815 promote_callback
->ctx
->pg
->finish_promote_manifest(promote_callback
->promote_results
.get
<0>(),
3816 promote_callback
->promote_results
.get
<1>(),
3817 promote_callback
->obc
);
3819 ceph_abort_msg("unrecognized manifest type");
3825 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
3826 const hobject_t
& missing_oid
,
3827 const object_locator_t
& oloc
,
3829 ObjectContextRef
*promote_obc
)
3831 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
3832 ceph_assert(hoid
!= hobject_t());
3833 if (write_blocked_by_scrub(hoid
)) {
3834 dout(10) << __func__
<< " " << hoid
3835 << " blocked by scrub" << dendl
;
3837 waiting_for_scrub
.push_back(op
);
3838 op
->mark_delayed("waiting for scrub");
3839 dout(10) << __func__
<< " " << hoid
3840 << " placing op in waiting_for_scrub" << dendl
;
3842 dout(10) << __func__
<< " " << hoid
3843 << " no op, dropping on the floor" << dendl
;
3847 if (!obc
) { // we need to create an ObjectContext
3848 ceph_assert(missing_oid
!= hobject_t());
3849 obc
= get_object_context(missing_oid
, true);
3855 * Before promote complete, if there are proxy-reads for the object,
3856 * for this case we don't use DONTNEED.
3858 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
3859 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
3860 if (q
== in_progress_proxy_ops
.end()) {
3861 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
3865 object_locator_t my_oloc
;
3867 if (!obc
->obs
.oi
.has_manifest()) {
3869 my_oloc
.pool
= pool
.info
.tier_of
;
3870 src_hoid
= obc
->obs
.oi
.soid
;
3871 cb
= new PromoteCallback(obc
, this);
3873 if (obc
->obs
.oi
.manifest
.is_chunked()) {
3874 src_hoid
= obc
->obs
.oi
.soid
;
3875 cb
= new PromoteManifestCallback(obc
, this);
3876 } else if (obc
->obs
.oi
.manifest
.is_redirect()) {
3877 object_locator_t
src_oloc(obc
->obs
.oi
.manifest
.redirect_target
);
3879 src_hoid
= obc
->obs
.oi
.manifest
.redirect_target
;
3880 cb
= new PromoteCallback(obc
, this);
3882 ceph_abort_msg("unrecognized manifest type");
3886 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
3887 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
3888 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
3889 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
3890 start_copy(cb
, obc
, src_hoid
, my_oloc
, 0, flags
,
3891 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
3892 src_fadvise_flags
, 0);
3894 ceph_assert(obc
->is_blocked());
3897 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
3898 info
.stats
.stats
.sum
.num_promote
++;
3901 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
3904 dout(10) << __func__
<< " " << ctx
<< dendl
;
3905 ctx
->reset_obs(ctx
->obc
);
3906 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
3907 OpRequestRef op
= ctx
->op
;
3908 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3909 ObjectContextRef obc
= ctx
->obc
;
3910 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
3912 // this method must be idempotent since we may call it several times
3913 // before we finally apply the resulting transaction.
3914 ctx
->op_t
.reset(new PGTransaction
);
3916 if (op
->may_write() || op
->may_cache()) {
3918 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
3919 pool
.info
.is_pool_snaps_mode()) {
3921 ctx
->snapc
= pool
.snapc
;
3923 // client specified snapc
3924 ctx
->snapc
.seq
= m
->get_snap_seq();
3925 ctx
->snapc
.snaps
= m
->get_snaps();
3926 filter_snapc(ctx
->snapc
.snaps
);
3928 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
3929 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
3930 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
3931 << " < snapset seq " << obc
->ssc
->snapset
.seq
3932 << " on " << obc
->obs
.oi
.soid
<< dendl
;
3933 reply_ctx(ctx
, -EOLDSNAPC
);
3938 ctx
->at_version
= get_next_version();
3939 ctx
->mtime
= m
->get_mtime();
3941 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
3942 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
3943 << " snapc " << ctx
->snapc
3944 << " snapset " << obc
->ssc
->snapset
3947 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
3948 << " ov " << obc
->obs
.oi
.version
3952 if (!ctx
->user_at_version
)
3953 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
3954 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
3958 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3960 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
3961 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3964 int result
= prepare_transaction(ctx
);
3968 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3970 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
3971 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3974 bool pending_async_reads
= !ctx
->pending_async_reads
.empty();
3975 if (result
== -EINPROGRESS
|| pending_async_reads
) {
3977 if (pending_async_reads
) {
3978 ceph_assert(pool
.info
.is_erasure());
3979 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
3980 ctx
->start_async_reads(this);
3985 if (result
== -EAGAIN
) {
3986 // clean up after the ctx
3991 bool successful_write
= !ctx
->op_t
->empty() && op
->may_write() && result
>= 0;
3992 // prepare the reply
3993 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0,
3996 // Write operations aren't allowed to return a data payload because
3997 // we can't do so reliably. If the client has to resend the request
3998 // and it has already been applied, we will return 0 with no
3999 // payload. Non-deterministic behavior is no good. However, it is
4000 // possible to construct an operation that does a read, does a guard
4001 // check (e.g., CMPXATTR), and then a write. Then we either succeed
4002 // with the write, or return a CMPXATTR and the read value.
4003 if (successful_write
) {
4004 // write. normalize the result code.
4005 dout(20) << " zeroing write result code " << result
<< dendl
;
4008 ctx
->reply
->set_result(result
);
4011 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
4012 // finish side-effects
4014 do_osd_op_effects(ctx
, m
->get_connection());
4016 complete_read_ctx(result
, ctx
);
4020 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
4022 ceph_assert(op
->may_write() || op
->may_cache());
4025 if (hard_limit_pglog())
4026 calc_trim_to_aggressive();
4030 // verify that we are doing this in order?
4031 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
4032 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
4033 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
4034 ceph_tid_t t
= m
->get_tid();
4035 client_t n
= m
->get_source().num();
4036 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
4037 if (p
== cm
.end()) {
4038 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
4041 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
4042 if (p
->second
> t
) {
4043 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
4044 ceph_abort_msg("out of order op");
4050 if (ctx
->update_log_only
) {
4052 do_osd_op_effects(ctx
, m
->get_connection());
4054 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
4055 // save just what we need from ctx
4056 MOSDOpReply
*reply
= ctx
->reply
;
4057 ctx
->reply
= nullptr;
4058 reply
->claim_op_out_data(*ctx
->ops
);
4059 reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
4062 if (result
== -ENOENT
) {
4063 reply
->set_enoent_reply_versions(info
.last_update
,
4064 info
.last_user_version
);
4066 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4067 // append to pg log for dup detection - don't save buffers for now
4068 record_write_error(op
, soid
, reply
, result
);
4072 // no need to capture PG ref, repop cancel will handle that
4073 // Can capture the ctx by pointer, it's owned by the repop
4074 ctx
->register_on_commit(
4077 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
4079 if (m
&& !ctx
->sent_reply
) {
4080 MOSDOpReply
*reply
= ctx
->reply
;
4082 ctx
->reply
= nullptr;
4084 reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, true);
4085 reply
->set_reply_versions(ctx
->at_version
,
4086 ctx
->user_at_version
);
4088 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
4089 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
4090 osd
->send_message_osd_client(reply
, m
->get_connection());
4091 ctx
->sent_reply
= true;
4092 ctx
->op
->mark_commit_sent();
4095 ctx
->register_on_success(
4099 ctx
->op
? ctx
->op
->get_req()->get_connection() :
4102 ctx
->register_on_finish(
4107 // issue replica writes
4108 ceph_tid_t rep_tid
= osd
->get_tid();
4110 RepGather
*repop
= new_repop(ctx
, obc
, rep_tid
);
4112 issue_repop(repop
, ctx
);
4117 void PrimaryLogPG::close_op_ctx(OpContext
*ctx
) {
4118 release_object_locks(ctx
->lock_manager
);
4122 for (auto p
= ctx
->on_finish
.begin(); p
!= ctx
->on_finish
.end();
4123 ctx
->on_finish
.erase(p
++)) {
4129 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
4132 osd
->reply_op_error(ctx
->op
, r
);
4136 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
, eversion_t v
, version_t uv
)
4139 osd
->reply_op_error(ctx
->op
, r
, v
, uv
);
4143 void PrimaryLogPG::log_op_stats(const OpRequest
& op
,
4145 const uint64_t outb
)
4147 const MOSDOp
* const m
= static_cast<const MOSDOp
*>(op
.get_req());
4148 const utime_t now
= ceph_clock_now();
4150 const utime_t latency
= now
- m
->get_recv_stamp();
4151 const utime_t process_latency
= now
- op
.get_dequeued_time();
4153 osd
->logger
->inc(l_osd_op
);
4155 osd
->logger
->inc(l_osd_op_outb
, outb
);
4156 osd
->logger
->inc(l_osd_op_inb
, inb
);
4157 osd
->logger
->tinc(l_osd_op_lat
, latency
);
4158 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
4160 if (op
.may_read() && op
.may_write()) {
4161 osd
->logger
->inc(l_osd_op_rw
);
4162 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
4163 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
4164 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
4165 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
4166 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
4167 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
4168 } else if (op
.may_read()) {
4169 osd
->logger
->inc(l_osd_op_r
);
4170 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
4171 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
4172 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
4173 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
4174 } else if (op
.may_write() || op
.may_cache()) {
4175 osd
->logger
->inc(l_osd_op_w
);
4176 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
4177 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
4178 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
4179 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
4184 dout(15) << "log_op_stats " << *m
4187 << " lat " << latency
<< dendl
;
4189 if (m_dynamic_perf_stats
.is_enabled()) {
4190 m_dynamic_perf_stats
.add(osd
, info
, op
, inb
, outb
, latency
);
4194 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4195 const std::list
<OSDPerfMetricQuery
> &queries
)
4197 m_dynamic_perf_stats
.set_queries(queries
);
4200 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats
*stats
)
4202 std::swap(m_dynamic_perf_stats
, *stats
);
4205 void PrimaryLogPG::do_scan(
4207 ThreadPool::TPHandle
&handle
)
4209 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
4210 ceph_assert(m
->get_type() == MSG_OSD_PG_SCAN
);
4211 dout(10) << "do_scan " << *m
<< dendl
;
4216 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
4218 auto dpp
= get_dpp();
4219 if (osd
->check_backfill_full(dpp
)) {
4220 dout(1) << __func__
<< ": Canceling backfill: Full." << dendl
;
4221 queue_peering_event(
4223 std::make_shared
<PGPeeringEvent
>(
4226 BackfillTooFull())));
4230 BackfillInterval bi
;
4231 bi
.begin
= m
->begin
;
4232 // No need to flush, there won't be any in progress writes occuring
4235 cct
->_conf
->osd_backfill_scan_min
,
4236 cct
->_conf
->osd_backfill_scan_max
,
4239 MOSDPGScan
*reply
= new MOSDPGScan(
4240 MOSDPGScan::OP_SCAN_DIGEST
,
4242 get_osdmap_epoch(), m
->query_epoch
,
4243 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
4244 encode(bi
.objects
, reply
->get_data());
4245 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4249 case MOSDPGScan::OP_SCAN_DIGEST
:
4251 pg_shard_t from
= m
->from
;
4253 // Check that from is in backfill_targets vector
4254 ceph_assert(is_backfill_targets(from
));
4256 BackfillInterval
& bi
= peer_backfill_info
[from
];
4257 bi
.begin
= m
->begin
;
4259 auto p
= m
->get_data().cbegin();
4261 // take care to preserve ordering!
4263 ::decode_noclear(bi
.objects
, p
);
4265 if (waiting_on_backfill
.erase(from
)) {
4266 if (waiting_on_backfill
.empty()) {
4267 ceph_assert(peer_backfill_info
.size() == backfill_targets
.size());
4268 finish_recovery_op(hobject_t::get_max());
4271 // we canceled backfill for a while due to a too full, and this
4272 // is an extra response from a non-too-full peer
4273 dout(20) << __func__
<< " canceled backfill (too full?)" << dendl
;
4280 void PrimaryLogPG::do_backfill(OpRequestRef op
)
4282 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
4283 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
4284 dout(10) << "do_backfill " << *m
<< dendl
;
4289 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
4291 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
4293 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
4294 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
4297 spg_t(info
.pgid
.pgid
, get_primary().shard
));
4298 reply
->set_priority(get_recovery_op_priority());
4299 osd
->send_message_osd_cluster(reply
, m
->get_connection());
4300 queue_peering_event(
4302 std::make_shared
<PGPeeringEvent
>(
4309 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
4311 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
4313 info
.set_last_backfill(m
->last_backfill
);
4314 // During backfill submit_push_data() tracks num_bytes which is needed in case
4315 // backfill stops and starts again. We want to know how many bytes this
4316 // pg is consuming on the disk in order to compute amount of new data
4317 // reserved to hold backfill if it won't fit.
4318 if (m
->op
== MOSDPGBackfill::OP_BACKFILL_PROGRESS
) {
4319 dout(0) << __func__
<< " primary " << m
->stats
.stats
.sum
.num_bytes
<< " local " << info
.stats
.stats
.sum
.num_bytes
<< dendl
;
4320 int64_t bytes
= info
.stats
.stats
.sum
.num_bytes
;
4321 info
.stats
= m
->stats
;
4322 info
.stats
.stats
.sum
.num_bytes
= bytes
;
4324 dout(0) << __func__
<< " final " << m
->stats
.stats
.sum
.num_bytes
<< " replaces local " << info
.stats
.stats
.sum
.num_bytes
<< dendl
;
4325 info
.stats
= m
->stats
;
4328 ObjectStore::Transaction t
;
4331 int tr
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4332 ceph_assert(tr
== 0);
4336 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
4338 ceph_assert(is_primary());
4339 ceph_assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
4340 finish_recovery_op(hobject_t::get_max());
4346 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
4348 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
4350 ceph_assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
4351 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
4355 ObjectStore::Transaction t
;
4356 for (auto& p
: m
->ls
) {
4357 if (is_remote_backfilling()) {
4359 int r
= osd
->store
->stat(ch
, ghobject_t(p
.first
, ghobject_t::NO_GEN
,
4360 pg_whoami
.shard
) , &st
);
4362 sub_local_num_bytes(st
.st_size
);
4364 if (pool
.info
.is_erasure()) {
4366 int r
= osd
->store
->getattr(
4368 ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
4372 object_info_t
oi(bv
);
4373 usersize
= oi
.size
* pgbackend
->get_ec_data_chunk_count();
4375 dout(0) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4376 << " can't get object info" << dendl
;
4380 usersize
= st
.st_size
;
4382 sub_num_bytes(usersize
);
4383 dout(10) << __func__
<< " " << ghobject_t(p
.first
, ghobject_t::NO_GEN
, pg_whoami
.shard
)
4384 << " sub actual data by " << st
.st_size
4385 << " sub num_bytes by " << usersize
4389 remove_snap_mapped_object(t
, p
.first
);
4391 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
4392 ceph_assert(r
== 0);
4395 int PrimaryLogPG::trim_object(
4396 bool first
, const hobject_t
&coid
, PrimaryLogPG::OpContextUPtr
*ctxp
)
4402 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
4403 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
4404 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4405 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
4409 hobject_t head_oid
= coid
.get_head();
4410 ObjectContextRef head_obc
= get_object_context(head_oid
, false);
4412 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4413 << " repair needed, no snapset obc for " << head_oid
;
4417 SnapSet
& snapset
= obc
->ssc
->snapset
;
4419 object_info_t
&coi
= obc
->obs
.oi
;
4420 auto citer
= snapset
.clone_snaps
.find(coid
.snap
);
4421 if (citer
== snapset
.clone_snaps
.end()) {
4422 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
4423 << " for object " << coid
<< "\n";
4426 set
<snapid_t
> old_snaps(citer
->second
.begin(), citer
->second
.end());
4427 if (old_snaps
.empty()) {
4428 osd
->clog
->error() << "No object info snaps for object " << coid
;
4432 dout(10) << coid
<< " old_snaps " << old_snaps
4433 << " old snapset " << snapset
<< dendl
;
4434 if (snapset
.seq
== 0) {
4435 osd
->clog
->error() << "No snapset.seq for object " << coid
;
4439 set
<snapid_t
> new_snaps
;
4440 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
4441 i
!= old_snaps
.end();
4443 if (!pool
.info
.is_removed_snap(*i
))
4444 new_snaps
.insert(*i
);
4447 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
4449 if (new_snaps
.empty()) {
4450 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
4451 if (p
== snapset
.clones
.end()) {
4452 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
4457 OpContextUPtr ctx
= simple_opc_create(obc
);
4458 ctx
->head_obc
= head_obc
;
4460 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4464 close_op_ctx(ctx
.release());
4465 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
4469 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4473 close_op_ctx(ctx
.release());
4474 dout(10) << __func__
<< ": Unable to get a wlock on " << head_oid
<< dendl
;
4478 ctx
->at_version
= get_next_version();
4480 PGTransaction
*t
= ctx
->op_t
.get();
4482 if (new_snaps
.empty()) {
4484 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
4485 << new_snaps
<< " ... deleting" << dendl
;
4488 ceph_assert(p
!= snapset
.clones
.end());
4490 snapid_t last
= coid
.snap
;
4491 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
4493 if (p
!= snapset
.clones
.begin()) {
4494 // not the oldest... merge overlap into next older clone
4495 vector
<snapid_t
>::iterator n
= p
- 1;
4496 hobject_t prev_coid
= coid
;
4497 prev_coid
.snap
= *n
;
4498 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
4500 if (adjust_prev_bytes
)
4501 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
4503 snapset
.clone_overlap
[*n
].intersection_of(
4504 snapset
.clone_overlap
[*p
]);
4506 if (adjust_prev_bytes
)
4507 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
4509 ctx
->delta_stats
.num_objects
--;
4511 ctx
->delta_stats
.num_objects_dirty
--;
4513 ctx
->delta_stats
.num_objects_omap
--;
4514 if (coi
.is_whiteout()) {
4515 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
4516 ctx
->delta_stats
.num_whiteouts
--;
4518 ctx
->delta_stats
.num_object_clones
--;
4519 if (coi
.is_cache_pinned())
4520 ctx
->delta_stats
.num_objects_pinned
--;
4521 if (coi
.has_manifest())
4522 ctx
->delta_stats
.num_objects_manifest
--;
4523 obc
->obs
.exists
= false;
4525 snapset
.clones
.erase(p
);
4526 snapset
.clone_overlap
.erase(last
);
4527 snapset
.clone_size
.erase(last
);
4528 snapset
.clone_snaps
.erase(last
);
4532 pg_log_entry_t::DELETE
,
4535 ctx
->obs
->oi
.version
,
4547 coi
= object_info_t(coid
);
4549 ctx
->at_version
.version
++;
4551 // save adjusted snaps for this object
4552 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
4553 snapset
.clone_snaps
[coid
.snap
] =
4554 vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
4555 // we still do a 'modify' event on this object just to trigger a
4556 // snapmapper.update ... :(
4558 coi
.prior_version
= coi
.version
;
4559 coi
.version
= ctx
->at_version
;
4561 encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4562 t
->setattr(coid
, OI_ATTR
, bl
);
4566 pg_log_entry_t::MODIFY
,
4575 ctx
->at_version
.version
++;
4583 // save head snapset
4584 dout(10) << coid
<< " new snapset " << snapset
<< " on "
4585 << head_obc
->obs
.oi
<< dendl
;
4586 if (snapset
.clones
.empty() &&
4587 (head_obc
->obs
.oi
.is_whiteout() &&
4588 !(head_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
4589 !head_obc
->obs
.oi
.is_cache_pinned())) {
4590 // NOTE: this arguably constitutes minor interference with the
4591 // tiering agent if this is a cache tier since a snap trim event
4592 // is effectively evicting a whiteout we might otherwise want to
4594 dout(10) << coid
<< " removing " << head_oid
<< dendl
;
4597 pg_log_entry_t::DELETE
,
4600 head_obc
->obs
.oi
.version
,
4606 derr
<< "removing snap head" << dendl
;
4607 object_info_t
& oi
= head_obc
->obs
.oi
;
4608 ctx
->delta_stats
.num_objects
--;
4609 if (oi
.is_dirty()) {
4610 ctx
->delta_stats
.num_objects_dirty
--;
4613 ctx
->delta_stats
.num_objects_omap
--;
4614 if (oi
.is_whiteout()) {
4615 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
4616 ctx
->delta_stats
.num_whiteouts
--;
4618 if (oi
.is_cache_pinned()) {
4619 ctx
->delta_stats
.num_objects_pinned
--;
4621 if (coi
.has_manifest())
4622 ctx
->delta_stats
.num_objects_manifest
--;
4623 head_obc
->obs
.exists
= false;
4624 head_obc
->obs
.oi
= object_info_t(head_oid
);
4625 t
->remove(head_oid
);
4627 dout(10) << coid
<< " filtering snapset on " << head_oid
<< dendl
;
4628 snapset
.filter(pool
.info
);
4629 dout(10) << coid
<< " writing updated snapset on " << head_oid
4630 << ", snapset is " << snapset
<< dendl
;
4633 pg_log_entry_t::MODIFY
,
4636 head_obc
->obs
.oi
.version
,
4643 head_obc
->obs
.oi
.prior_version
= head_obc
->obs
.oi
.version
;
4644 head_obc
->obs
.oi
.version
= ctx
->at_version
;
4646 map
<string
, bufferlist
> attrs
;
4648 encode(snapset
, bl
);
4649 attrs
[SS_ATTR
].claim(bl
);
4652 encode(head_obc
->obs
.oi
, bl
,
4653 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4654 attrs
[OI_ATTR
].claim(bl
);
4655 t
->setattrs(head_oid
, attrs
);
4658 *ctxp
= std::move(ctx
);
4662 void PrimaryLogPG::kick_snap_trim()
4664 ceph_assert(is_active());
4665 ceph_assert(is_primary());
4667 !state_test(PG_STATE_PREMERGE
) &&
4668 !snap_trimq
.empty()) {
4669 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM
)) {
4670 dout(10) << __func__
<< ": nosnaptrim set, not kicking" << dendl
;
4672 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
4673 snap_trimmer_machine
.process_event(KickTrim());
4678 void PrimaryLogPG::snap_trimmer_scrub_complete()
4680 if (is_primary() && is_active() && is_clean()) {
4681 ceph_assert(!snap_trimq
.empty());
4682 snap_trimmer_machine
.process_event(ScrubComplete());
4686 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
4688 if (deleting
|| pg_has_reset_since(queued
)) {
4692 ceph_assert(is_primary());
4694 dout(10) << "snap_trimmer posting" << dendl
;
4695 snap_trimmer_machine
.process_event(DoSnapWork());
4696 dout(10) << "snap_trimmer complete" << dendl
;
4700 int PrimaryLogPG::do_xattr_cmp_u64(int op
, __u64 v1
, bufferlist
& xattr
)
4704 string
v2s(xattr
.c_str(), xattr
.length());
4706 v2
= strtoull(v2s
.c_str(), NULL
, 10);
4710 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
4713 case CEPH_OSD_CMPXATTR_OP_EQ
:
4715 case CEPH_OSD_CMPXATTR_OP_NE
:
4717 case CEPH_OSD_CMPXATTR_OP_GT
:
4719 case CEPH_OSD_CMPXATTR_OP_GTE
:
4721 case CEPH_OSD_CMPXATTR_OP_LT
:
4723 case CEPH_OSD_CMPXATTR_OP_LTE
:
4730 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4732 string
v2s(xattr
.c_str(), xattr
.length());
4734 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
4737 case CEPH_OSD_CMPXATTR_OP_EQ
:
4738 return (v1s
.compare(v2s
) == 0);
4739 case CEPH_OSD_CMPXATTR_OP_NE
:
4740 return (v1s
.compare(v2s
) != 0);
4741 case CEPH_OSD_CMPXATTR_OP_GT
:
4742 return (v1s
.compare(v2s
) > 0);
4743 case CEPH_OSD_CMPXATTR_OP_GTE
:
4744 return (v1s
.compare(v2s
) >= 0);
4745 case CEPH_OSD_CMPXATTR_OP_LT
:
4746 return (v1s
.compare(v2s
) < 0);
4747 case CEPH_OSD_CMPXATTR_OP_LTE
:
4748 return (v1s
.compare(v2s
) <= 0);
4754 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4756 ceph_osd_op
& op
= osd_op
.op
;
4757 vector
<OSDOp
> write_ops(1);
4758 OSDOp
& write_op
= write_ops
[0];
4759 uint64_t write_length
= op
.writesame
.length
;
4765 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4768 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4769 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4773 while (write_length
) {
4774 write_op
.indata
.append(osd_op
.indata
);
4775 write_length
-= op
.writesame
.data_length
;
4778 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
4779 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
4780 write_op
.op
.extent
.length
= op
.writesame
.length
;
4781 result
= do_osd_ops(ctx
, write_ops
);
4783 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
4788 // ========================================================================
4789 // low level osd ops
4791 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
4793 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
4794 bufferlist header
, vals
;
4795 int r
= _get_tmap(ctx
, &header
, &vals
);
4797 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
4802 vector
<OSDOp
> ops(3);
4804 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
4805 ops
[0].op
.extent
.offset
= 0;
4806 ops
[0].op
.extent
.length
= 0;
4808 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
4809 ops
[1].indata
.claim(header
);
4811 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
4812 ops
[2].indata
.claim(vals
);
4814 return do_osd_ops(ctx
, ops
);
4817 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
4818 OSDOp
& osd_op
, bufferlist
& bl
)
4822 map
<string
, bufferlist
> m
;
4824 auto p
= bl
.cbegin();
4827 ceph_assert(p
.end());
4837 case CEPH_OSD_TMAP_SET
: // insert key
4845 case CEPH_OSD_TMAP_RM
: // remove key
4847 if (!m
.count(key
)) {
4852 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
4856 case CEPH_OSD_TMAP_HDR
: // update header
4868 encode(header
, obl
);
4872 vector
<OSDOp
> nops(1);
4873 OSDOp
& newop
= nops
[0];
4874 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4875 newop
.op
.extent
.offset
= 0;
4876 newop
.op
.extent
.length
= obl
.length();
4878 do_osd_ops(ctx
, nops
);
4879 osd_op
.outdata
.claim(newop
.outdata
);
4883 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::const_iterator
& bp
, OSDOp
& osd_op
)
4885 bufferlist::const_iterator orig_bp
= bp
;
4888 dout(10) << "tmapup is a no-op" << dendl
;
4890 // read the whole object
4891 vector
<OSDOp
> nops(1);
4892 OSDOp
& newop
= nops
[0];
4893 newop
.op
.op
= CEPH_OSD_OP_READ
;
4894 newop
.op
.extent
.offset
= 0;
4895 newop
.op
.extent
.length
= 0;
4896 result
= do_osd_ops(ctx
, nops
);
4898 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
4900 dout(30) << " starting is \n";
4901 newop
.outdata
.hexdump(*_dout
);
4904 auto ip
= newop
.outdata
.cbegin();
4907 dout(30) << "the update command is: \n";
4908 osd_op
.indata
.hexdump(*_dout
);
4914 if (newop
.outdata
.length()) {
4918 dout(10) << "tmapup header " << header
.length() << dendl
;
4920 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
4923 dout(10) << "tmapup new header " << header
.length() << dendl
;
4926 encode(header
, obl
);
4928 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
4931 bufferlist newkeydata
;
4932 string nextkey
, last_in_key
;
4934 bool have_next
= false;
4937 decode(nextkey
, ip
);
4938 decode(nextval
, ip
);
4940 while (!bp
.end() && !result
) {
4947 catch (buffer::error
& e
) {
4950 if (key
< last_in_key
) {
4951 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
4952 << "', falling back to an inefficient (unsorted) update" << dendl
;
4954 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
4958 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
4960 // skip existing intervening keys
4961 bool key_exists
= false;
4962 while (have_next
&& !key_exists
) {
4963 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
4966 if (nextkey
< key
) {
4968 encode(nextkey
, newkeydata
);
4969 encode(nextval
, newkeydata
);
4970 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4972 // don't copy; discard old value. and stop.
4973 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
4978 decode(nextkey
, ip
);
4979 decode(nextval
, ip
);
4985 if (op
== CEPH_OSD_TMAP_SET
) {
4990 catch (buffer::error
& e
) {
4993 encode(key
, newkeydata
);
4994 encode(val
, newkeydata
);
4995 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
4997 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
5005 catch (buffer::error
& e
) {
5008 encode(key
, newkeydata
);
5009 encode(val
, newkeydata
);
5010 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
5012 } else if (op
== CEPH_OSD_TMAP_RM
) {
5017 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
5020 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
5027 encode(nextkey
, newkeydata
);
5028 encode(nextval
, newkeydata
);
5029 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5033 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
5034 dout(20) << " keep trailing " << rest
.length()
5035 << " at " << newkeydata
.length() << dendl
;
5036 newkeydata
.claim_append(rest
);
5039 // encode final key count + key data
5040 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
5042 obl
.claim_append(newkeydata
);
5045 dout(30) << " final is \n";
5046 obl
.hexdump(*_dout
);
5050 auto tp
= obl
.cbegin();
5053 map
<string
,bufferlist
> d
;
5055 ceph_assert(tp
.end());
5056 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
5061 dout(20) << "tmapput write " << obl
.length() << dendl
;
5062 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5063 newop
.op
.extent
.offset
= 0;
5064 newop
.op
.extent
.length
= obl
.length();
5066 do_osd_ops(ctx
, nops
);
5067 osd_op
.outdata
.claim(newop
.outdata
);
5073 static int check_offset_and_length(uint64_t offset
, uint64_t length
,
5074 uint64_t max
, DoutPrefixProvider
*dpp
)
5076 if (offset
>= max
||
5078 offset
+ length
> max
) {
5079 ldpp_dout(dpp
, 10) << __func__
<< " "
5080 << "osd_max_object_size: " << max
5081 << "; Hard limit of object size is 4GB." << dendl
;
5088 struct FillInVerifyExtent
: public Context
{
5091 bufferlist
*outdatap
;
5092 boost::optional
<uint32_t> maybe_crc
;
5097 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
5098 boost::optional
<uint32_t> mc
, uint64_t size
,
5099 OSDService
*osd
, hobject_t soid
, __le32 flags
) :
5100 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
5101 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
5102 void finish(int len
) override
{
5110 // whole object? can we verify the checksum?
5111 if (maybe_crc
&& *r
== size
) {
5112 uint32_t crc
= outdatap
->crc32c(-1);
5113 if (maybe_crc
!= crc
) {
5114 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
5115 << " != expected 0x" << *maybe_crc
5116 << std::dec
<< " on " << soid
;
5117 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
5126 struct ToSparseReadResult
: public Context
{
5128 bufferlist
* data_bl
;
5129 uint64_t data_offset
;
5131 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
5133 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
5134 void finish(int r
) override
{
5142 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
5143 encode(extents
, outdata
);
5144 ::encode_destructively(*data_bl
, outdata
);
5145 data_bl
->swap(outdata
);
5149 template<typename V
>
5150 static string
list_keys(const map
<string
, V
>& m
) {
5152 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5156 s
.append(itr
->first
);
5161 template<typename T
>
5162 static string
list_entries(const T
& m
) {
5164 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5173 void PrimaryLogPG::maybe_create_new_object(
5175 bool ignore_transaction
)
5177 ObjectState
& obs
= ctx
->new_obs
;
5179 ctx
->delta_stats
.num_objects
++;
5181 ceph_assert(!obs
.oi
.is_whiteout());
5182 obs
.oi
.new_object();
5183 if (!ignore_transaction
)
5184 ctx
->op_t
->create(obs
.oi
.soid
);
5185 } else if (obs
.oi
.is_whiteout()) {
5186 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
5187 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
5188 --ctx
->delta_stats
.num_whiteouts
;
5192 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
5195 explicit ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
5198 int execute() override
{
5203 struct C_ChecksumRead
: public Context
{
5204 PrimaryLogPG
*primary_log_pg
;
5206 Checksummer::CSumType csum_type
;
5207 bufferlist init_value_bl
;
5208 ceph_le64 read_length
;
5210 Context
*fill_extent_ctx
;
5212 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5213 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
5214 boost::optional
<uint32_t> maybe_crc
, uint64_t size
,
5215 OSDService
*osd
, hobject_t soid
, __le32 flags
)
5216 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5217 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
5218 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5219 &read_bl
, maybe_crc
, size
,
5220 osd
, soid
, flags
)) {
5222 ~C_ChecksumRead() override
{
5223 delete fill_extent_ctx
;
5226 void finish(int r
) override
{
5227 fill_extent_ctx
->complete(r
);
5228 fill_extent_ctx
= nullptr;
5230 if (osd_op
.rval
>= 0) {
5231 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5232 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
5233 &init_value_bl_it
, read_bl
);
5238 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
5239 bufferlist::const_iterator
*bl_it
)
5241 dout(20) << __func__
<< dendl
;
5243 auto& op
= osd_op
.op
;
5244 if (op
.checksum
.chunk_size
> 0) {
5245 if (op
.checksum
.length
== 0) {
5246 dout(10) << __func__
<< ": length required when chunk size provided"
5250 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5251 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
5256 auto& oi
= ctx
->new_obs
.oi
;
5257 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
5258 // zeroed offset+length implies checksum whole object
5259 op
.checksum
.length
= oi
.size
;
5260 } else if (op
.checksum
.offset
>= oi
.size
) {
5261 // read size was trimmed to zero, do nothing
5262 // see PrimaryLogPG::do_read
5264 } else if (op
.extent
.offset
+ op
.extent
.length
> oi
.size
) {
5265 op
.extent
.length
= oi
.size
- op
.extent
.offset
;
5266 if (op
.checksum
.chunk_size
> 0 &&
5267 op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5268 dout(10) << __func__
<< ": length (trimmed to 0x"
5269 << std::hex
<< op
.checksum
.length
5270 << ") not aligned to chunk size 0x"
5271 << op
.checksum
.chunk_size
<< std::dec
5277 Checksummer::CSumType csum_type
;
5278 switch (op
.checksum
.type
) {
5279 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
5280 csum_type
= Checksummer::CSUM_XXHASH32
;
5282 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
5283 csum_type
= Checksummer::CSUM_XXHASH64
;
5285 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
5286 csum_type
= Checksummer::CSUM_CRC32C
;
5289 dout(10) << __func__
<< ": unknown crc type ("
5290 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
5294 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
5295 if (bl_it
->get_remaining() < csum_init_value_size
) {
5296 dout(10) << __func__
<< ": init value not provided" << dendl
;
5300 bufferlist init_value_bl
;
5301 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
5302 csum_init_value_size
);
5303 bl_it
->advance(csum_init_value_size
);
5305 if (pool
.info
.is_erasure() && op
.checksum
.length
> 0) {
5306 // If there is a data digest and it is possible we are reading
5307 // entire object, pass the digest.
5308 boost::optional
<uint32_t> maybe_crc
;
5309 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5310 op
.checksum
.length
>= oi
.size
) {
5311 maybe_crc
= oi
.data_digest
;
5315 auto& soid
= oi
.soid
;
5316 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
5317 std::move(init_value_bl
), maybe_crc
,
5318 oi
.size
, osd
, soid
, op
.flags
);
5320 ctx
->pending_async_reads
.push_back({
5321 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
5322 {&checksum_ctx
->read_bl
, checksum_ctx
}});
5324 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5325 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5326 new ReadFinisher(osd_op
));
5327 return -EINPROGRESS
;
5331 std::vector
<OSDOp
> read_ops(1);
5332 auto& read_op
= read_ops
[0];
5333 if (op
.checksum
.length
> 0) {
5334 read_op
.op
.op
= CEPH_OSD_OP_READ
;
5335 read_op
.op
.flags
= op
.flags
;
5336 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
5337 read_op
.op
.extent
.length
= op
.checksum
.length
;
5338 read_op
.op
.extent
.truncate_size
= 0;
5339 read_op
.op
.extent
.truncate_seq
= 0;
5341 int r
= do_osd_ops(ctx
, read_ops
);
5343 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
5348 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5349 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
5353 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
5354 Checksummer::CSumType csum_type
,
5355 bufferlist::const_iterator
*init_value_bl_it
,
5356 const bufferlist
&read_bl
) {
5357 dout(20) << __func__
<< dendl
;
5359 auto& op
= osd_op
.op
;
5361 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
5362 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
5363 << op
.checksum
.length
<< dendl
;
5367 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
5368 op
.checksum
.chunk_size
: read_bl
.length());
5369 uint32_t csum_count
= (csum_chunk_size
> 0 ?
5370 read_bl
.length() / csum_chunk_size
: 0);
5373 bufferptr csum_data
;
5374 if (csum_count
> 0) {
5375 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
5376 csum_data
= buffer::create(csum_value_size
* csum_count
);
5378 csum
.append(csum_data
);
5380 switch (csum_type
) {
5381 case Checksummer::CSUM_XXHASH32
:
5383 Checksummer::xxhash32::init_value_t init_value
;
5384 decode(init_value
, *init_value_bl_it
);
5385 Checksummer::calculate
<Checksummer::xxhash32
>(
5386 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5390 case Checksummer::CSUM_XXHASH64
:
5392 Checksummer::xxhash64::init_value_t init_value
;
5393 decode(init_value
, *init_value_bl_it
);
5394 Checksummer::calculate
<Checksummer::xxhash64
>(
5395 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5399 case Checksummer::CSUM_CRC32C
:
5401 Checksummer::crc32c::init_value_t init_value
;
5402 decode(init_value
, *init_value_bl_it
);
5403 Checksummer::calculate
<Checksummer::crc32c
>(
5404 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5413 encode(csum_count
, osd_op
.outdata
);
5414 osd_op
.outdata
.claim_append(csum
);
5418 struct C_ExtentCmpRead
: public Context
{
5419 PrimaryLogPG
*primary_log_pg
;
5421 ceph_le64 read_length
{};
5423 Context
*fill_extent_ctx
;
5425 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5426 boost::optional
<uint32_t> maybe_crc
, uint64_t size
,
5427 OSDService
*osd
, hobject_t soid
, __le32 flags
)
5428 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5429 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5430 &read_bl
, maybe_crc
, size
,
5431 osd
, soid
, flags
)) {
5433 ~C_ExtentCmpRead() override
{
5434 delete fill_extent_ctx
;
5437 void finish(int r
) override
{
5441 delete fill_extent_ctx
;
5443 fill_extent_ctx
->complete(r
);
5445 fill_extent_ctx
= nullptr;
5447 if (osd_op
.rval
>= 0) {
5448 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
5453 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
5455 dout(20) << __func__
<< dendl
;
5456 ceph_osd_op
& op
= osd_op
.op
;
5458 auto& oi
= ctx
->new_obs
.oi
;
5459 uint64_t size
= oi
.size
;
5460 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5461 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
5462 size
= op
.extent
.truncate_size
;
5465 if (op
.extent
.offset
>= size
) {
5466 op
.extent
.length
= 0;
5467 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5468 op
.extent
.length
= size
- op
.extent
.offset
;
5471 if (op
.extent
.length
== 0) {
5472 dout(20) << __func__
<< " zero length extent" << dendl
;
5473 return finish_extent_cmp(osd_op
, bufferlist
{});
5474 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
5475 dout(20) << __func__
<< " object DNE" << dendl
;
5476 return finish_extent_cmp(osd_op
, {});
5477 } else if (pool
.info
.is_erasure()) {
5478 // If there is a data digest and it is possible we are reading
5479 // entire object, pass the digest.
5480 boost::optional
<uint32_t> maybe_crc
;
5481 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5482 op
.checksum
.length
>= oi
.size
) {
5483 maybe_crc
= oi
.data_digest
;
5487 auto& soid
= oi
.soid
;
5488 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
5489 osd
, soid
, op
.flags
);
5490 ctx
->pending_async_reads
.push_back({
5491 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
5492 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
5494 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5496 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5497 new ReadFinisher(osd_op
));
5498 return -EINPROGRESS
;
5502 vector
<OSDOp
> read_ops(1);
5503 OSDOp
& read_op
= read_ops
[0];
5505 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
5506 read_op
.op
.extent
.offset
= op
.extent
.offset
;
5507 read_op
.op
.extent
.length
= op
.extent
.length
;
5508 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
5509 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
5511 int result
= do_osd_ops(ctx
, read_ops
);
5513 derr
<< __func__
<< " failed " << result
<< dendl
;
5516 return finish_extent_cmp(osd_op
, read_op
.outdata
);
5519 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
5521 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
5522 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
5523 if (osd_op
.indata
[idx
] != read_byte
) {
5524 return (-MAX_ERRNO
- idx
);
5531 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5532 dout(20) << __func__
<< dendl
;
5533 auto& op
= osd_op
.op
;
5534 auto& oi
= ctx
->new_obs
.oi
;
5535 auto& soid
= oi
.soid
;
5536 __u32 seq
= oi
.truncate_seq
;
5537 uint64_t size
= oi
.size
;
5538 bool trimmed_read
= false;
5540 dout(30) << __func__
<< " oi.size: " << oi
.size
<< dendl
;
5541 dout(30) << __func__
<< " oi.truncate_seq: " << oi
.truncate_seq
<< dendl
;
5542 dout(30) << __func__
<< " op.extent.truncate_seq: " << op
.extent
.truncate_seq
<< dendl
;
5543 dout(30) << __func__
<< " op.extent.truncate_size: " << op
.extent
.truncate_size
<< dendl
;
5545 // are we beyond truncate_size?
5546 if ( (seq
< op
.extent
.truncate_seq
) &&
5547 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5548 (size
> op
.extent
.truncate_size
) )
5549 size
= op
.extent
.truncate_size
;
5551 if (op
.extent
.length
== 0) //length is zero mean read the whole object
5552 op
.extent
.length
= size
;
5554 if (op
.extent
.offset
>= size
) {
5555 op
.extent
.length
= 0;
5556 trimmed_read
= true;
5557 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5558 op
.extent
.length
= size
- op
.extent
.offset
;
5559 trimmed_read
= true;
5562 dout(30) << __func__
<< "op.extent.length is now " << op
.extent
.length
<< dendl
;
5564 // read into a buffer
5566 if (trimmed_read
&& op
.extent
.length
== 0) {
5567 // read size was trimmed to zero and it is expected to do nothing
5568 // a read operation of 0 bytes does *not* do nothing, this is why
5569 // the trimmed_read boolean is needed
5570 } else if (pool
.info
.is_erasure()) {
5571 // The initialisation below is required to silence a false positive
5572 // -Wmaybe-uninitialized warning
5573 boost::optional
<uint32_t> maybe_crc
= boost::make_optional(false, uint32_t());
5574 // If there is a data digest and it is possible we are reading
5575 // entire object, pass the digest. FillInVerifyExtent will
5576 // will check the oi.size again.
5577 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
5578 op
.extent
.length
>= oi
.size
)
5579 maybe_crc
= oi
.data_digest
;
5580 ctx
->pending_async_reads
.push_back(
5582 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
5583 make_pair(&osd_op
.outdata
,
5584 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
5585 &osd_op
.outdata
, maybe_crc
, oi
.size
,
5586 osd
, soid
, op
.flags
))));
5587 dout(10) << " async_read noted for " << soid
<< dendl
;
5589 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5590 new ReadFinisher(osd_op
));
5592 int r
= pgbackend
->objects_read_sync(
5593 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
5594 // whole object? can we verify the checksum?
5595 if (r
>= 0 && op
.extent
.offset
== 0 &&
5596 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5597 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
5598 if (oi
.data_digest
!= crc
) {
5599 osd
->clog
->error() << info
.pgid
<< std::hex
5600 << " full-object read crc 0x" << crc
5601 << " != expected 0x" << oi
.data_digest
5602 << std::dec
<< " on " << soid
;
5603 r
= -EIO
; // try repair later
5607 r
= rep_repair_primary_object(soid
, ctx
);
5610 op
.extent
.length
= r
;
5611 else if (r
== -EAGAIN
) {
5615 op
.extent
.length
= 0;
5617 dout(10) << " read got " << r
<< " / " << op
.extent
.length
5618 << " bytes from obj " << soid
<< dendl
;
5621 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5622 ctx
->delta_stats
.num_rd
++;
5627 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5628 dout(20) << __func__
<< dendl
;
5629 auto& op
= osd_op
.op
;
5630 auto& oi
= ctx
->new_obs
.oi
;
5631 auto& soid
= oi
.soid
;
5633 if (op
.extent
.truncate_seq
) {
5634 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
5639 if (pool
.info
.is_erasure()) {
5640 // translate sparse read to a normal one if not supported
5641 uint64_t offset
= op
.extent
.offset
;
5642 uint64_t length
= op
.extent
.length
;
5643 if (offset
> oi
.size
) {
5645 } else if (offset
+ length
> oi
.size
) {
5646 length
= oi
.size
- offset
;
5650 ctx
->pending_async_reads
.push_back(
5652 boost::make_tuple(offset
, length
, op
.flags
),
5655 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
5656 &op
.extent
.length
))));
5657 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
5659 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5660 new ReadFinisher(osd_op
));
5662 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
5663 map
<uint64_t, uint64_t> extents
;
5664 encode(extents
, osd_op
.outdata
);
5667 // read into a buffer
5668 map
<uint64_t, uint64_t> m
;
5669 uint32_t total_read
= 0;
5670 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5672 op
.extent
.offset
, op
.extent
.length
, m
);
5677 map
<uint64_t, uint64_t>::iterator miter
;
5679 uint64_t last
= op
.extent
.offset
;
5680 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
5682 if (cct
->_conf
->osd_verify_sparse_read_holes
&&
5683 last
< miter
->first
) {
5685 uint64_t len
= miter
->first
- last
;
5686 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
5688 osd
->clog
->error() << coll
<< " " << soid
5689 << " sparse-read failed to read: "
5691 } else if (!t
.is_zero()) {
5692 osd
->clog
->error() << coll
<< " " << soid
5693 << " sparse-read found data in hole "
5694 << last
<< "~" << len
;
5699 r
= pgbackend
->objects_read_sync(soid
, miter
->first
, miter
->second
,
5702 r
= rep_repair_primary_object(soid
, ctx
);
5708 // this is usually happen when we get extent that exceeds the actual file
5710 if (r
< (int)miter
->second
)
5713 dout(10) << "sparse-read " << miter
->first
<< "@" << miter
->second
5715 data_bl
.claim_append(tmpbl
);
5716 last
= miter
->first
+ r
;
5719 // verify trailing hole?
5720 if (cct
->_conf
->osd_verify_sparse_read_holes
) {
5721 uint64_t end
= std::min
<uint64_t>(op
.extent
.offset
+ op
.extent
.length
,
5725 uint64_t len
= end
- last
;
5726 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
5728 osd
->clog
->error() << coll
<< " " << soid
5729 << " sparse-read failed to read: " << r
;
5730 } else if (!t
.is_zero()) {
5731 osd
->clog
->error() << coll
<< " " << soid
5732 << " sparse-read found data in hole "
5733 << last
<< "~" << len
;
5738 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5739 // Maybe at first, there is no much whole objects. With continued use, more
5740 // and more whole object exist. So from this point, for spare-read add
5741 // checksum make sense.
5742 if (total_read
== oi
.size
&& oi
.is_data_digest()) {
5743 uint32_t crc
= data_bl
.crc32c(-1);
5744 if (oi
.data_digest
!= crc
) {
5745 osd
->clog
->error() << info
.pgid
<< std::hex
5746 << " full-object read crc 0x" << crc
5747 << " != expected 0x" << oi
.data_digest
5748 << std::dec
<< " on " << soid
;
5749 r
= rep_repair_primary_object(soid
, ctx
);
5756 op
.extent
.length
= total_read
;
5758 encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5759 ::encode_destructively(data_bl
, osd_op
.outdata
);
5761 dout(10) << " sparse_read got " << total_read
<< " bytes from object "
5765 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5766 ctx
->delta_stats
.num_rd
++;
5770 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5773 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5774 ObjectState
& obs
= ctx
->new_obs
;
5775 object_info_t
& oi
= obs
.oi
;
5776 const hobject_t
& soid
= oi
.soid
;
5777 const bool skip_data_digest
= osd
->store
->has_builtin_csum() &&
5778 osd
->osd_skip_data_digest
;
5780 PGTransaction
* t
= ctx
->op_t
.get();
5782 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5784 ctx
->current_osd_subop_num
= 0;
5785 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5787 ceph_osd_op
& op
= osd_op
.op
;
5789 OpFinisher
* op_finisher
= nullptr;
5791 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5792 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5793 op_finisher
= op_finisher_it
->second
.get();
5797 // TODO: check endianness (__le32 vs uint32_t, etc.)
5798 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5799 // but the code in this function seems to treat them as native-endian. What should the
5801 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5803 dout(10) << "do_osd_op " << osd_op
<< dendl
;
5805 auto bp
= osd_op
.indata
.cbegin();
5807 // user-visible modifcation?
5809 // non user-visible modifications
5810 case CEPH_OSD_OP_WATCH
:
5811 case CEPH_OSD_OP_CACHE_EVICT
:
5812 case CEPH_OSD_OP_CACHE_FLUSH
:
5813 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5814 case CEPH_OSD_OP_UNDIRTY
:
5815 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
5816 case CEPH_OSD_OP_CACHE_PIN
:
5817 case CEPH_OSD_OP_CACHE_UNPIN
:
5818 case CEPH_OSD_OP_SET_REDIRECT
:
5819 case CEPH_OSD_OP_TIER_PROMOTE
:
5822 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
5823 ctx
->user_modify
= true;
5826 // munge -1 truncate to 0 truncate
5827 if (ceph_osd_op_uses_extent(op
.op
) &&
5828 op
.extent
.truncate_seq
== 1 &&
5829 op
.extent
.truncate_size
== (-1ULL)) {
5830 op
.extent
.truncate_size
= 0;
5831 op
.extent
.truncate_seq
= 0;
5834 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5835 if (op
.op
== CEPH_OSD_OP_ZERO
&&
5837 op
.extent
.offset
< static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
5838 op
.extent
.length
>= 1 &&
5839 op
.extent
.length
<= static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
5840 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
5841 if (op
.extent
.offset
>= oi
.size
) {
5845 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
5846 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
5847 op
.op
= CEPH_OSD_OP_TRUNCATE
;
5854 case CEPH_OSD_OP_CMPEXT
:
5856 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
5857 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5858 op
.extent
.length
, op
.extent
.truncate_size
,
5859 op
.extent
.truncate_seq
);
5861 if (op_finisher
== nullptr) {
5862 result
= do_extent_cmp(ctx
, osd_op
);
5864 result
= op_finisher
->execute();
5868 case CEPH_OSD_OP_SYNC_READ
:
5869 if (pool
.info
.is_erasure()) {
5870 result
= -EOPNOTSUPP
;
5874 case CEPH_OSD_OP_READ
:
5876 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
5877 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5878 op
.extent
.length
, op
.extent
.truncate_size
,
5879 op
.extent
.truncate_seq
);
5880 if (op_finisher
== nullptr) {
5881 if (!ctx
->data_off
) {
5882 ctx
->data_off
= op
.extent
.offset
;
5884 result
= do_read(ctx
, osd_op
);
5886 result
= op_finisher
->execute();
5890 case CEPH_OSD_OP_CHECKSUM
:
5893 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
5894 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
5895 op
.checksum
.offset
, op
.checksum
.length
,
5896 op
.checksum
.chunk_size
);
5898 if (op_finisher
== nullptr) {
5899 result
= do_checksum(ctx
, osd_op
, &bp
);
5901 result
= op_finisher
->execute();
5907 case CEPH_OSD_OP_MAPEXT
:
5908 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5909 if (pool
.info
.is_erasure()) {
5910 result
= -EOPNOTSUPP
;
5915 // read into a buffer
5917 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5919 op
.extent
.offset
, op
.extent
.length
, bl
);
5920 osd_op
.outdata
.claim(bl
);
5924 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
5925 ctx
->delta_stats
.num_rd
++;
5926 dout(10) << " map_extents done on object " << soid
<< dendl
;
5931 case CEPH_OSD_OP_SPARSE_READ
:
5932 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
5933 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5934 op
.extent
.length
, op
.extent
.truncate_size
,
5935 op
.extent
.truncate_seq
);
5936 if (op_finisher
== nullptr) {
5937 result
= do_sparse_read(ctx
, osd_op
);
5939 result
= op_finisher
->execute();
5943 case CEPH_OSD_OP_CALL
:
5945 string cname
, mname
;
5948 bp
.copy(op
.cls
.class_len
, cname
);
5949 bp
.copy(op
.cls
.method_len
, mname
);
5950 bp
.copy(op
.cls
.indata_len
, indata
);
5951 } catch (buffer::error
& e
) {
5952 dout(10) << "call unable to decode class + method + indata" << dendl
;
5953 dout(30) << "in dump: ";
5954 osd_op
.indata
.hexdump(*_dout
);
5957 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
5960 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
5962 ClassHandler::ClassData
*cls
;
5963 result
= osd
->class_handler
->open_class(cname
, &cls
);
5964 ceph_assert(result
== 0); // init_op_flags() already verified this works.
5966 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
.c_str());
5968 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
5969 result
= -EOPNOTSUPP
;
5973 int flags
= method
->get_flags();
5974 if (flags
& CLS_METHOD_WR
)
5975 ctx
->user_modify
= true;
5978 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
5979 int prev_rd
= ctx
->num_read
;
5980 int prev_wr
= ctx
->num_write
;
5981 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
5983 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
5984 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
5988 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
5989 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
5994 dout(10) << "method called response length=" << outdata
.length() << dendl
;
5995 op
.extent
.length
= outdata
.length();
5996 osd_op
.outdata
.claim_append(outdata
);
5997 dout(30) << "out dump: ";
5998 osd_op
.outdata
.hexdump(*_dout
);
6003 case CEPH_OSD_OP_STAT
:
6004 // note: stat does not require RD
6006 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6008 if (obs
.exists
&& !oi
.is_whiteout()) {
6009 encode(oi
.size
, osd_op
.outdata
);
6010 encode(oi
.mtime
, osd_op
.outdata
);
6011 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
6014 dout(10) << "stat oi object does not exist" << dendl
;
6017 ctx
->delta_stats
.num_rd
++;
6021 case CEPH_OSD_OP_ISDIRTY
:
6024 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6025 bool is_dirty
= obs
.oi
.is_dirty();
6026 encode(is_dirty
, osd_op
.outdata
);
6027 ctx
->delta_stats
.num_rd
++;
6032 case CEPH_OSD_OP_UNDIRTY
:
6035 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6036 if (oi
.is_dirty()) {
6037 ctx
->undirty
= true; // see make_writeable()
6039 ctx
->delta_stats
.num_wr
++;
6045 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6048 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6049 if (ctx
->lock_type
!= ObjectContext::RWState::RWNONE
) {
6050 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
6054 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
6062 if (oi
.is_cache_pinned()) {
6063 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
6067 if (oi
.is_dirty()) {
6068 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, boost::none
);
6069 if (result
== -EINPROGRESS
)
6077 case CEPH_OSD_OP_CACHE_FLUSH
:
6080 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6081 if (ctx
->lock_type
== ObjectContext::RWState::RWNONE
) {
6082 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
6086 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
6094 if (oi
.is_cache_pinned()) {
6095 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
6100 if (oi
.is_dirty()) {
6101 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, boost::none
);
6102 if (result
== -EINPROGRESS
)
6107 // Check special return value which has set missing_return
6108 if (result
== -ENOENT
) {
6109 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
6110 ceph_assert(!missing
.is_min());
6111 wait_for_unreadable_object(missing
, ctx
->op
);
6112 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6118 case CEPH_OSD_OP_CACHE_EVICT
:
6121 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6122 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
6130 if (oi
.is_cache_pinned()) {
6131 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
6135 if (oi
.is_dirty()) {
6139 if (!oi
.watchers
.empty()) {
6143 if (soid
.snap
== CEPH_NOSNAP
) {
6144 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
6148 result
= _delete_oid(ctx
, true, false);
6150 // mark that this is a cache eviction to avoid triggering normal
6151 // make_writeable() clone creation in finish_ctx()
6152 ctx
->cache_evict
= true;
6154 osd
->logger
->inc(l_osd_tier_evict
);
6158 case CEPH_OSD_OP_GETXATTR
:
6162 bp
.copy(op
.xattr
.name_len
, aname
);
6163 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6164 string name
= "_" + aname
;
6165 int r
= getattr_maybe_cache(
6170 op
.xattr
.value_len
= osd_op
.outdata
.length();
6172 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
6176 ctx
->delta_stats
.num_rd
++;
6180 case CEPH_OSD_OP_GETXATTRS
:
6183 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6184 map
<string
, bufferlist
> out
;
6185 result
= getattrs_maybe_cache(
6191 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6192 ctx
->delta_stats
.num_rd
++;
6193 osd_op
.outdata
.claim_append(bl
);
6197 case CEPH_OSD_OP_CMPXATTR
:
6201 bp
.copy(op
.xattr
.name_len
, aname
);
6202 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6203 string name
= "_" + aname
;
6204 name
[op
.xattr
.name_len
+ 1] = 0;
6207 result
= getattr_maybe_cache(
6211 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
6214 ctx
->delta_stats
.num_rd
++;
6215 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(xattr
.length(), 10);
6217 switch (op
.xattr
.cmp_mode
) {
6218 case CEPH_OSD_CMPXATTR_MODE_STRING
:
6221 bp
.copy(op
.xattr
.value_len
, val
);
6222 val
[op
.xattr
.value_len
] = 0;
6223 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
6224 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6225 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
6229 case CEPH_OSD_CMPXATTR_MODE_U64
:
6235 catch (buffer::error
& e
) {
6239 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
6240 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6241 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
6246 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
6251 dout(10) << "comparison returned false" << dendl
;
6252 result
= -ECANCELED
;
6256 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
6260 dout(10) << "comparison returned true" << dendl
;
6264 case CEPH_OSD_OP_ASSERT_VER
:
6267 uint64_t ver
= op
.assert_ver
.ver
;
6268 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
6271 else if (ver
< oi
.user_version
)
6273 else if (ver
> oi
.user_version
)
6274 result
= -EOVERFLOW
;
6278 case CEPH_OSD_OP_LIST_WATCHERS
:
6281 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6282 obj_list_watch_response_t resp
;
6284 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
6285 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
6287 dout(20) << "key cookie=" << oi_iter
->first
.first
6288 << " entity=" << oi_iter
->first
.second
<< " "
6289 << oi_iter
->second
<< dendl
;
6290 ceph_assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
6291 ceph_assert(oi_iter
->first
.second
.is_client());
6293 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
6294 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
6295 resp
.entries
.push_back(wi
);
6298 resp
.encode(osd_op
.outdata
, ctx
->get_features());
6301 ctx
->delta_stats
.num_rd
++;
6305 case CEPH_OSD_OP_LIST_SNAPS
:
6308 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6309 obj_list_snap_response_t resp
;
6312 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
6315 dout(20) << " snapset " << ssc
->snapset
<< dendl
;
6317 int clonecount
= ssc
->snapset
.clones
.size();
6318 clonecount
++; // for head
6319 resp
.clones
.reserve(clonecount
);
6320 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
6321 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
6323 ci
.cloneid
= *clone_iter
;
6325 hobject_t clone_oid
= soid
;
6326 clone_oid
.snap
= *clone_iter
;
6328 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
6329 if (p
== ssc
->snapset
.clone_snaps
.end()) {
6330 osd
->clog
->error() << "osd." << osd
->whoami
6331 << ": inconsistent clone_snaps found for oid "
6332 << soid
<< " clone " << *clone_iter
6333 << " snapset " << ssc
->snapset
;
6337 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
6338 ci
.snaps
.push_back(*q
);
6341 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
6343 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
6344 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
6345 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
6346 osd
->clog
->error() << "osd." << osd
->whoami
6347 << ": inconsistent clone_overlap found for oid "
6348 << soid
<< " clone " << *clone_iter
;
6352 const interval_set
<uint64_t> &o
= coi
->second
;
6353 ci
.overlap
.reserve(o
.num_intervals());
6354 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
6355 r
!= o
.end(); ++r
) {
6356 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
6360 map
<snapid_t
, uint64_t>::const_iterator si
;
6361 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
6362 if (si
== ssc
->snapset
.clone_size
.end()) {
6363 osd
->clog
->error() << "osd." << osd
->whoami
6364 << ": inconsistent clone_size found for oid "
6365 << soid
<< " clone " << *clone_iter
;
6369 ci
.size
= si
->second
;
6371 resp
.clones
.push_back(ci
);
6376 if (!ctx
->obc
->obs
.oi
.is_whiteout()) {
6377 ceph_assert(obs
.exists
);
6379 ci
.cloneid
= CEPH_NOSNAP
;
6381 //Size for HEAD is oi.size
6384 resp
.clones
.push_back(ci
);
6386 resp
.seq
= ssc
->snapset
.seq
;
6388 resp
.encode(osd_op
.outdata
);
6391 ctx
->delta_stats
.num_rd
++;
6395 case CEPH_OSD_OP_NOTIFY
:
6402 uint32_t ver
; // obsolete
6404 decode(timeout
, bp
);
6406 } catch (const buffer::error
&e
) {
6409 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
6411 timeout
= cct
->_conf
->osd_default_notify_timeout
;
6414 n
.timeout
= timeout
;
6415 n
.notify_id
= osd
->get_next_id(get_osdmap_epoch());
6416 n
.cookie
= op
.watch
.cookie
;
6418 ctx
->notifies
.push_back(n
);
6420 // return our unique notify id to the client
6421 encode(n
.notify_id
, osd_op
.outdata
);
6425 case CEPH_OSD_OP_NOTIFY_ACK
:
6429 uint64_t notify_id
= 0;
6430 uint64_t watch_cookie
= 0;
6431 decode(notify_id
, bp
);
6432 decode(watch_cookie
, bp
);
6433 bufferlist reply_bl
;
6435 decode(reply_bl
, bp
);
6437 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
6438 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
6439 ctx
->notify_acks
.push_back(ack
);
6440 } catch (const buffer::error
&e
) {
6441 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
6442 OpContext::NotifyAck
ack(
6443 // op.watch.cookie is actually the notify_id for historical reasons
6446 ctx
->notify_acks
.push_back(ack
);
6451 case CEPH_OSD_OP_SETALLOCHINT
:
6454 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
6455 maybe_create_new_object(ctx
);
6456 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
6457 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
6458 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
6459 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
6460 op
.alloc_hint
.expected_write_size
,
6461 op
.alloc_hint
.flags
);
6469 // -- object data --
6471 case CEPH_OSD_OP_WRITE
:
6474 __u32 seq
= oi
.truncate_seq
;
6475 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6476 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6481 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6482 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6484 if (pool
.info
.requires_aligned_append() &&
6485 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
6486 result
= -EOPNOTSUPP
;
6491 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
6492 result
= -EOPNOTSUPP
;
6495 } else if (op
.extent
.offset
!= oi
.size
&&
6496 pool
.info
.requires_aligned_append()) {
6497 result
= -EOPNOTSUPP
;
6501 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
6502 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
6503 // old write, arrived after trimtrunc
6504 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
6505 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
6506 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
6508 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
6509 osd_op
.indata
.swap(t
);
6511 if (op
.extent
.truncate_seq
> seq
) {
6512 // write arrives before trimtrunc
6513 if (obs
.exists
&& !oi
.is_whiteout()) {
6514 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6515 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
6516 t
->truncate(soid
, op
.extent
.truncate_size
);
6517 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6518 oi
.truncate_size
= op
.extent
.truncate_size
;
6519 if (oi
.size
> op
.extent
.truncate_size
) {
6520 interval_set
<uint64_t> trim
;
6521 trim
.insert(op
.extent
.truncate_size
,
6522 oi
.size
- op
.extent
.truncate_size
);
6523 ctx
->modified_ranges
.union_of(trim
);
6525 if (op
.extent
.truncate_size
!= oi
.size
) {
6526 truncate_update_size_and_usage(ctx
->delta_stats
,
6528 op
.extent
.truncate_size
);
6531 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6532 << ", but object is new" << dendl
;
6533 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6534 oi
.truncate_size
= op
.extent
.truncate_size
;
6537 result
= check_offset_and_length(
6538 op
.extent
.offset
, op
.extent
.length
,
6539 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6543 maybe_create_new_object(ctx
);
6545 if (op
.extent
.length
== 0) {
6546 if (op
.extent
.offset
> oi
.size
) {
6548 soid
, op
.extent
.offset
);
6554 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6557 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
6558 && !skip_data_digest
) {
6559 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6560 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
6561 if (skip_data_digest
) {
6562 obs
.oi
.clear_data_digest();
6564 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
6567 obs
.oi
.clear_data_digest();
6569 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6570 op
.extent
.offset
, op
.extent
.length
);
6575 case CEPH_OSD_OP_WRITEFULL
:
6577 { // write full object
6578 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
6580 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6584 result
= check_offset_and_length(
6585 0, op
.extent
.length
,
6586 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6590 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6591 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6593 maybe_create_new_object(ctx
);
6594 if (pool
.info
.is_erasure()) {
6595 t
->truncate(soid
, 0);
6596 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
6597 t
->truncate(soid
, op
.extent
.length
);
6599 if (op
.extent
.length
) {
6600 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6602 if (!skip_data_digest
) {
6603 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6605 obs
.oi
.clear_data_digest();
6608 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6609 0, op
.extent
.length
, true);
6613 case CEPH_OSD_OP_WRITESAME
:
6615 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
6616 result
= do_writesame(ctx
, osd_op
);
6619 case CEPH_OSD_OP_ROLLBACK
:
6621 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6622 result
= _rollback_to(ctx
, op
);
6625 case CEPH_OSD_OP_ZERO
:
6626 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6627 if (pool
.info
.requires_aligned_append()) {
6628 result
= -EOPNOTSUPP
;
6633 result
= check_offset_and_length(
6634 op
.extent
.offset
, op
.extent
.length
,
6635 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6639 ceph_assert(op
.extent
.length
);
6640 if (obs
.exists
&& !oi
.is_whiteout()) {
6641 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
6642 interval_set
<uint64_t> ch
;
6643 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
6644 ctx
->modified_ranges
.union_of(ch
);
6645 ctx
->delta_stats
.num_wr
++;
6646 oi
.clear_data_digest();
6652 case CEPH_OSD_OP_CREATE
:
6655 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6656 int flags
= le32_to_cpu(op
.flags
);
6657 if (obs
.exists
&& !oi
.is_whiteout() &&
6658 (flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
6659 result
= -EEXIST
; /* this is an exclusive create */
6661 if (osd_op
.indata
.length()) {
6662 auto p
= osd_op
.indata
.cbegin();
6665 decode(category
, p
);
6667 catch (buffer::error
& e
) {
6671 // category is no longer implemented.
6674 maybe_create_new_object(ctx
);
6681 case CEPH_OSD_OP_TRIMTRUNC
:
6682 op
.extent
.offset
= op
.extent
.truncate_size
;
6685 case CEPH_OSD_OP_TRUNCATE
:
6686 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6687 if (pool
.info
.requires_aligned_append()) {
6688 result
= -EOPNOTSUPP
;
6694 if (!obs
.exists
|| oi
.is_whiteout()) {
6695 dout(10) << " object dne, truncate is a no-op" << dendl
;
6699 result
= check_offset_and_length(
6700 op
.extent
.offset
, op
.extent
.length
,
6701 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6705 if (op
.extent
.truncate_seq
) {
6706 ceph_assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6707 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6708 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6709 << ", no-op" << dendl
;
6712 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6713 << ", truncating" << dendl
;
6714 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6715 oi
.truncate_size
= op
.extent
.truncate_size
;
6718 maybe_create_new_object(ctx
);
6719 t
->truncate(soid
, op
.extent
.offset
);
6720 if (oi
.size
> op
.extent
.offset
) {
6721 interval_set
<uint64_t> trim
;
6722 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6723 ctx
->modified_ranges
.union_of(trim
);
6725 if (op
.extent
.offset
!= oi
.size
) {
6726 truncate_update_size_and_usage(ctx
->delta_stats
,
6730 ctx
->delta_stats
.num_wr
++;
6731 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6733 oi
.clear_data_digest();
6737 case CEPH_OSD_OP_DELETE
:
6739 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6741 if (oi
.has_manifest()) {
6742 if ((oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
) && oi
.manifest
.is_redirect()) {
6743 ctx
->register_on_commit(
6745 object_locator_t
target_oloc(oi
.manifest
.redirect_target
);
6746 refcount_manifest(ctx
->obc
, target_oloc
, oi
.manifest
.redirect_target
,
6747 SnapContext(), false, NULL
, 0);
6749 } else if (oi
.manifest
.is_chunked()) {
6750 ctx
->register_on_commit(
6752 for (auto p
: oi
.manifest
.chunk_map
) {
6753 if (p
.second
.has_reference()) {
6754 object_locator_t
target_oloc(p
.second
.oid
);
6755 refcount_manifest(ctx
->obc
, target_oloc
, p
.second
.oid
,
6756 SnapContext(), false, NULL
, p
.first
);
6762 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6766 case CEPH_OSD_OP_WATCH
:
6769 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6770 op
.watch
.cookie
, op
.watch
.op
);
6775 uint64_t cookie
= op
.watch
.cookie
;
6776 entity_name_t entity
= ctx
->reqid
.name
;
6777 ObjectContextRef obc
= ctx
->obc
;
6779 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6780 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6781 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6782 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6783 dout(10) << "watch: peer_addr="
6784 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6786 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6787 if (op
.watch
.timeout
!= 0) {
6788 timeout
= op
.watch
.timeout
;
6791 watch_info_t
w(cookie
, timeout
,
6792 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6793 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6794 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6795 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6796 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6798 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6799 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6800 t
->nop(soid
); // make sure update the object_info on disk!
6802 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6803 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6804 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
6805 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6809 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6810 ctx
->watch_connects
.push_back(make_pair(w
, true));
6811 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
6812 /* Note: WATCH with PING doesn't cause may_write() to return true,
6813 * so if there is nothing else in the transaction, this is going
6814 * to run do_osd_op_effects, but not write out a log entry */
6815 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6819 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
6820 obc
->watchers
.find(make_pair(cookie
, entity
));
6821 if (p
== obc
->watchers
.end() ||
6822 !p
->second
->is_connected()) {
6823 // client needs to reconnect
6824 result
= -ETIMEDOUT
;
6827 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6828 p
->second
->got_ping(ceph_clock_now());
6830 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
6831 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
6832 oi
.watchers
.find(make_pair(cookie
, entity
));
6833 if (oi_iter
!= oi
.watchers
.end()) {
6834 dout(10) << " removed watch " << oi_iter
->second
<< " by "
6836 oi
.watchers
.erase(oi_iter
);
6837 t
->nop(soid
); // update oi on disk
6838 ctx
->watch_disconnects
.push_back(
6839 watch_disconnect_t(cookie
, entity
, false));
6841 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
6847 case CEPH_OSD_OP_CACHE_PIN
:
6848 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6849 if ((!pool
.info
.is_tier() ||
6850 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6852 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6857 if (!obs
.exists
|| oi
.is_whiteout()) {
6862 if (!oi
.is_cache_pinned()) {
6863 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
6865 ctx
->delta_stats
.num_objects_pinned
++;
6866 ctx
->delta_stats
.num_wr
++;
6872 case CEPH_OSD_OP_CACHE_UNPIN
:
6873 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6874 if ((!pool
.info
.is_tier() ||
6875 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6877 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6882 if (!obs
.exists
|| oi
.is_whiteout()) {
6887 if (oi
.is_cache_pinned()) {
6888 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
6890 ctx
->delta_stats
.num_objects_pinned
--;
6891 ctx
->delta_stats
.num_wr
++;
6897 case CEPH_OSD_OP_SET_REDIRECT
:
6900 if (pool
.info
.is_tier()) {
6908 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
6909 result
= -EOPNOTSUPP
;
6913 object_t target_name
;
6914 object_locator_t target_oloc
;
6915 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
6916 version_t target_version
= op
.copy_from
.src_version
;
6918 decode(target_name
, bp
);
6919 decode(target_oloc
, bp
);
6921 catch (buffer::error
& e
) {
6926 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
6927 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
6928 raw_pg
.ps(), raw_pg
.pool(),
6929 target_oloc
.nspace
);
6930 if (target
== soid
) {
6931 dout(20) << " set-redirect self is invalid" << dendl
;
6936 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
6937 bool has_reference
= (oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
6938 if (has_reference
) {
6940 dout(5) << " the object is already a manifest " << dendl
;
6943 if (op_finisher
== nullptr && need_reference
) {
6945 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
6946 new SetManifestFinisher(osd_op
));
6947 RefCountCallback
*fin
= new RefCountCallback(
6948 this, ctx
, osd_op
, get_last_peering_reset());
6949 refcount_manifest(ctx
->obc
, target_oloc
, target
, SnapContext(),
6951 result
= -EINPROGRESS
;
6955 result
= op_finisher
->execute();
6956 ceph_assert(result
== 0);
6959 if (!oi
.has_manifest() && !oi
.manifest
.is_redirect())
6960 ctx
->delta_stats
.num_objects_manifest
++;
6962 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
6963 oi
.manifest
.redirect_target
= target
;
6964 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
6965 t
->truncate(soid
, 0);
6966 if (oi
.is_omap() && pool
.info
.supports_omap()) {
6967 t
->omap_clear(soid
);
6968 obs
.oi
.clear_omap_digest();
6969 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6971 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6974 oi
.user_version
= target_version
;
6975 ctx
->user_at_version
= target_version
;
6977 map
<string
,bufferlist
> rmattrs
;
6978 result
= getattrs_maybe_cache(ctx
->obc
, &rmattrs
);
6982 map
<string
, bufferlist
>::iterator iter
;
6983 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
6984 const string
& name
= iter
->first
;
6985 t
->rmattr(soid
, name
);
6987 if (!has_reference
&& need_reference
) {
6988 oi
.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
6990 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
6992 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
6999 case CEPH_OSD_OP_SET_CHUNK
:
7002 if (pool
.info
.is_tier()) {
7010 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7011 result
= -EOPNOTSUPP
;
7015 object_locator_t tgt_oloc
;
7016 uint64_t src_offset
, src_length
, tgt_offset
;
7019 decode(src_offset
, bp
);
7020 decode(src_length
, bp
);
7021 decode(tgt_oloc
, bp
);
7022 decode(tgt_name
, bp
);
7023 decode(tgt_offset
, bp
);
7025 catch (buffer::error
& e
) {
7035 for (auto &p
: oi
.manifest
.chunk_map
) {
7036 if ((p
.first
<= src_offset
&& p
.first
+ p
.second
.length
> src_offset
) ||
7037 (p
.first
> src_offset
&& p
.first
<= src_offset
+ src_length
)) {
7038 dout(20) << __func__
<< " overlapped !! offset: " << src_offset
<< " length: " << src_length
7039 << " chunk_info: " << p
<< dendl
;
7040 result
= -EOPNOTSUPP
;
7045 if (!oi
.manifest
.is_chunked()) {
7046 oi
.manifest
.clear();
7050 chunk_info_t chunk_info
;
7051 get_osdmap()->object_locator_to_pg(tgt_name
, tgt_oloc
, raw_pg
);
7052 hobject_t
target(tgt_name
, tgt_oloc
.key
, snapid_t(),
7053 raw_pg
.ps(), raw_pg
.pool(),
7055 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
7056 bool has_reference
= (oi
.manifest
.chunk_map
.find(src_offset
) != oi
.manifest
.chunk_map
.end()) &&
7057 (oi
.manifest
.chunk_map
[src_offset
].flags
& chunk_info_t::FLAG_HAS_REFERENCE
);
7058 if (has_reference
) {
7060 dout(5) << " the object is already a manifest " << dendl
;
7063 if (op_finisher
== nullptr && need_reference
) {
7065 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7066 new SetManifestFinisher(osd_op
));
7067 RefCountCallback
*fin
= new RefCountCallback(
7068 this, ctx
, osd_op
, get_last_peering_reset());
7069 refcount_manifest(ctx
->obc
, tgt_oloc
, target
, SnapContext(),
7070 true, fin
, src_offset
);
7071 result
= -EINPROGRESS
;
7074 result
= op_finisher
->execute();
7075 ceph_assert(result
== 0);
7078 chunk_info_t chunk_info
;
7079 chunk_info
.set_flag(chunk_info_t::FLAG_MISSING
);
7080 chunk_info
.oid
= target
;
7081 chunk_info
.offset
= tgt_offset
;
7082 chunk_info
.length
= src_length
;
7083 oi
.manifest
.chunk_map
[src_offset
] = chunk_info
;
7084 if (!oi
.has_manifest() && !oi
.manifest
.is_chunked())
7085 ctx
->delta_stats
.num_objects_manifest
++;
7086 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7087 oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
7088 if (!has_reference
&& need_reference
) {
7089 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_REFERENCE
);
7091 if (need_reference
&& pool
.info
.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE
) {
7092 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT
);
7096 dout(10) << "set-chunked oid:" << oi
.soid
<< " user_version: " << oi
.user_version
7097 << " chunk_info: " << chunk_info
<< dendl
;
7099 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7106 case CEPH_OSD_OP_TIER_PROMOTE
:
7109 if (pool
.info
.is_tier()) {
7117 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7118 result
= -EOPNOTSUPP
;
7121 if (!obs
.oi
.has_manifest()) {
7126 if (op_finisher
== nullptr) {
7127 PromoteManifestCallback
*cb
;
7128 object_locator_t my_oloc
;
7131 if (obs
.oi
.manifest
.is_chunked()) {
7132 src_hoid
= obs
.oi
.soid
;
7133 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7134 } else if (obs
.oi
.manifest
.is_redirect()) {
7135 object_locator_t
src_oloc(obs
.oi
.manifest
.redirect_target
);
7137 src_hoid
= obs
.oi
.manifest
.redirect_target
;
7138 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7140 ceph_abort_msg("unrecognized manifest type");
7142 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7143 new PromoteFinisher(cb
));
7144 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
7145 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
7146 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
7147 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
7148 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
7149 start_copy(cb
, ctx
->obc
, src_hoid
, my_oloc
, 0, flags
,
7150 obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
7151 src_fadvise_flags
, 0);
7153 dout(10) << "tier-promote oid:" << oi
.soid
<< " manifest: " << obs
.oi
.manifest
<< dendl
;
7154 result
= -EINPROGRESS
;
7156 result
= op_finisher
->execute();
7157 ceph_assert(result
== 0);
7158 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7164 case CEPH_OSD_OP_UNSET_MANIFEST
:
7167 if (pool
.info
.is_tier()) {
7175 if (!oi
.has_manifest()) {
7176 result
= -EOPNOTSUPP
;
7179 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7180 result
= -EOPNOTSUPP
;
7184 if (oi
.manifest
.is_redirect()) {
7185 if ((oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
7186 ctx
->register_on_commit(
7188 object_locator_t
target_oloc(oi
.manifest
.redirect_target
);
7189 refcount_manifest(ctx
->obc
, target_oloc
, oi
.manifest
.redirect_target
,
7190 SnapContext(), false, NULL
, 0);
7193 } else if (oi
.manifest
.is_chunked()) {
7194 ctx
->register_on_commit(
7196 for (auto p
: oi
.manifest
.chunk_map
) {
7197 if (p
.second
.flags
& chunk_info_t::FLAG_HAS_REFERENCE
) {
7198 object_locator_t
target_oloc(p
.second
.oid
);
7199 refcount_manifest(ctx
->obc
, target_oloc
, p
.second
.oid
,
7200 SnapContext(), false, NULL
, p
.first
);
7205 ceph_abort_msg("unrecognized manifest type");
7208 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
7209 oi
.manifest
= object_manifest_t();
7210 ctx
->delta_stats
.num_objects_manifest
--;
7211 ctx
->delta_stats
.num_wr
++;
7217 // -- object attrs --
7219 case CEPH_OSD_OP_SETXATTR
:
7222 if (cct
->_conf
->osd_max_attr_size
> 0 &&
7223 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
7224 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7228 unsigned max_name_len
=
7229 std::min
<uint64_t>(osd
->store
->get_max_attr_name_length(),
7230 cct
->_conf
->osd_max_attr_name_len
);
7231 if (op
.xattr
.name_len
> max_name_len
) {
7232 result
= -ENAMETOOLONG
;
7235 maybe_create_new_object(ctx
);
7237 bp
.copy(op
.xattr
.name_len
, aname
);
7238 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7239 string name
= "_" + aname
;
7241 bp
.copy(op
.xattr
.value_len
, bl
);
7242 t
->setattr(soid
, name
, bl
);
7243 ctx
->delta_stats
.num_wr
++;
7247 case CEPH_OSD_OP_RMXATTR
:
7251 bp
.copy(op
.xattr
.name_len
, aname
);
7252 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7253 if (!obs
.exists
|| oi
.is_whiteout()) {
7257 string name
= "_" + aname
;
7258 t
->rmattr(soid
, name
);
7259 ctx
->delta_stats
.num_wr
++;
7264 // -- fancy writers --
7265 case CEPH_OSD_OP_APPEND
:
7267 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
7268 // just do it inline; this works because we are happy to execute
7269 // fancy op on replicas as well.
7270 vector
<OSDOp
> nops(1);
7271 OSDOp
& newop
= nops
[0];
7272 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
7273 newop
.op
.extent
.offset
= oi
.size
;
7274 newop
.op
.extent
.length
= op
.extent
.length
;
7275 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
7276 newop
.indata
= osd_op
.indata
;
7277 result
= do_osd_ops(ctx
, nops
);
7278 osd_op
.outdata
.claim(newop
.outdata
);
7282 case CEPH_OSD_OP_STARTSYNC
:
7286 // -- trivial map --
7287 case CEPH_OSD_OP_TMAPGET
:
7288 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7289 if (pool
.info
.is_erasure()) {
7290 result
= -EOPNOTSUPP
;
7294 vector
<OSDOp
> nops(1);
7295 OSDOp
& newop
= nops
[0];
7296 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
7297 newop
.op
.extent
.offset
= 0;
7298 newop
.op
.extent
.length
= 0;
7299 do_osd_ops(ctx
, nops
);
7300 osd_op
.outdata
.claim(newop
.outdata
);
7304 case CEPH_OSD_OP_TMAPPUT
:
7305 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7306 if (pool
.info
.is_erasure()) {
7307 result
= -EOPNOTSUPP
;
7311 //_dout_lock.Lock();
7312 //osd_op.data.hexdump(*_dout);
7313 //_dout_lock.Unlock();
7315 // verify sort order
7316 bool unsorted
= false;
7326 dout(10) << "tmapput key " << key
<< dendl
;
7329 if (key
< last_key
) {
7330 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
7339 vector
<OSDOp
> nops(1);
7340 OSDOp
& newop
= nops
[0];
7341 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
7342 newop
.op
.extent
.offset
= 0;
7343 newop
.op
.extent
.length
= osd_op
.indata
.length();
7344 newop
.indata
= osd_op
.indata
;
7347 bp
= osd_op
.indata
.begin();
7349 map
<string
, bufferlist
> m
;
7352 ceph_assert(bp
.end());
7354 encode(header
, newbl
);
7356 newop
.indata
= newbl
;
7358 result
= do_osd_ops(ctx
, nops
);
7359 ceph_assert(result
== 0);
7363 case CEPH_OSD_OP_TMAPUP
:
7364 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7365 if (pool
.info
.is_erasure()) {
7366 result
= -EOPNOTSUPP
;
7370 result
= do_tmapup(ctx
, bp
, osd_op
);
7373 case CEPH_OSD_OP_TMAP2OMAP
:
7375 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7376 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
7380 case CEPH_OSD_OP_OMAPGETKEYS
:
7384 uint64_t max_return
;
7386 decode(start_after
, bp
);
7387 decode(max_return
, bp
);
7389 catch (buffer::error
& e
) {
7391 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
7394 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7395 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7397 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
7401 bool truncated
= false;
7403 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7404 ch
, ghobject_t(soid
)
7407 iter
->upper_bound(start_after
);
7408 for (num
= 0; iter
->valid(); ++num
, iter
->next()) {
7409 if (num
>= max_return
||
7410 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7414 encode(iter
->key(), bl
);
7416 } // else return empty out_set
7417 encode(num
, osd_op
.outdata
);
7418 osd_op
.outdata
.claim_append(bl
);
7419 encode(truncated
, osd_op
.outdata
);
7420 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7421 ctx
->delta_stats
.num_rd
++;
7425 case CEPH_OSD_OP_OMAPGETVALS
:
7429 uint64_t max_return
;
7430 string filter_prefix
;
7432 decode(start_after
, bp
);
7433 decode(max_return
, bp
);
7434 decode(filter_prefix
, bp
);
7436 catch (buffer::error
& e
) {
7438 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
7441 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7442 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7444 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
7447 bool truncated
= false;
7450 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7451 ch
, ghobject_t(soid
)
7457 iter
->upper_bound(start_after
);
7458 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
7461 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
7462 ++num
, iter
->next()) {
7463 dout(20) << "Found key " << iter
->key() << dendl
;
7464 if (num
>= max_return
||
7465 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7469 encode(iter
->key(), bl
);
7470 encode(iter
->value(), bl
);
7472 } // else return empty out_set
7473 encode(num
, osd_op
.outdata
);
7474 osd_op
.outdata
.claim_append(bl
);
7475 encode(truncated
, osd_op
.outdata
);
7476 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7477 ctx
->delta_stats
.num_rd
++;
7481 case CEPH_OSD_OP_OMAPGETHEADER
:
7482 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7483 if (!oi
.is_omap()) {
7484 // return empty header
7489 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
7490 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7491 ctx
->delta_stats
.num_rd
++;
7495 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
7498 set
<string
> keys_to_get
;
7500 decode(keys_to_get
, bp
);
7502 catch (buffer::error
& e
) {
7504 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7507 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
7508 map
<string
, bufferlist
> out
;
7510 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
7511 } // else return empty omap entries
7512 encode(out
, osd_op
.outdata
);
7513 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7514 ctx
->delta_stats
.num_rd
++;
7518 case CEPH_OSD_OP_OMAP_CMP
:
7521 if (!obs
.exists
|| oi
.is_whiteout()) {
7523 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7526 map
<string
, pair
<bufferlist
, int> > assertions
;
7528 decode(assertions
, bp
);
7530 catch (buffer::error
& e
) {
7532 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7535 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
7537 map
<string
, bufferlist
> out
;
7541 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7542 i
!= assertions
.end();
7544 to_get
.insert(i
->first
);
7545 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
7551 } // else leave out empty
7553 //Should set num_rd_kb based on encode length of map
7554 ctx
->delta_stats
.num_rd
++;
7558 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7559 i
!= assertions
.end();
7561 auto out_entry
= out
.find(i
->first
);
7562 bufferlist
&bl
= (out_entry
!= out
.end()) ?
7563 out_entry
->second
: empty
;
7564 switch (i
->second
.second
) {
7565 case CEPH_OSD_CMPXATTR_OP_EQ
:
7566 if (!(bl
== i
->second
.first
)) {
7570 case CEPH_OSD_CMPXATTR_OP_LT
:
7571 if (!(bl
< i
->second
.first
)) {
7575 case CEPH_OSD_CMPXATTR_OP_GT
:
7576 if (!(bl
> i
->second
.first
)) {
7594 case CEPH_OSD_OP_OMAPSETVALS
:
7595 if (!pool
.info
.supports_omap()) {
7596 result
= -EOPNOTSUPP
;
7597 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7602 maybe_create_new_object(ctx
);
7603 bufferlist to_set_bl
;
7605 decode_str_str_map_to_bl(bp
, &to_set_bl
);
7607 catch (buffer::error
& e
) {
7609 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7612 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7613 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 20>()) {
7614 dout(20) << "setting vals: " << dendl
;
7615 map
<string
,bufferlist
> to_set
;
7616 bufferlist::const_iterator pt
= to_set_bl
.begin();
7618 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
7621 dout(20) << "\t" << i
->first
<< dendl
;
7624 t
->omap_setkeys(soid
, to_set_bl
);
7625 ctx
->delta_stats
.num_wr
++;
7626 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(to_set_bl
.length(), 10);
7628 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7629 obs
.oi
.clear_omap_digest();
7632 case CEPH_OSD_OP_OMAPSETHEADER
:
7633 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7634 if (!pool
.info
.supports_omap()) {
7635 result
= -EOPNOTSUPP
;
7640 maybe_create_new_object(ctx
);
7641 t
->omap_setheader(soid
, osd_op
.indata
);
7642 ctx
->delta_stats
.num_wr
++;
7644 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7645 obs
.oi
.clear_omap_digest();
7648 case CEPH_OSD_OP_OMAPCLEAR
:
7649 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7650 if (!pool
.info
.supports_omap()) {
7651 result
= -EOPNOTSUPP
;
7656 if (!obs
.exists
|| oi
.is_whiteout()) {
7661 t
->omap_clear(soid
);
7662 ctx
->delta_stats
.num_wr
++;
7663 obs
.oi
.clear_omap_digest();
7664 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7669 case CEPH_OSD_OP_OMAPRMKEYS
:
7670 if (!pool
.info
.supports_omap()) {
7671 result
= -EOPNOTSUPP
;
7672 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7677 if (!obs
.exists
|| oi
.is_whiteout()) {
7679 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7682 bufferlist to_rm_bl
;
7684 decode_str_set_to_bl(bp
, &to_rm_bl
);
7686 catch (buffer::error
& e
) {
7688 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7691 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7692 t
->omap_rmkeys(soid
, to_rm_bl
);
7693 ctx
->delta_stats
.num_wr
++;
7695 obs
.oi
.clear_omap_digest();
7698 case CEPH_OSD_OP_COPY_GET
:
7700 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
7702 if (op_finisher
== nullptr) {
7703 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
7705 result
= op_finisher
->execute();
7709 case CEPH_OSD_OP_COPY_FROM
:
7713 object_locator_t src_oloc
;
7714 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
7715 version_t src_version
= op
.copy_from
.src_version
;
7717 decode(src_name
, bp
);
7718 decode(src_oloc
, bp
);
7720 catch (buffer::error
& e
) {
7723 do_osd_op_pre_copy_from
,
7724 soid
.oid
.name
.c_str(),
7736 do_osd_op_pre_copy_from
,
7737 soid
.oid
.name
.c_str(),
7739 src_name
.name
.c_str(),
7741 src_oloc
.key
.c_str(),
7742 src_oloc
.nspace
.c_str(),
7746 if (op_finisher
== nullptr) {
7749 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
7750 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
7751 raw_pg
.ps(), raw_pg
.pool(),
7754 dout(20) << " copy from self is invalid" << dendl
;
7758 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
7759 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7760 new CopyFromFinisher(cb
));
7761 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
7764 op
.copy_from
.src_fadvise_flags
,
7766 result
= -EINPROGRESS
;
7769 result
= op_finisher
->execute();
7770 ceph_assert(result
== 0);
7772 // COPY_FROM cannot be executed multiple times -- it must restart
7773 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7779 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
7780 dout(1) << "unrecognized osd op " << op
.op
7781 << " " << ceph_osd_op_name(op
.op
)
7783 result
= -EOPNOTSUPP
;
7787 osd_op
.rval
= result
;
7788 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
7789 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
) &&
7790 result
!= -EAGAIN
&& result
!= -EINPROGRESS
)
7799 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
7801 if (ctx
->new_obs
.oi
.size
== 0) {
7802 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
7805 vector
<OSDOp
> nops(1);
7806 OSDOp
&newop
= nops
[0];
7807 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
7808 do_osd_ops(ctx
, nops
);
7810 bufferlist::const_iterator i
= newop
.outdata
.begin();
7812 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
7814 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
7818 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
7823 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
7826 // verify that all clones have been evicted
7827 dout(20) << __func__
<< " verifying clones are absent "
7829 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
7830 p
!= ss
.clones
.end();
7832 hobject_t clone_oid
= soid
;
7833 clone_oid
.snap
= *p
;
7834 if (is_missing_object(clone_oid
))
7836 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
7837 if (clone_obc
&& clone_obc
->obs
.exists
) {
7838 dout(10) << __func__
<< " cannot evict head before clone "
7839 << clone_oid
<< dendl
;
7842 if (copy_ops
.count(clone_oid
)) {
7843 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
7844 << clone_oid
<< dendl
;
7851 inline int PrimaryLogPG::_delete_oid(
7853 bool no_whiteout
, // no whiteouts, no matter what.
7854 bool try_no_whiteout
) // try not to whiteout
7856 SnapSet
& snapset
= ctx
->new_snapset
;
7857 ObjectState
& obs
= ctx
->new_obs
;
7858 object_info_t
& oi
= obs
.oi
;
7859 const hobject_t
& soid
= oi
.soid
;
7860 PGTransaction
* t
= ctx
->op_t
.get();
7862 // cache: cache: set whiteout on delete?
7863 bool whiteout
= false;
7864 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
7866 && !try_no_whiteout
) {
7870 // in luminous or later, we can't delete the head if there are
7871 // clones. we trust the caller passing no_whiteout has already
7872 // verified they don't exist.
7873 if (!snapset
.clones
.empty() ||
7874 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
7876 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
7879 dout(20) << __func__
<< " has or will have clones; will whiteout"
7884 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
7885 << " no_whiteout=" << (int)no_whiteout
7886 << " try_no_whiteout=" << (int)try_no_whiteout
7888 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
7894 interval_set
<uint64_t> ch
;
7895 ch
.insert(0, oi
.size
);
7896 ctx
->modified_ranges
.union_of(ch
);
7899 ctx
->delta_stats
.num_wr
++;
7900 if (soid
.is_snap()) {
7901 ceph_assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
7902 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
7904 ctx
->delta_stats
.num_bytes
-= oi
.size
;
7909 // disconnect all watchers
7910 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
7911 oi
.watchers
.begin();
7912 p
!= oi
.watchers
.end();
7914 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
7915 ctx
->watch_disconnects
.push_back(
7916 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
7918 oi
.watchers
.clear();
7921 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
7922 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
7923 ctx
->delta_stats
.num_whiteouts
++;
7925 osd
->logger
->inc(l_osd_tier_whiteout
);
7930 ctx
->delta_stats
.num_objects
--;
7932 ctx
->delta_stats
.num_object_clones
--;
7933 if (oi
.is_whiteout()) {
7934 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
7935 ctx
->delta_stats
.num_whiteouts
--;
7936 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
7938 if (oi
.is_cache_pinned()) {
7939 ctx
->delta_stats
.num_objects_pinned
--;
7941 if (oi
.has_manifest()) {
7942 ctx
->delta_stats
.num_objects_manifest
--;
7948 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, ceph_osd_op
& op
)
7950 SnapSet
& snapset
= ctx
->new_snapset
;
7951 ObjectState
& obs
= ctx
->new_obs
;
7952 object_info_t
& oi
= obs
.oi
;
7953 const hobject_t
& soid
= oi
.soid
;
7954 PGTransaction
* t
= ctx
->op_t
.get();
7955 snapid_t snapid
= (uint64_t)op
.snap
.snapid
;
7956 hobject_t missing_oid
;
7958 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
7960 ObjectContextRef rollback_to
;
7962 int ret
= find_object_context(
7963 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
7964 soid
.get_namespace()),
7965 &rollback_to
, false, false, &missing_oid
);
7966 if (ret
== -EAGAIN
) {
7967 /* clone must be missing */
7968 ceph_assert(is_degraded_or_backfilling_object(missing_oid
) || is_degraded_on_async_recovery_target(missing_oid
));
7969 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7970 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
7971 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
7975 ObjectContextRef promote_obc
;
7976 cache_result_t tier_mode_result
;
7977 if (obs
.exists
&& obs
.oi
.has_manifest()) {
7979 maybe_handle_manifest_detail(
7985 maybe_handle_cache_detail(
7995 switch (tier_mode_result
) {
7996 case cache_result_t::NOOP
:
7998 case cache_result_t::BLOCKED_PROMOTE
:
7999 ceph_assert(promote_obc
);
8000 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
8002 case cache_result_t::BLOCKED_FULL
:
8003 block_write_on_full_cache(soid
, ctx
->op
);
8005 case cache_result_t::REPLIED_WITH_EAGAIN
:
8006 ceph_abort_msg("this can't happen, no rollback on replica");
8008 ceph_abort_msg("must promote was set, other values are not valid");
8013 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
8014 // there's no snapshot here, or there's no object.
8015 // if there's no snapshot, we delete the object; otherwise, do nothing.
8016 dout(20) << "_rollback_to deleting head on " << soid
.oid
8017 << " because got ENOENT|whiteout on find_object_context" << dendl
;
8018 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
8019 // Cannot delete an object with watchers
8022 _delete_oid(ctx
, false, false);
8026 // ummm....huh? It *can't* return anything else at time of writing.
8027 ceph_abort_msg("unexpected error code in _rollback_to");
8028 } else { //we got our context, let's use it to do the rollback!
8029 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
8030 if (is_degraded_or_backfilling_object(rollback_to_sobject
) ||
8031 is_degraded_on_async_recovery_target(rollback_to_sobject
)) {
8032 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8033 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
8034 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
8036 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
8037 // rolling back to the head; we just need to clone it.
8040 /* 1) Delete current head
8041 * 2) Clone correct snapshot into head
8042 * 3) Calculate clone_overlaps by following overlaps
8043 * forward from rollback snapshot */
8044 dout(10) << "_rollback_to deleting " << soid
.oid
8045 << " and rolling back to old snap" << dendl
;
8050 t
->clone(soid
, rollback_to_sobject
);
8051 t
->add_obc(rollback_to
);
8053 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
8054 snapset
.clone_overlap
.lower_bound(snapid
);
8055 ceph_assert(iter
!= snapset
.clone_overlap
.end());
8056 interval_set
<uint64_t> overlaps
= iter
->second
;
8058 iter
!= snapset
.clone_overlap
.end();
8060 overlaps
.intersection_of(iter
->second
);
8062 if (obs
.oi
.size
> 0) {
8063 interval_set
<uint64_t> modified
;
8064 modified
.insert(0, obs
.oi
.size
);
8065 overlaps
.intersection_of(modified
);
8066 modified
.subtract(overlaps
);
8067 ctx
->modified_ranges
.union_of(modified
);
8070 // Adjust the cached objectcontext
8071 maybe_create_new_object(ctx
, true);
8072 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8073 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
8074 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
8075 if (rollback_to
->obs
.oi
.is_data_digest())
8076 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
8078 obs
.oi
.clear_data_digest();
8079 if (rollback_to
->obs
.oi
.is_omap_digest())
8080 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
8082 obs
.oi
.clear_omap_digest();
8084 if (rollback_to
->obs
.oi
.is_omap()) {
8085 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8086 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8088 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8089 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
8096 void PrimaryLogPG::_make_clone(
8099 ObjectContextRef obc
,
8100 const hobject_t
& head
, const hobject_t
& coid
,
8104 encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8106 t
->clone(coid
, head
);
8107 setattr_maybe_cache(obc
, t
, OI_ATTR
, bv
);
8108 rmattr_maybe_cache(obc
, t
, SS_ATTR
);
8111 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
8113 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8114 SnapContext
& snapc
= ctx
->snapc
;
8117 ceph_assert(soid
.snap
== CEPH_NOSNAP
);
8118 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
8119 << " snapc=" << snapc
<< dendl
;
8121 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
8122 if (ctx
->new_obs
.exists
) {
8123 // we will mark the object dirty
8124 if (ctx
->undirty
&& was_dirty
) {
8125 dout(20) << " clearing DIRTY flag" << dendl
;
8126 ceph_assert(ctx
->new_obs
.oi
.is_dirty());
8127 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8128 --ctx
->delta_stats
.num_objects_dirty
;
8129 osd
->logger
->inc(l_osd_tier_clean
);
8130 } else if (!was_dirty
&& !ctx
->undirty
) {
8131 dout(20) << " setting DIRTY flag" << dendl
;
8132 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
8133 ++ctx
->delta_stats
.num_objects_dirty
;
8134 osd
->logger
->inc(l_osd_tier_dirty
);
8138 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
8139 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8140 --ctx
->delta_stats
.num_objects_dirty
;
8144 if ((ctx
->new_obs
.exists
&&
8145 ctx
->new_obs
.oi
.is_omap()) &&
8146 (!ctx
->obc
->obs
.exists
||
8147 !ctx
->obc
->obs
.oi
.is_omap())) {
8148 ++ctx
->delta_stats
.num_objects_omap
;
8150 if ((!ctx
->new_obs
.exists
||
8151 !ctx
->new_obs
.oi
.is_omap()) &&
8152 (ctx
->obc
->obs
.exists
&&
8153 ctx
->obc
->obs
.oi
.is_omap())) {
8154 --ctx
->delta_stats
.num_objects_omap
;
8157 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
8158 dout(10) << " op snapset is old" << dendl
;
8161 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
8162 snapc
.snaps
.size() && // there are snaps
8163 !ctx
->cache_evict
&&
8164 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
8166 hobject_t coid
= soid
;
8167 coid
.snap
= snapc
.seq
;
8171 l
< snapc
.snaps
.size() && snapc
.snaps
[l
] > ctx
->new_snapset
.seq
;
8174 vector
<snapid_t
> snaps(l
);
8175 for (unsigned i
=0; i
<l
; i
++)
8176 snaps
[i
] = snapc
.snaps
[i
];
8179 object_info_t
static_snap_oi(coid
);
8180 object_info_t
*snap_oi
;
8182 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
8183 ctx
->clone_obc
->destructor_callback
=
8184 new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
8185 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
8186 ctx
->clone_obc
->obs
.exists
= true;
8187 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
8188 ctx
->clone_obc
->ssc
->ref
++;
8189 if (pool
.info
.is_erasure())
8190 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
8191 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
8192 bool got
= ctx
->lock_manager
.get_write_greedy(
8197 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
8199 snap_oi
= &static_snap_oi
;
8201 snap_oi
->version
= ctx
->at_version
;
8202 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
8203 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
8205 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
8207 ctx
->delta_stats
.num_objects
++;
8208 if (snap_oi
->is_dirty()) {
8209 ctx
->delta_stats
.num_objects_dirty
++;
8210 osd
->logger
->inc(l_osd_tier_dirty
);
8212 if (snap_oi
->is_omap())
8213 ctx
->delta_stats
.num_objects_omap
++;
8214 if (snap_oi
->is_cache_pinned())
8215 ctx
->delta_stats
.num_objects_pinned
++;
8216 if (snap_oi
->has_manifest())
8217 ctx
->delta_stats
.num_objects_manifest
++;
8218 ctx
->delta_stats
.num_object_clones
++;
8219 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
8220 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
8221 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
8223 // clone_overlap should contain an entry for each clone
8224 // (an empty interval_set if there is no overlap)
8225 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
8226 if (ctx
->obs
->oi
.size
)
8227 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
8230 dout(10) << " cloning v " << ctx
->obs
->oi
.version
8231 << " to " << coid
<< " v " << ctx
->at_version
8232 << " snaps=" << snaps
8233 << " snapset=" << ctx
->new_snapset
<< dendl
;
8234 ctx
->log
.push_back(pg_log_entry_t(
8235 pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
8236 ctx
->obs
->oi
.version
,
8237 ctx
->obs
->oi
.user_version
,
8238 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
8239 encode(snaps
, ctx
->log
.back().snaps
);
8241 ctx
->at_version
.version
++;
8244 // update most recent clone_overlap and usage stats
8245 if (ctx
->new_snapset
.clones
.size() > 0) {
8246 // the clone_overlap is difference of range between head and clones.
8247 // we need to check whether the most recent clone exists, if it's
8248 // been evicted, it's not included in the stats, but the clone_overlap
8249 // is still exist in the snapset, so we should update the
8250 // clone_overlap to make it sense.
8251 hobject_t last_clone_oid
= soid
;
8252 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
8253 interval_set
<uint64_t> &newest_overlap
=
8254 ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
8255 ctx
->modified_ranges
.intersection_of(newest_overlap
);
8256 if (is_present_clone(last_clone_oid
)) {
8257 // modified_ranges is still in use by the clone
8258 ctx
->delta_stats
.num_bytes
+= ctx
->modified_ranges
.size();
8260 newest_overlap
.subtract(ctx
->modified_ranges
);
8263 if (snapc
.seq
> ctx
->new_snapset
.seq
) {
8264 // update snapset with latest snap context
8265 ctx
->new_snapset
.seq
= snapc
.seq
;
8266 ctx
->new_snapset
.snaps
= snapc
.snaps
;
8268 dout(20) << "make_writeable " << soid
8269 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
8273 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
8274 interval_set
<uint64_t>& modified
, uint64_t offset
,
8275 uint64_t length
, bool write_full
)
8277 interval_set
<uint64_t> ch
;
8280 ch
.insert(0, oi
.size
);
8282 ch
.insert(offset
, length
);
8283 modified
.union_of(ch
);
8285 (offset
+ length
> oi
.size
&& length
)) {
8286 uint64_t new_size
= offset
+ length
;
8287 delta_stats
.num_bytes
-= oi
.size
;
8288 delta_stats
.num_bytes
+= new_size
;
8292 if (oi
.has_manifest() && oi
.manifest
.is_chunked()) {
8293 for (auto &p
: oi
.manifest
.chunk_map
) {
8294 if ((p
.first
<= offset
&& p
.first
+ p
.second
.length
> offset
) ||
8295 (p
.first
> offset
&& p
.first
<= offset
+ length
)) {
8296 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
);
8297 p
.second
.set_flag(chunk_info_t::FLAG_DIRTY
);
8301 delta_stats
.num_wr
++;
8302 delta_stats
.num_wr_kb
+= shift_round_up(length
, 10);
8305 void PrimaryLogPG::truncate_update_size_and_usage(
8306 object_stat_sum_t
& delta_stats
,
8308 uint64_t truncate_size
)
8310 if (oi
.size
!= truncate_size
) {
8311 delta_stats
.num_bytes
-= oi
.size
;
8312 delta_stats
.num_bytes
+= truncate_size
;
8313 oi
.size
= truncate_size
;
8317 void PrimaryLogPG::complete_disconnect_watches(
8318 ObjectContextRef obc
,
8319 const list
<watch_disconnect_t
> &to_disconnect
)
8321 for (list
<watch_disconnect_t
>::const_iterator i
=
8322 to_disconnect
.begin();
8323 i
!= to_disconnect
.end();
8325 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
8326 auto watchers_entry
= obc
->watchers
.find(watcher
);
8327 if (watchers_entry
!= obc
->watchers
.end()) {
8328 WatchRef watch
= watchers_entry
->second
;
8329 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
8330 obc
->watchers
.erase(watcher
);
8331 watch
->remove(i
->send_disconnect
);
8333 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8334 << watcher
<< dendl
;
8339 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
8341 entity_name_t entity
= ctx
->reqid
.name
;
8342 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
8344 // disconnects first
8345 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
8349 auto session
= conn
->get_priv();
8353 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
8354 i
!= ctx
->watch_connects
.end();
8356 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
8357 dout(15) << "do_osd_op_effects applying watch connect on session "
8358 << session
.get() << " watcher " << watcher
<< dendl
;
8360 if (ctx
->obc
->watchers
.count(watcher
)) {
8361 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8363 watch
= ctx
->obc
->watchers
[watcher
];
8365 dout(15) << "do_osd_op_effects new watcher " << watcher
8367 watch
= Watch::makeWatchRef(
8368 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
8369 i
->first
.cookie
, entity
, conn
->get_peer_addr());
8370 ctx
->obc
->watchers
.insert(
8375 watch
->connect(conn
, i
->second
);
8378 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
8379 p
!= ctx
->notifies
.end();
8381 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
8382 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
8384 Notify::makeNotifyRef(
8386 ctx
->reqid
.name
.num(),
8391 ctx
->obc
->obs
.oi
.user_version
,
8393 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8394 ctx
->obc
->watchers
.begin();
8395 i
!= ctx
->obc
->watchers
.end();
8397 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
8398 i
->second
->start_notify(notif
);
8403 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
8404 p
!= ctx
->notify_acks
.end();
8406 if (p
->watch_cookie
)
8407 dout(10) << "notify_ack " << make_pair(p
->watch_cookie
.get(), p
->notify_id
) << dendl
;
8409 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
8410 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8411 ctx
->obc
->watchers
.begin();
8412 i
!= ctx
->obc
->watchers
.end();
8414 if (i
->first
.second
!= entity
) continue;
8415 if (p
->watch_cookie
&&
8416 p
->watch_cookie
.get() != i
->first
.first
) continue;
8417 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
8418 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
8423 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
8426 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
8427 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
8428 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8429 dout(20) << __func__
<< " " << hoid
<< dendl
;
8433 hobject_t
PrimaryLogPG::get_temp_recovery_object(
8434 const hobject_t
& target
,
8438 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
8440 << "_" << info
.history
.same_interval_since
8441 << "_" << target
.snap
;
8442 // pgid + version + interval + snapid is unique, and short
8443 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8444 dout(20) << __func__
<< " " << hoid
<< dendl
;
8448 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
8450 ceph_assert(!ctx
->ops
->empty());
8452 // valid snap context?
8453 if (!ctx
->snapc
.is_valid()) {
8454 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
8458 // prepare the actual mutation
8459 int result
= do_osd_ops(ctx
, *ctx
->ops
);
8461 if (ctx
->op
->may_write() &&
8462 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
8463 // need to save the error code in the pg log, to detect dup ops,
8464 // but do nothing else
8465 ctx
->update_log_only
= true;
8470 // read-op? write-op noop? done?
8471 if (ctx
->op_t
->empty() && !ctx
->modify
) {
8472 if (ctx
->pending_async_reads
.empty())
8473 unstable_stats
.add(ctx
->delta_stats
);
8474 if (ctx
->op
->may_write() &&
8475 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
8476 ctx
->update_log_only
= true;
8482 if ((ctx
->delta_stats
.num_bytes
> 0 ||
8483 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
8484 (pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ||
8485 get_osdmap()->test_flag(CEPH_OSDMAP_FULL
))) {
8486 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
8487 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
8488 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
8489 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
8491 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
8492 // they tried, they failed.
8493 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
8494 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
) ? -EDQUOT
: -ENOSPC
;
8497 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
8502 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8503 // clone, if necessary
8504 if (soid
.snap
== CEPH_NOSNAP
)
8505 make_writeable(ctx
);
8508 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
8509 pg_log_entry_t::DELETE
);
8514 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
)
8516 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8517 dout(20) << __func__
<< " " << soid
<< " " << ctx
8518 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
8520 utime_t now
= ceph_clock_now();
8522 // finish and log the op.
8523 if (ctx
->user_modify
) {
8524 // update the user_version for any modify ops, except for the watch op
8525 ctx
->user_at_version
= std::max(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
8526 /* In order for new clients and old clients to interoperate properly
8527 * when exchanging versions, we need to lower bound the user_version
8528 * (which our new clients pay proper attention to)
8529 * by the at_version (which is all the old clients can ever see). */
8530 if (ctx
->at_version
.version
> ctx
->user_at_version
)
8531 ctx
->user_at_version
= ctx
->at_version
.version
;
8532 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
8534 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
8536 if (ctx
->new_obs
.exists
) {
8537 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
8538 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
8539 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
8540 if (ctx
->mtime
!= utime_t()) {
8541 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
8542 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8543 ctx
->new_obs
.oi
.local_mtime
= now
;
8545 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8549 map
<string
, bufferlist
> attrs
;
8550 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
8551 encode(ctx
->new_obs
.oi
, bv
,
8552 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8553 attrs
[OI_ATTR
].claim(bv
);
8556 if (soid
.snap
== CEPH_NOSNAP
) {
8557 dout(10) << " final snapset " << ctx
->new_snapset
8558 << " in " << soid
<< dendl
;
8560 encode(ctx
->new_snapset
, bss
);
8561 attrs
[SS_ATTR
].claim(bss
);
8563 dout(10) << " no snapset (this is a clone)" << dendl
;
8565 ctx
->op_t
->setattrs(soid
, attrs
);
8568 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
8572 ctx
->log
.push_back(pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
8573 ctx
->obs
->oi
.version
,
8574 ctx
->user_at_version
, ctx
->reqid
,
8576 if (soid
.snap
< CEPH_NOSNAP
) {
8577 switch (log_op_type
) {
8578 case pg_log_entry_t::MODIFY
:
8579 case pg_log_entry_t::PROMOTE
:
8580 case pg_log_entry_t::CLEAN
:
8581 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
8583 encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
8590 if (!ctx
->extra_reqids
.empty()) {
8591 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< " "
8592 << ctx
->extra_reqid_return_codes
<< dendl
;
8593 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
8594 ctx
->log
.back().extra_reqid_return_codes
.swap(ctx
->extra_reqid_return_codes
);
8597 // apply new object state.
8598 ctx
->obc
->obs
= ctx
->new_obs
;
8600 if (soid
.is_head() && !ctx
->obc
->obs
.exists
) {
8601 ctx
->obc
->ssc
->exists
= false;
8602 ctx
->obc
->ssc
->snapset
= SnapSet();
8604 ctx
->obc
->ssc
->exists
= true;
8605 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
8609 void PrimaryLogPG::apply_stats(
8610 const hobject_t
&soid
,
8611 const object_stat_sum_t
&delta_stats
) {
8613 info
.stats
.stats
.add(delta_stats
);
8614 info
.stats
.stats
.floor(0);
8616 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
8617 i
!= backfill_targets
.end();
8620 pg_info_t
& pinfo
= peer_info
[bt
];
8621 if (soid
<= pinfo
.last_backfill
)
8622 pinfo
.stats
.stats
.add(delta_stats
);
8623 else if (soid
<= last_backfill_started
)
8624 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
8627 if (is_primary() && scrubber
.active
) {
8628 if (soid
< scrubber
.start
) {
8629 dout(20) << __func__
<< " " << soid
<< " < [" << scrubber
.start
8630 << "," << scrubber
.end
<< ")" << dendl
;
8631 scrub_cstat
.add(delta_stats
);
8633 dout(20) << __func__
<< " " << soid
<< " >= [" << scrubber
.start
8634 << "," << scrubber
.end
<< ")" << dendl
;
8639 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
8641 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
8642 ceph_assert(ctx
->async_reads_complete());
8644 for (vector
<OSDOp
>::iterator p
= ctx
->ops
->begin();
8645 p
!= ctx
->ops
->end() && result
>= 0; ++p
) {
8646 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
8650 ctx
->bytes_read
+= p
->outdata
.length();
8652 ctx
->reply
->claim_op_out_data(*ctx
->ops
);
8653 ctx
->reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
8655 MOSDOpReply
*reply
= ctx
->reply
;
8656 ctx
->reply
= nullptr;
8659 if (!ctx
->ignore_log_op_stats
) {
8660 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
8662 publish_stats_to_osd();
8665 // on read, return the current object version
8667 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
8669 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
8671 } else if (result
== -ENOENT
) {
8672 // on ENOENT, set a floor for what the next user version will be.
8673 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
8676 reply
->set_result(result
);
8677 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
8678 osd
->send_message_osd_client(reply
, m
->get_connection());
8682 // ========================================================================
8685 struct C_Copyfrom
: public Context
{
8688 epoch_t last_peering_reset
;
8690 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
8691 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
8692 const PrimaryLogPG::CopyOpRef
& c
)
8693 : pg(p
), oid(o
), last_peering_reset(lpr
),
8696 void finish(int r
) override
{
8697 if (r
== -ECANCELED
)
8700 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8701 pg
->process_copy_chunk(oid
, tid
, r
);
8708 struct C_CopyFrom_AsyncReadCb
: public Context
{
8710 object_copy_data_t reply_obj
;
8713 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
8714 osd_op(osd_op
), features(features
), len(0) {}
8715 void finish(int r
) override
{
8721 ceph_assert(len
> 0);
8722 ceph_assert(len
<= reply_obj
.data
.length());
8724 bl
.substr_of(reply_obj
.data
, 0, len
);
8725 reply_obj
.data
.swap(bl
);
8726 encode(reply_obj
, osd_op
->outdata
, features
);
8730 struct C_CopyChunk
: public Context
{
8733 epoch_t last_peering_reset
;
8735 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
8736 uint64_t offset
= 0;
8737 C_CopyChunk(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
8738 const PrimaryLogPG::CopyOpRef
& c
)
8739 : pg(p
), oid(o
), last_peering_reset(lpr
),
8742 void finish(int r
) override
{
8743 if (r
== -ECANCELED
)
8746 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8747 pg
->process_copy_chunk_manifest(oid
, tid
, r
, offset
);
8754 int PrimaryLogPG::do_copy_get(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
8755 OSDOp
& osd_op
, ObjectContextRef
&obc
)
8757 object_info_t
& oi
= obc
->obs
.oi
;
8758 hobject_t
& soid
= oi
.soid
;
8760 object_copy_cursor_t cursor
;
8764 decode(out_max
, bp
);
8766 catch (buffer::error
& e
) {
8771 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
8772 uint64_t features
= op
->get_features();
8774 bool async_read_started
= false;
8775 object_copy_data_t _reply_obj
;
8776 C_CopyFrom_AsyncReadCb
*cb
= nullptr;
8777 if (pool
.info
.is_erasure()) {
8778 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
8780 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
8782 reply_obj
.size
= oi
.size
;
8783 reply_obj
.mtime
= oi
.mtime
;
8784 ceph_assert(obc
->ssc
);
8785 if (soid
.snap
< CEPH_NOSNAP
) {
8786 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
8787 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
8788 reply_obj
.snaps
= p
->second
;
8790 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
8792 if (oi
.is_data_digest()) {
8793 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
8794 reply_obj
.data_digest
= oi
.data_digest
;
8796 if (oi
.is_omap_digest()) {
8797 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
8798 reply_obj
.omap_digest
= oi
.omap_digest
;
8800 reply_obj
.truncate_seq
= oi
.truncate_seq
;
8801 reply_obj
.truncate_size
= oi
.truncate_size
;
8804 map
<string
,bufferlist
>& out_attrs
= reply_obj
.attrs
;
8805 if (!cursor
.attr_complete
) {
8806 result
= getattrs_maybe_cache(
8815 cursor
.attr_complete
= true;
8816 dout(20) << " got attrs" << dendl
;
8819 int64_t left
= out_max
- osd_op
.outdata
.length();
8822 bufferlist
& bl
= reply_obj
.data
;
8823 if (left
> 0 && !cursor
.data_complete
) {
8824 if (cursor
.data_offset
< oi
.size
) {
8825 uint64_t max_read
= std::min(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
8827 async_read_started
= true;
8828 ctx
->pending_async_reads
.push_back(
8830 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
8831 make_pair(&bl
, cb
)));
8834 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8835 new ReadFinisher(osd_op
));
8836 result
= -EINPROGRESS
;
8838 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
8840 result
= pgbackend
->objects_read_sync(
8841 oi
.soid
, cursor
.data_offset
, max_read
, osd_op
.op
.flags
, &bl
);
8846 cursor
.data_offset
+= max_read
;
8848 if (cursor
.data_offset
== oi
.size
) {
8849 cursor
.data_complete
= true;
8850 dout(20) << " got data" << dendl
;
8852 ceph_assert(cursor
.data_offset
<= oi
.size
);
8856 uint32_t omap_keys
= 0;
8857 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
8858 cursor
.omap_complete
= true;
8860 if (left
> 0 && !cursor
.omap_complete
) {
8861 ceph_assert(cursor
.data_complete
);
8862 if (cursor
.omap_offset
.empty()) {
8863 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
8864 &reply_obj
.omap_header
);
8866 bufferlist omap_data
;
8867 ObjectMap::ObjectMapIterator iter
=
8868 osd
->store
->get_omap_iterator(ch
, ghobject_t(oi
.soid
));
8870 iter
->upper_bound(cursor
.omap_offset
);
8871 for (; iter
->valid(); iter
->next()) {
8873 encode(iter
->key(), omap_data
);
8874 encode(iter
->value(), omap_data
);
8875 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
8880 encode(omap_keys
, reply_obj
.omap_data
);
8881 reply_obj
.omap_data
.claim_append(omap_data
);
8883 if (iter
->valid()) {
8884 cursor
.omap_offset
= iter
->key();
8886 cursor
.omap_complete
= true;
8887 dout(20) << " got omap" << dendl
;
8892 if (cursor
.is_complete()) {
8893 // include reqids only in the final step. this is a bit fragile
8895 pg_log
.get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10,
8897 &reply_obj
.reqid_return_codes
);
8898 dout(20) << " got reqids" << dendl
;
8901 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
8902 << " " << out_attrs
.size() << " attrs"
8903 << " " << bl
.length() << " bytes"
8904 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
8905 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
8906 << omap_keys
<< " keys"
8907 << " " << reply_obj
.reqids
.size() << " reqids"
8909 reply_obj
.cursor
= cursor
;
8910 if (!async_read_started
) {
8911 encode(reply_obj
, osd_op
.outdata
, features
);
8913 if (cb
&& !async_read_started
) {
8923 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
8926 // NOTE: we take non-const ref here for claim_op_out_data below; we must
8927 // be careful not to modify anything else that will upset a racing
8929 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
8930 uint64_t features
= m
->get_features();
8931 object_copy_data_t reply_obj
;
8933 pg_log
.get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
,
8934 &reply_obj
.reqid_return_codes
);
8935 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
8936 encode(reply_obj
, osd_op
.outdata
, features
);
8937 osd_op
.rval
= -ENOENT
;
8938 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
8939 reply
->claim_op_out_data(m
->ops
);
8940 reply
->set_result(-ENOENT
);
8941 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
8942 osd
->send_message_osd_client(reply
, m
->get_connection());
8945 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
8946 hobject_t src
, object_locator_t oloc
,
8947 version_t version
, unsigned flags
,
8948 bool mirror_snapset
,
8949 unsigned src_obj_fadvise_flags
,
8950 unsigned dest_obj_fadvise_flags
)
8952 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
8953 dout(10) << __func__
<< " " << dest
8954 << " from " << src
<< " " << oloc
<< " v" << version
8955 << " flags " << flags
8956 << (mirror_snapset
? " mirror_snapset" : "")
8959 ceph_assert(!mirror_snapset
|| src
.snap
== CEPH_NOSNAP
);
8961 // cancel a previous in-progress copy?
8962 if (copy_ops
.count(dest
)) {
8963 // FIXME: if the src etc match, we could avoid restarting from the
8965 CopyOpRef cop
= copy_ops
[dest
];
8966 vector
<ceph_tid_t
> tids
;
8967 cancel_copy(cop
, false, &tids
);
8968 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
8971 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
8972 mirror_snapset
, src_obj_fadvise_flags
,
8973 dest_obj_fadvise_flags
));
8974 copy_ops
[dest
] = cop
;
8977 if (!obc
->obs
.oi
.has_manifest()) {
8978 _copy_some(obc
, cop
);
8980 if (obc
->obs
.oi
.manifest
.is_redirect()) {
8981 _copy_some(obc
, cop
);
8982 } else if (obc
->obs
.oi
.manifest
.is_chunked()) {
8983 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
8984 _copy_some_manifest(obc
, cop
, p
->first
);
8986 ceph_abort_msg("unrecognized manifest type");
8991 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
8993 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
8996 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
8997 flags
|= CEPH_OSD_FLAG_FLUSH
;
8998 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
8999 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9000 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9001 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9002 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9003 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9004 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9005 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9007 C_GatherBuilder
gather(cct
);
9009 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
9011 ceph_assert(cop
->src
.snap
== CEPH_NOSNAP
);
9013 op
.list_snaps(&cop
->results
.snapset
, NULL
);
9014 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9016 flags
, gather
.new_sub(), NULL
);
9017 cop
->objecter_tid2
= tid
;
9021 if (cop
->results
.user_version
) {
9022 op
.assert_version(cop
->results
.user_version
);
9024 // we should learn the version after the first chunk, if we didn't know
9026 ceph_assert(cop
->cursor
.is_initial());
9028 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
9029 &cop
->results
.object_size
, &cop
->results
.mtime
,
9030 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
9031 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
9032 &cop
->results
.flags
,
9033 &cop
->results
.source_data_digest
,
9034 &cop
->results
.source_omap_digest
,
9035 &cop
->results
.reqids
,
9036 &cop
->results
.reqid_return_codes
,
9037 &cop
->results
.truncate_seq
,
9038 &cop
->results
.truncate_size
,
9040 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9042 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
9043 get_last_peering_reset(), cop
);
9044 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
9045 gather
.set_finisher(new C_OnFinisher(fin
,
9046 osd
->objecter_finishers
[n
]));
9048 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9049 cop
->src
.snap
, NULL
,
9052 // discover the object version if we don't know it yet
9053 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
9055 cop
->objecter_tid
= tid
;
9059 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc
, CopyOpRef cop
, uint64_t start_offset
)
9061 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9064 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9065 flags
|= CEPH_OSD_FLAG_FLUSH
;
9066 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9067 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9068 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9069 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9070 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9071 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9072 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9073 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9076 uint64_t last_offset
= 0, chunks_size
= 0;
9077 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
9078 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
->chunk_map
.find(start_offset
);
9079 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9081 chunks_size
+= iter
->second
.length
;
9082 last_offset
= iter
->first
;
9083 if (get_copy_chunk_size() < chunks_size
) {
9088 cop
->num_chunk
= num_chunks
;
9089 cop
->start_offset
= start_offset
;
9090 cop
->last_offset
= last_offset
;
9091 dout(20) << __func__
<< " oid " << obc
->obs
.oi
.soid
<< " num_chunks: " << num_chunks
9092 << " start_offset: " << start_offset
<< " chunks_size: " << chunks_size
9093 << " last_offset: " << last_offset
<< dendl
;
9095 iter
= manifest
->chunk_map
.find(start_offset
);
9096 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9097 uint64_t obj_offset
= iter
->first
;
9098 uint64_t length
= manifest
->chunk_map
[iter
->first
].length
;
9099 hobject_t soid
= manifest
->chunk_map
[iter
->first
].oid
;
9100 object_locator_t
oloc(soid
);
9101 CopyCallback
* cb
= NULL
;
9102 CopyOpRef
sub_cop(std::make_shared
<CopyOp
>(cb
, ObjectContextRef(), cop
->src
, oloc
,
9103 cop
->results
.user_version
, cop
->flags
, cop
->mirror_snapset
,
9104 cop
->src_obj_fadvise_flags
, cop
->dest_obj_fadvise_flags
));
9105 sub_cop
->cursor
.data_offset
= obj_offset
;
9106 cop
->chunk_cops
[obj_offset
] = sub_cop
;
9108 int s
= sub_cop
->chunk_ops
.size();
9109 sub_cop
->chunk_ops
.resize(s
+1);
9110 sub_cop
->chunk_ops
[s
].op
.op
= CEPH_OSD_OP_READ
;
9111 sub_cop
->chunk_ops
[s
].op
.extent
.offset
= manifest
->chunk_map
[iter
->first
].offset
;
9112 sub_cop
->chunk_ops
[s
].op
.extent
.length
= length
;
9115 op
.dup(sub_cop
->chunk_ops
);
9117 dout(20) << __func__
<< " tgt_oid: " << soid
.oid
<< " tgt_offset: "
9118 << manifest
->chunk_map
[iter
->first
].offset
9119 << " length: " << length
<< " pool id: " << oloc
.pool
<< dendl
;
9121 if (cop
->results
.user_version
) {
9122 op
.assert_version(cop
->results
.user_version
);
9124 // we should learn the version after the first chunk, if we didn't know
9126 ceph_assert(cop
->cursor
.is_initial());
9128 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9130 C_CopyChunk
*fin
= new C_CopyChunk(this, obc
->obs
.oi
.soid
,
9131 get_last_peering_reset(), cop
);
9132 fin
->offset
= obj_offset
;
9133 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
9135 ceph_tid_t tid
= osd
->objecter
->read(soid
.oid
, oloc
, op
,
9136 sub_cop
->src
.snap
, NULL
,
9138 new C_OnFinisher(fin
, osd
->objecter_finishers
[n
]),
9139 // discover the object version if we don't know it yet
9140 sub_cop
->results
.user_version
? NULL
: &sub_cop
->results
.user_version
);
9142 sub_cop
->objecter_tid
= tid
;
9143 if (last_offset
< iter
->first
) {
9149 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
9151 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9152 << " " << cpp_strerror(r
) << dendl
;
9153 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9154 if (p
== copy_ops
.end()) {
9155 dout(10) << __func__
<< " no copy_op found" << dendl
;
9158 CopyOpRef cop
= p
->second
;
9159 if (tid
!= cop
->objecter_tid
) {
9160 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
9161 << " tid " << cop
->objecter_tid
<< dendl
;
9165 if (cop
->omap_data
.length() || cop
->omap_header
.length())
9166 cop
->results
.has_omap
= true;
9168 if (r
>= 0 && !pool
.info
.supports_omap() &&
9169 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
9172 cop
->objecter_tid
= 0;
9173 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9174 ObjectContextRef
& cobc
= cop
->obc
;
9179 ceph_assert(cop
->rval
>= 0);
9181 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
9182 // verify snap hasn't been deleted
9183 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
9184 while (p
!= cop
->results
.snaps
.end()) {
9185 if (pool
.info
.is_removed_snap(*p
)) {
9186 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
9188 for (vector
<snapid_t
>::iterator q
= p
+ 1;
9189 q
!= cop
->results
.snaps
.end();
9192 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
9197 if (cop
->results
.snaps
.empty()) {
9198 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
9204 ceph_assert(cop
->rval
>= 0);
9206 if (!cop
->temp_cursor
.data_complete
) {
9207 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
9209 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
9210 if (cop
->omap_header
.length()) {
9211 cop
->results
.omap_digest
=
9212 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
9214 if (cop
->omap_data
.length()) {
9216 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
9217 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
9221 if (!cop
->temp_cursor
.attr_complete
) {
9222 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
9223 p
!= cop
->attrs
.end();
9225 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
9230 if (!cop
->cursor
.is_complete()) {
9231 // write out what we have so far
9232 if (cop
->temp_cursor
.is_initial()) {
9233 ceph_assert(!cop
->results
.started_temp_obj
);
9234 cop
->results
.started_temp_obj
= true;
9235 cop
->results
.temp_oid
= generate_temp_object(oid
);
9236 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
9238 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9239 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9240 if (cop
->temp_cursor
.is_initial()) {
9241 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
9243 _write_copy_chunk(cop
, ctx
->op_t
.get());
9244 simple_opc_submit(std::move(ctx
));
9245 dout(10) << __func__
<< " fetching more" << dendl
;
9246 _copy_some(cobc
, cop
);
9251 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
9252 dout(20) << __func__
<< std::hex
9253 << " got digest: rx data 0x" << cop
->results
.data_digest
9254 << " omap 0x" << cop
->results
.omap_digest
9255 << ", source: data 0x" << cop
->results
.source_data_digest
9256 << " omap 0x" << cop
->results
.source_omap_digest
9258 << " flags " << cop
->results
.flags
9261 if (cop
->results
.is_data_digest() &&
9262 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
9263 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
9264 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
9266 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9267 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9268 << " data digest 0x" << cop
->results
.data_digest
9269 << " != source 0x" << cop
->results
.source_data_digest
9274 if (cop
->results
.is_omap_digest() &&
9275 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
9276 derr
<< __func__
<< std::hex
9277 << " omap digest 0x" << cop
->results
.omap_digest
9278 << " != source 0x" << cop
->results
.source_omap_digest
9279 << std::dec
<< dendl
;
9280 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9281 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9282 << " omap digest 0x" << cop
->results
.omap_digest
9283 << " != source 0x" << cop
->results
.source_omap_digest
9288 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
9289 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
9294 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
9295 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
9296 ObjectState
& obs
= cop
->obc
->obs
;
9297 if (cop
->temp_cursor
.is_initial()) {
9298 dout(20) << "fill_in_final_tx: writing "
9299 << "directly to final object" << dendl
;
9300 // write directly to final object
9301 cop
->results
.temp_oid
= obs
.oi
.soid
;
9302 _write_copy_chunk(cop
, t
);
9304 // finish writing to temp object, then move into place
9305 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
9306 _write_copy_chunk(cop
, t
);
9307 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
9309 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
9312 dout(20) << __func__
<< " success; committing" << dendl
;
9315 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9316 CopyCallbackResults
results(r
, &cop
->results
);
9317 cop
->cb
->complete(results
);
9319 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9322 if (r
< 0 && cop
->results
.started_temp_obj
) {
9323 dout(10) << __func__
<< " deleting partial temp object "
9324 << cop
->results
.temp_oid
<< dendl
;
9325 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9326 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9327 ctx
->op_t
->remove(cop
->results
.temp_oid
);
9328 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
9329 simple_opc_submit(std::move(ctx
));
9332 // cancel and requeue proxy ops on this object
9334 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9337 kick_object_context_blocked(cobc
);
9340 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid
, ceph_tid_t tid
, int r
, uint64_t offset
)
9342 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9343 << " " << cpp_strerror(r
) << dendl
;
9344 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9345 if (p
== copy_ops
.end()) {
9346 dout(10) << __func__
<< " no copy_op found" << dendl
;
9349 CopyOpRef obj_cop
= p
->second
;
9350 CopyOpRef chunk_cop
= obj_cop
->chunk_cops
[offset
];
9352 if (tid
!= chunk_cop
->objecter_tid
) {
9353 dout(10) << __func__
<< " tid " << tid
<< " != cop " << chunk_cop
9354 << " tid " << chunk_cop
->objecter_tid
<< dendl
;
9358 if (chunk_cop
->omap_data
.length() || chunk_cop
->omap_header
.length()) {
9362 chunk_cop
->objecter_tid
= 0;
9363 chunk_cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9364 ObjectContextRef
& cobc
= obj_cop
->obc
;
9365 OSDOp
&chunk_data
= chunk_cop
->chunk_ops
[0];
9368 obj_cop
->failed
= true;
9372 if (obj_cop
->failed
) {
9375 if (!chunk_data
.outdata
.length()) {
9377 obj_cop
->failed
= true;
9381 obj_cop
->num_chunk
--;
9383 /* check all of the copyop are completed */
9384 if (obj_cop
->num_chunk
) {
9385 dout(20) << __func__
<< " num_chunk: " << obj_cop
->num_chunk
<< dendl
;
9390 OpContextUPtr ctx
= simple_opc_create(obj_cop
->obc
);
9391 if (!ctx
->lock_manager
.take_write_lock(
9392 obj_cop
->obc
->obs
.oi
.soid
,
9394 // recovery op can take read lock.
9395 // so need to wait for recovery completion
9397 obj_cop
->failed
= true;
9398 close_op_ctx(ctx
.release());
9401 dout(20) << __func__
<< " took lock on obc, " << obj_cop
->obc
->rwstate
<< dendl
;
9403 PGTransaction
*t
= ctx
->op_t
.get();
9404 ObjectState
& obs
= ctx
->new_obs
;
9405 for (auto p
: obj_cop
->chunk_cops
) {
9406 OSDOp
&sub_chunk
= p
.second
->chunk_ops
[0];
9407 t
->write(cobc
->obs
.oi
.soid
,
9408 p
.second
->cursor
.data_offset
,
9409 sub_chunk
.outdata
.length(),
9411 p
.second
->dest_obj_fadvise_flags
);
9412 dout(20) << __func__
<< " offset: " << p
.second
->cursor
.data_offset
9413 << " length: " << sub_chunk
.outdata
.length() << dendl
;
9414 write_update_size_and_usage(ctx
->delta_stats
, obs
.oi
, ctx
->modified_ranges
,
9415 p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9416 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_DIRTY
);
9417 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_MISSING
);
9418 sub_chunk
.outdata
.clear();
9420 obs
.oi
.clear_data_digest();
9421 ctx
->at_version
= get_next_version();
9422 finish_ctx(ctx
.get(), pg_log_entry_t::PROMOTE
);
9423 simple_opc_submit(std::move(ctx
));
9425 auto p
= cobc
->obs
.oi
.manifest
.chunk_map
.rbegin();
9426 /* check remaining work */
9427 if (p
!= cobc
->obs
.oi
.manifest
.chunk_map
.rend()) {
9428 if (obj_cop
->last_offset
>= p
->first
+ p
->second
.length
) {
9429 for (auto &en
: cobc
->obs
.oi
.manifest
.chunk_map
) {
9430 if (obj_cop
->last_offset
< en
.first
) {
9431 _copy_some_manifest(cobc
, obj_cop
, en
.first
);
9440 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9441 CopyCallbackResults
results(r
, &obj_cop
->results
);
9442 obj_cop
->cb
->complete(results
);
9444 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9447 // cancel and requeue proxy ops on this object
9449 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9452 kick_object_context_blocked(cobc
);
9455 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid
) {
9456 vector
<ceph_tid_t
> tids
;
9457 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
9458 it
!= proxyread_ops
.end();) {
9459 if (it
->second
->soid
== oid
) {
9460 cancel_proxy_read((it
++)->second
, &tids
);
9465 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
9466 it
!= proxywrite_ops
.end();) {
9467 if (it
->second
->soid
== oid
) {
9468 cancel_proxy_write((it
++)->second
, &tids
);
9473 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9474 kick_proxy_ops_blocked(oid
);
9477 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop
, PGTransaction
*t
)
9479 dout(20) << __func__
<< " " << cop
9480 << " " << cop
->attrs
.size() << " attrs"
9481 << " " << cop
->data
.length() << " bytes"
9482 << " " << cop
->omap_header
.length() << " omap header bytes"
9483 << " " << cop
->omap_data
.length() << " omap data bytes"
9485 if (!cop
->temp_cursor
.attr_complete
) {
9486 t
->create(cop
->results
.temp_oid
);
9488 if (!cop
->temp_cursor
.data_complete
) {
9489 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9490 cop
->cursor
.data_offset
);
9491 if (pool
.info
.required_alignment() &&
9492 !cop
->cursor
.data_complete
) {
9494 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9495 * to pick it up on the next pass.
9497 ceph_assert(cop
->temp_cursor
.data_offset
%
9498 pool
.info
.required_alignment() == 0);
9499 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
9501 cop
->data
.length() % pool
.info
.required_alignment();
9503 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
9505 cop
->cursor
.data_offset
-= to_trim
;
9506 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9507 cop
->cursor
.data_offset
);
9510 if (cop
->data
.length()) {
9512 cop
->results
.temp_oid
,
9513 cop
->temp_cursor
.data_offset
,
9516 cop
->dest_obj_fadvise_flags
);
9520 if (pool
.info
.supports_omap()) {
9521 if (!cop
->temp_cursor
.omap_complete
) {
9522 if (cop
->omap_header
.length()) {
9524 cop
->results
.temp_oid
,
9526 cop
->omap_header
.clear();
9528 if (cop
->omap_data
.length()) {
9529 map
<string
,bufferlist
> omap
;
9530 bufferlist::const_iterator p
= cop
->omap_data
.begin();
9532 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
9533 cop
->omap_data
.clear();
9537 ceph_assert(cop
->omap_header
.length() == 0);
9538 ceph_assert(cop
->omap_data
.length() == 0);
9540 cop
->temp_cursor
= cop
->cursor
;
9543 void PrimaryLogPG::finish_copyfrom(CopyFromCallback
*cb
)
9545 OpContext
*ctx
= cb
->ctx
;
9546 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
9548 ObjectState
& obs
= ctx
->new_obs
;
9550 dout(20) << __func__
<< ": exists, removing" << dendl
;
9551 ctx
->op_t
->remove(obs
.oi
.soid
);
9553 ctx
->delta_stats
.num_objects
++;
9556 if (cb
->is_temp_obj_used()) {
9557 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
9559 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
9561 // CopyFromCallback fills this in for us
9562 obs
.oi
.user_version
= ctx
->user_at_version
;
9564 if (cb
->results
->is_data_digest()) {
9565 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
9567 obs
.oi
.clear_data_digest();
9569 if (cb
->results
->is_omap_digest()) {
9570 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
9572 obs
.oi
.clear_omap_digest();
9575 obs
.oi
.truncate_seq
= cb
->results
->truncate_seq
;
9576 obs
.oi
.truncate_size
= cb
->results
->truncate_size
;
9578 ctx
->extra_reqids
= cb
->results
->reqids
;
9579 ctx
->extra_reqid_return_codes
= cb
->results
->reqid_return_codes
;
9581 // cache: clear whiteout?
9582 if (obs
.oi
.is_whiteout()) {
9583 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
9584 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
9585 --ctx
->delta_stats
.num_whiteouts
;
9588 if (cb
->results
->has_omap
) {
9589 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
9590 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
9592 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
9593 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
9596 interval_set
<uint64_t> ch
;
9597 if (obs
.oi
.size
> 0)
9598 ch
.insert(0, obs
.oi
.size
);
9599 ctx
->modified_ranges
.union_of(ch
);
9601 if (cb
->get_data_size() != obs
.oi
.size
) {
9602 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
9603 obs
.oi
.size
= cb
->get_data_size();
9604 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
9606 ctx
->delta_stats
.num_wr
++;
9607 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(obs
.oi
.size
, 10);
9609 osd
->logger
->inc(l_osd_copyfrom
);
9612 void PrimaryLogPG::finish_promote(int r
, CopyResults
*results
,
9613 ObjectContextRef obc
)
9615 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
9616 dout(10) << __func__
<< " " << soid
<< " r=" << r
9617 << " uv" << results
->user_version
<< dendl
;
9619 if (r
== -ECANCELED
) {
9623 if (r
!= -ENOENT
&& soid
.is_snap()) {
9624 if (results
->snaps
.empty()) {
9625 // we must have read "snap" content from the head object in
9626 // the base pool. use snap_seq to construct what snaps should
9627 // be for this clone (what is was before we evicted the clean
9628 // clone from this pool, and what it will be when we flush and
9629 // the clone eventually happens in the base pool).
9630 SnapSet
& snapset
= obc
->ssc
->snapset
;
9631 vector
<snapid_t
>::iterator p
= snapset
.snaps
.begin();
9632 while (p
!= snapset
.snaps
.end() && *p
> soid
.snap
)
9634 while (p
!= snapset
.snaps
.end() && *p
> results
->snap_seq
) {
9635 results
->snaps
.push_back(*p
);
9640 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
9641 filter_snapc(results
->snaps
);
9643 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
9644 if (results
->snaps
.empty()) {
9645 dout(20) << __func__
9646 << " snaps are empty, clone is invalid,"
9647 << " setting r to ENOENT" << dendl
;
9652 if (r
< 0 && results
->started_temp_obj
) {
9653 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
9654 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
9655 ceph_assert(tempobc
);
9656 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9657 ctx
->op_t
->remove(results
->temp_oid
);
9658 simple_opc_submit(std::move(ctx
));
9659 results
->started_temp_obj
= false;
9662 if (r
== -ENOENT
&& soid
.is_snap()) {
9663 dout(10) << __func__
9664 << ": enoent while trying to promote clone, " << soid
9665 << " must have been trimmed, removing from snapset"
9667 hobject_t
head(soid
.get_head());
9668 ObjectContextRef obc
= get_object_context(head
, false);
9671 OpContextUPtr tctx
= simple_opc_create(obc
);
9672 tctx
->at_version
= get_next_version();
9673 filter_snapc(tctx
->new_snapset
.snaps
);
9674 vector
<snapid_t
> new_clones
;
9675 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
9676 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
9677 i
!= tctx
->new_snapset
.clones
.end();
9679 if (*i
!= soid
.snap
) {
9680 new_clones
.push_back(*i
);
9681 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
9682 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
9683 new_clone_snaps
[*i
] = p
->second
;
9687 tctx
->new_snapset
.clones
.swap(new_clones
);
9688 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
9689 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
9690 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
9692 // take RWWRITE lock for duration of our local write. ignore starvation.
9693 if (!tctx
->lock_manager
.take_write_lock(
9696 ceph_abort_msg("problem!");
9698 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
9700 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
9702 simple_opc_submit(std::move(tctx
));
9706 bool whiteout
= false;
9708 ceph_assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
9709 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
9713 if (r
< 0 && !whiteout
) {
9714 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
9715 // pass error to everyone blocked on this object
9716 // FIXME: this is pretty sloppy, but at this point we got
9717 // something unexpected and don't have many other options.
9718 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
9719 waiting_for_blocked_object
.find(soid
);
9720 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
9721 while (!blocked_iter
->second
.empty()) {
9722 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
9723 blocked_iter
->second
.pop_front();
9725 waiting_for_blocked_object
.erase(blocked_iter
);
9730 osd
->promote_finish(results
->object_size
);
9732 OpContextUPtr tctx
= simple_opc_create(obc
);
9733 tctx
->at_version
= get_next_version();
9735 if (!obc
->obs
.oi
.has_manifest()) {
9736 ++tctx
->delta_stats
.num_objects
;
9738 if (soid
.snap
< CEPH_NOSNAP
)
9739 ++tctx
->delta_stats
.num_object_clones
;
9740 tctx
->new_obs
.exists
= true;
9742 tctx
->extra_reqids
= results
->reqids
;
9743 tctx
->extra_reqid_return_codes
= results
->reqid_return_codes
;
9746 // create a whiteout
9747 tctx
->op_t
->create(soid
);
9748 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
9749 ++tctx
->delta_stats
.num_whiteouts
;
9750 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
9751 osd
->logger
->inc(l_osd_tier_whiteout
);
9753 if (results
->has_omap
) {
9754 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
9755 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
9756 ++tctx
->delta_stats
.num_objects_omap
;
9759 results
->fill_in_final_tx(tctx
->op_t
.get());
9760 if (results
->started_temp_obj
) {
9761 tctx
->discard_temp_oid
= results
->temp_oid
;
9763 tctx
->new_obs
.oi
.size
= results
->object_size
;
9764 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
9765 if (results
->is_data_digest()) {
9766 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
9768 tctx
->new_obs
.oi
.clear_data_digest();
9770 if (results
->is_omap_digest()) {
9771 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
9773 tctx
->new_obs
.oi
.clear_omap_digest();
9775 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
9776 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
9778 if (soid
.snap
!= CEPH_NOSNAP
) {
9779 ceph_assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
9780 ceph_assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
9781 ceph_assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
9782 results
->object_size
);
9783 ceph_assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
9785 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
9787 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
9791 if (results
->mirror_snapset
) {
9792 ceph_assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
9793 tctx
->new_snapset
.from_snap_set(
9795 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
9797 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
9799 // take RWWRITE lock for duration of our local write. ignore starvation.
9800 if (!tctx
->lock_manager
.take_write_lock(
9803 ceph_abort_msg("problem!");
9805 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
9807 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
9809 simple_opc_submit(std::move(tctx
));
9811 osd
->logger
->inc(l_osd_tier_promote
);
9814 agent_state
->is_idle())
9815 agent_choose_mode();
9818 void PrimaryLogPG::finish_promote_manifest(int r
, CopyResults
*results
,
9819 ObjectContextRef obc
)
9821 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
9822 dout(10) << __func__
<< " " << soid
<< " r=" << r
9823 << " uv" << results
->user_version
<< dendl
;
9825 if (r
== -ECANCELED
|| r
== -EAGAIN
) {
9830 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
9831 // pass error to everyone blocked on this object
9832 // FIXME: this is pretty sloppy, but at this point we got
9833 // something unexpected and don't have many other options.
9834 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
9835 waiting_for_blocked_object
.find(soid
);
9836 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
9837 while (!blocked_iter
->second
.empty()) {
9838 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
9839 blocked_iter
->second
.pop_front();
9841 waiting_for_blocked_object
.erase(blocked_iter
);
9846 osd
->promote_finish(results
->object_size
);
9847 osd
->logger
->inc(l_osd_tier_promote
);
9850 agent_state
->is_idle())
9851 agent_choose_mode();
9854 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
,
9855 vector
<ceph_tid_t
> *tids
)
9857 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
9858 << " from " << cop
->src
<< " " << cop
->oloc
9859 << " v" << cop
->results
.user_version
<< dendl
;
9861 // cancel objecter op, if we can
9862 if (cop
->objecter_tid
) {
9863 tids
->push_back(cop
->objecter_tid
);
9864 cop
->objecter_tid
= 0;
9865 if (cop
->objecter_tid2
) {
9866 tids
->push_back(cop
->objecter_tid2
);
9867 cop
->objecter_tid2
= 0;
9871 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
9872 cop
->obc
->stop_block();
9874 kick_object_context_blocked(cop
->obc
);
9875 cop
->results
.should_requeue
= requeue
;
9876 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
9877 cop
->cb
->complete(result
);
9879 // There may still be an objecter callback referencing this copy op.
9880 // That callback will not need the obc since it's been canceled, and
9881 // we need the obc reference to go away prior to flush.
9882 cop
->obc
= ObjectContextRef();
9885 void PrimaryLogPG::cancel_copy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
9887 dout(10) << __func__
<< dendl
;
9888 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
9889 while (p
!= copy_ops
.end()) {
9890 // requeue this op? can I queue up all of them?
9891 cancel_copy((p
++)->second
, requeue
, tids
);
9896 // ========================================================================
9899 // Flush a dirty object in the cache tier by writing it back to the
9900 // base tier. The sequence looks like:
9902 // * send a copy-from operation to the base tier to copy the current
9903 // version of the object
9904 // * base tier will pull the object via (perhaps multiple) copy-get(s)
9905 // * on completion, we check if the object has been modified. if so,
9906 // just reply with -EAGAIN.
9907 // * try to take a write lock so we can clear the dirty flag. if this
9908 // fails, wait and retry
9909 // * start a repop that clears the bit.
9911 // If we have to wait, we will retry by coming back through the
9912 // start_flush method. We check if a flush is already in progress
9913 // and, if so, try to finish it by rechecking the version and trying
9914 // to clear the dirty bit.
9916 // In order for the cache-flush (a write op) to not block the copy-get
9917 // from reading the object, the client *must* set the SKIPRWLOCKS
9920 // NOTE: normally writes are strictly ordered for the client, but
9921 // flushes are special in that they can be reordered with respect to
9922 // other writes. In particular, we can't have a flush request block
9923 // an update to the cache pool object!
9925 struct C_Flush
: public Context
{
9928 epoch_t last_peering_reset
;
9931 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
9932 : pg(p
), oid(o
), last_peering_reset(lpr
),
9933 tid(0), start(ceph_clock_now())
9935 void finish(int r
) override
{
9936 if (r
== -ECANCELED
)
9939 if (last_peering_reset
== pg
->get_last_peering_reset()) {
9940 pg
->finish_flush(oid
, tid
, r
);
9941 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
9947 int PrimaryLogPG::start_flush(
9948 OpRequestRef op
, ObjectContextRef obc
,
9949 bool blocking
, hobject_t
*pmissing
,
9950 boost::optional
<std::function
<void()>> &&on_flush
)
9952 const object_info_t
& oi
= obc
->obs
.oi
;
9953 const hobject_t
& soid
= oi
.soid
;
9954 dout(10) << __func__
<< " " << soid
9955 << " v" << oi
.version
9956 << " uv" << oi
.user_version
9957 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
9960 // get a filtered snapset, need to remove removed snaps
9961 SnapSet snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
9963 // verify there are no (older) check for dirty clones
9965 dout(20) << " snapset " << snapset
<< dendl
;
9966 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
9967 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
9969 if (p
!= snapset
.clones
.rend()) {
9970 hobject_t next
= soid
;
9972 ceph_assert(next
.snap
< soid
.snap
);
9973 if (pg_log
.get_missing().is_missing(next
)) {
9974 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
9979 ObjectContextRef older_obc
= get_object_context(next
, false);
9981 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
9983 if (older_obc
->obs
.oi
.is_dirty()) {
9984 dout(10) << __func__
<< " next oldest clone is dirty: "
9985 << older_obc
->obs
.oi
<< dendl
;
9989 dout(20) << __func__
<< " next oldest clone " << next
9990 << " is not present; implicitly clean" << dendl
;
9993 dout(20) << __func__
<< " no older clones" << dendl
;
10000 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
10001 if (p
!= flush_ops
.end()) {
10002 FlushOpRef fop
= p
->second
;
10003 if (fop
->op
== op
) {
10004 // we couldn't take the write lock on a cache-try-flush before;
10005 // now we are trying again for the lock.
10006 return try_flush_mark_clean(fop
);
10008 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
10009 (fop
->blocking
|| !blocking
)) {
10010 // nonblocking can join anything
10011 // blocking can only join a blocking flush
10012 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
10014 fop
->dup_ops
.push_back(op
);
10015 return -EAGAIN
; // clean up this ctx; op will retry later
10018 // cancel current flush since it will fail anyway, or because we
10019 // are blocking and the existing flush is nonblocking.
10020 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
10022 osd
->reply_op_error(fop
->op
, -EBUSY
);
10023 while (!fop
->dup_ops
.empty()) {
10024 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
10025 fop
->dup_ops
.pop_front();
10027 vector
<ceph_tid_t
> tids
;
10028 cancel_flush(fop
, false, &tids
);
10029 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10032 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked()) {
10033 int r
= start_manifest_flush(op
, obc
, blocking
, std::move(on_flush
));
10034 if (r
!= -EINPROGRESS
) {
10042 * In general, we need to send a delete and a copyfrom.
10043 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10044 * where 4 is marked as clean. To flush 10, we have to:
10045 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10046 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10048 * There is a complicating case. Supposed there had been a clone 7
10049 * for snaps [7, 6] which has been trimmed since they no longer exist.
10050 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10051 * the delete, the snap will be promoted to 5, and the head will become
10052 * a whiteout. When the copy-from goes through, we'll end up with
10053 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10055 * Another complication is the case where there is an interval change
10056 * after doing the delete and the flush but before marking the object
10057 * clean. We'll happily delete head and then recreate it at the same
10058 * sequence number, which works out ok.
10061 SnapContext snapc
, dsnapc
;
10062 if (snapset
.seq
!= 0) {
10063 if (soid
.snap
== CEPH_NOSNAP
) {
10064 snapc
.seq
= snapset
.seq
;
10065 snapc
.snaps
= snapset
.snaps
;
10067 snapid_t min_included_snap
;
10068 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
10069 ceph_assert(p
!= snapset
.clone_snaps
.end());
10070 min_included_snap
= p
->second
.back();
10071 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
10074 snapid_t prev_snapc
= 0;
10075 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
10076 citer
!= snapset
.clones
.rend();
10078 if (*citer
< soid
.snap
) {
10079 prev_snapc
= *citer
;
10084 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
10087 object_locator_t
base_oloc(soid
);
10088 base_oloc
.pool
= pool
.info
.tier_of
;
10090 if (dsnapc
.seq
< snapc
.seq
) {
10093 osd
->objecter
->mutate(
10098 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10099 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
10100 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
10101 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
10104 FlushOpRef
fop(std::make_shared
<FlushOp
>());
10106 fop
->flushed_version
= oi
.user_version
;
10107 fop
->blocking
= blocking
;
10108 fop
->on_flush
= std::move(on_flush
);
10112 if (oi
.is_whiteout()) {
10113 fop
->removal
= true;
10116 object_locator_t
oloc(soid
);
10117 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
10118 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
10119 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
10120 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
10121 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
10122 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
10124 //mean the base tier don't cache data after this
10125 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
10126 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
10128 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
10130 unsigned n
= info
.pgid
.hash_to_shard(osd
->m_objecter_finishers
);
10131 ceph_tid_t tid
= osd
->objecter
->mutate(
10132 soid
.oid
, base_oloc
, o
, snapc
,
10133 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10134 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
10135 new C_OnFinisher(fin
,
10136 osd
->objecter_finishers
[n
]));
10137 /* we're under the pg lock and fin->finish() is grabbing that */
10139 fop
->objecter_tid
= tid
;
10141 flush_ops
[soid
] = fop
;
10142 info
.stats
.stats
.sum
.num_flush
++;
10143 info
.stats
.stats
.sum
.num_flush_kb
+= shift_round_up(oi
.size
, 10);
10144 return -EINPROGRESS
;
10147 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
10149 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10150 << " " << cpp_strerror(r
) << dendl
;
10151 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
10152 if (p
== flush_ops
.end()) {
10153 dout(10) << __func__
<< " no flush_op found" << dendl
;
10156 FlushOpRef fop
= p
->second
;
10157 if (tid
!= fop
->objecter_tid
&& !fop
->obc
->obs
.oi
.has_manifest()) {
10158 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
10159 << " tid " << fop
->objecter_tid
<< dendl
;
10162 ObjectContextRef obc
= fop
->obc
;
10163 fop
->objecter_tid
= 0;
10165 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
10167 osd
->reply_op_error(fop
->op
, -EBUSY
);
10168 if (fop
->blocking
) {
10170 kick_object_context_blocked(obc
);
10173 if (!fop
->dup_ops
.empty()) {
10174 dout(20) << __func__
<< " requeueing dups" << dendl
;
10175 requeue_ops(fop
->dup_ops
);
10177 if (fop
->on_flush
) {
10178 (*(fop
->on_flush
))();
10179 fop
->on_flush
= boost::none
;
10181 flush_ops
.erase(oid
);
10185 r
= try_flush_mark_clean(fop
);
10186 if (r
== -EBUSY
&& fop
->op
) {
10187 osd
->reply_op_error(fop
->op
, r
);
10191 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
10193 ObjectContextRef obc
= fop
->obc
;
10194 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
10196 if (fop
->blocking
) {
10198 kick_object_context_blocked(obc
);
10201 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
10202 !obc
->obs
.exists
) {
10203 if (obc
->obs
.exists
)
10204 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
10205 << " != current " << obc
->obs
.oi
.user_version
10208 dout(10) << __func__
<< " object no longer exists" << dendl
;
10210 if (!fop
->dup_ops
.empty()) {
10211 dout(20) << __func__
<< " requeueing dups" << dendl
;
10212 requeue_ops(fop
->dup_ops
);
10214 if (fop
->on_flush
) {
10215 (*(fop
->on_flush
))();
10216 fop
->on_flush
= boost::none
;
10218 flush_ops
.erase(oid
);
10220 osd
->logger
->inc(l_osd_tier_flush_fail
);
10222 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10226 if (!fop
->blocking
&&
10227 write_blocked_by_scrub(oid
)) {
10229 dout(10) << __func__
<< " blocked by scrub" << dendl
;
10230 requeue_op(fop
->op
);
10231 requeue_ops(fop
->dup_ops
);
10232 return -EAGAIN
; // will retry
10234 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10235 vector
<ceph_tid_t
> tids
;
10236 cancel_flush(fop
, false, &tids
);
10237 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10242 // successfully flushed, can we evict this object?
10243 if (!obc
->obs
.oi
.has_manifest() && !fop
->op
&&
10244 agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
10245 agent_maybe_evict(obc
, true)) {
10246 osd
->logger
->inc(l_osd_tier_clean
);
10247 if (fop
->on_flush
) {
10248 (*(fop
->on_flush
))();
10249 fop
->on_flush
= boost::none
;
10251 flush_ops
.erase(oid
);
10255 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
10256 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
10258 // successfully flushed; can we clear the dirty bit?
10259 // try to take the lock manually, since we don't
10261 if (ctx
->lock_manager
.get_lock_type(
10262 ObjectContext::RWState::RWWRITE
,
10266 dout(20) << __func__
<< " took write lock" << dendl
;
10267 } else if (fop
->op
) {
10268 dout(10) << __func__
<< " waiting on write lock " << fop
->op
<< " "
10269 << fop
->dup_ops
<< dendl
;
10270 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10271 for (auto op
: fop
->dup_ops
) {
10272 bool locked
= ctx
->lock_manager
.get_lock_type(
10273 ObjectContext::RWState::RWWRITE
,
10277 ceph_assert(!locked
);
10279 close_op_ctx(ctx
.release());
10280 return -EAGAIN
; // will retry
10282 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
10283 close_op_ctx(ctx
.release());
10284 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
10285 vector
<ceph_tid_t
> tids
;
10286 cancel_flush(fop
, false, &tids
);
10287 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10291 if (fop
->on_flush
) {
10292 ctx
->register_on_finish(*(fop
->on_flush
));
10293 fop
->on_flush
= boost::none
;
10296 ctx
->at_version
= get_next_version();
10298 ctx
->new_obs
= obc
->obs
;
10299 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
10300 --ctx
->delta_stats
.num_objects_dirty
;
10301 if (fop
->obc
->obs
.oi
.has_manifest()) {
10302 ceph_assert(obc
->obs
.oi
.manifest
.is_chunked());
10303 PGTransaction
* t
= ctx
->op_t
.get();
10304 uint64_t chunks_size
= 0;
10305 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10306 chunks_size
+= p
.second
.length
;
10308 if (ctx
->new_obs
.oi
.is_omap() && pool
.info
.supports_omap()) {
10309 t
->omap_clear(oid
);
10310 ctx
->new_obs
.oi
.clear_omap_digest();
10311 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
10313 if (obc
->obs
.oi
.size
== chunks_size
) {
10314 t
->truncate(oid
, 0);
10315 interval_set
<uint64_t> trim
;
10316 trim
.insert(0, ctx
->new_obs
.oi
.size
);
10317 ctx
->modified_ranges
.union_of(trim
);
10318 truncate_update_size_and_usage(ctx
->delta_stats
,
10321 ctx
->new_obs
.oi
.new_object();
10322 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10323 p
.second
.clear_flag(chunk_info_t::FLAG_DIRTY
);
10324 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
10327 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
10328 if (p
.second
.is_dirty()) {
10329 dout(20) << __func__
<< " offset: " << p
.second
.offset
10330 << " length: " << p
.second
.length
<< dendl
;
10331 p
.second
.clear_flag(chunk_info_t::FLAG_DIRTY
);
10332 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
); // CLEAN
10338 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
10340 osd
->logger
->inc(l_osd_tier_clean
);
10342 if (!fop
->dup_ops
.empty() || fop
->op
) {
10343 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
10344 list
<OpRequestRef
> ls
;
10346 ls
.push_back(fop
->op
);
10347 ls
.splice(ls
.end(), fop
->dup_ops
);
10351 simple_opc_submit(std::move(ctx
));
10353 flush_ops
.erase(oid
);
10356 osd
->logger
->inc(l_osd_tier_flush
);
10358 osd
->logger
->inc(l_osd_tier_try_flush
);
10360 return -EINPROGRESS
;
10363 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
,
10364 vector
<ceph_tid_t
> *tids
)
10366 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
10367 << fop
->objecter_tid
<< dendl
;
10368 if (fop
->objecter_tid
) {
10369 tids
->push_back(fop
->objecter_tid
);
10370 fop
->objecter_tid
= 0;
10372 if (fop
->io_tids
.size()) {
10373 for (auto &p
: fop
->io_tids
) {
10374 tids
->push_back(p
.second
);
10378 if (fop
->blocking
&& fop
->obc
->is_blocked()) {
10379 fop
->obc
->stop_block();
10380 kick_object_context_blocked(fop
->obc
);
10384 requeue_op(fop
->op
);
10385 requeue_ops(fop
->dup_ops
);
10387 if (fop
->on_flush
) {
10388 (*(fop
->on_flush
))();
10389 fop
->on_flush
= boost::none
;
10391 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
10394 void PrimaryLogPG::cancel_flush_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
10396 dout(10) << __func__
<< dendl
;
10397 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
10398 while (p
!= flush_ops
.end()) {
10399 cancel_flush((p
++)->second
, requeue
, tids
);
10403 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
10405 if (!pool
.info
.allow_incomplete_clones())
10407 if (is_missing_object(coid
))
10409 ObjectContextRef obc
= get_object_context(coid
, false);
10410 return obc
&& obc
->obs
.exists
;
10413 // ========================================================================
10416 class C_OSD_RepopCommit
: public Context
{
10417 PrimaryLogPGRef pg
;
10418 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
10420 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
10421 : pg(pg
), repop(repop
) {}
10422 void finish(int) override
{
10423 pg
->repop_all_committed(repop
.get());
10427 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
10429 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
10431 repop
->all_committed
= true;
10432 if (!repop
->rep_aborted
) {
10433 if (repop
->v
!= eversion_t()) {
10434 last_update_ondisk
= repop
->v
;
10435 last_complete_ondisk
= repop
->pg_local_last_complete
;
10441 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
10443 dout(10) << "op_applied version " << applied_version
<< dendl
;
10444 ceph_assert(applied_version
!= eversion_t());
10445 ceph_assert(applied_version
<= info
.last_update
);
10446 last_update_applied
= applied_version
;
10447 if (is_primary()) {
10448 if (scrubber
.active
) {
10449 if (last_update_applied
>= scrubber
.subset_last_update
) {
10450 requeue_scrub(ops_blocked_by_scrub());
10453 ceph_assert(scrubber
.start
== scrubber
.end
);
10458 void PrimaryLogPG::eval_repop(RepGather
*repop
)
10460 const MOSDOp
*m
= NULL
;
10462 m
= static_cast<const MOSDOp
*>(repop
->op
->get_req());
10465 dout(10) << "eval_repop " << *repop
<< dendl
;
10467 dout(10) << "eval_repop " << *repop
<< " (no op)" << dendl
;
10470 if (repop
->all_committed
) {
10471 dout(10) << " commit: " << *repop
<< dendl
;
10472 for (auto p
= repop
->on_committed
.begin();
10473 p
!= repop
->on_committed
.end();
10474 repop
->on_committed
.erase(p
++)) {
10477 // send dup commits, in order
10478 auto it
= waiting_for_ondisk
.find(repop
->v
);
10479 if (it
!= waiting_for_ondisk
.end()) {
10480 ceph_assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
10481 for (auto& i
: it
->second
) {
10482 int return_code
= repop
->r
;
10483 if (return_code
>= 0) {
10484 return_code
= std::get
<2>(i
);
10486 osd
->reply_op_error(std::get
<0>(i
), return_code
, repop
->v
,
10489 waiting_for_ondisk
.erase(it
);
10492 publish_stats_to_osd();
10493 calc_min_last_complete_ondisk();
10495 dout(10) << " removing " << *repop
<< dendl
;
10496 ceph_assert(!repop_queue
.empty());
10497 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
10498 if (repop_queue
.front() == repop
) {
10499 RepGather
*to_remove
= nullptr;
10500 while (!repop_queue
.empty() &&
10501 (to_remove
= repop_queue
.front())->all_committed
) {
10502 repop_queue
.pop_front();
10503 for (auto p
= to_remove
->on_success
.begin();
10504 p
!= to_remove
->on_success
.end();
10505 to_remove
->on_success
.erase(p
++)) {
10508 remove_repop(to_remove
);
10514 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
10517 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
10518 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
10522 repop
->v
= ctx
->at_version
;
10523 if (ctx
->at_version
> eversion_t()) {
10524 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
10525 i
!= acting_recovery_backfill
.end();
10527 if (*i
== get_primary()) continue;
10528 pg_info_t
&pinfo
= peer_info
[*i
];
10529 // keep peer_info up to date
10530 if (pinfo
.last_complete
== pinfo
.last_update
)
10531 pinfo
.last_complete
= ctx
->at_version
;
10532 pinfo
.last_update
= ctx
->at_version
;
10536 ctx
->op_t
->add_obc(ctx
->obc
);
10537 if (ctx
->clone_obc
) {
10538 ctx
->op_t
->add_obc(ctx
->clone_obc
);
10540 if (ctx
->head_obc
) {
10541 ctx
->op_t
->add_obc(ctx
->head_obc
);
10544 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
10545 if (!(ctx
->log
.empty())) {
10546 ceph_assert(ctx
->at_version
>= projected_last_update
);
10547 projected_last_update
= ctx
->at_version
;
10549 for (auto &&entry
: ctx
->log
) {
10550 projected_log
.add(entry
);
10553 bool requires_missing_loc
= false;
10554 for (set
<pg_shard_t
>::iterator i
= async_recovery_targets
.begin();
10555 i
!= async_recovery_targets
.end();
10557 if (*i
== get_primary() || !peer_missing
[*i
].is_missing(soid
)) continue;
10558 requires_missing_loc
= true;
10559 for (auto &&entry
: ctx
->log
) {
10560 peer_missing
[*i
].add_next_event(entry
);
10564 if (requires_missing_loc
) {
10565 for (auto &&entry
: ctx
->log
) {
10566 dout(30) << __func__
<< " missing_loc before: "
10567 << missing_loc
.get_locations(entry
.soid
) << dendl
;
10568 missing_loc
.add_missing(entry
.soid
, entry
.version
,
10569 eversion_t(), entry
.is_delete());
10570 // clear out missing_loc
10571 missing_loc
.clear_location(entry
.soid
);
10572 for (auto &i
: actingset
) {
10573 if (!peer_missing
[i
].is_missing(entry
.soid
))
10574 missing_loc
.add_location(entry
.soid
, i
);
10576 dout(30) << __func__
<< " missing_loc after: "
10577 << missing_loc
.get_locations(entry
.soid
) << dendl
;
10581 pgbackend
->submit_transaction(
10585 std::move(ctx
->op_t
),
10587 min_last_complete_ondisk
,
10589 ctx
->updated_hset_history
,
10596 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
10597 OpContext
*ctx
, ObjectContextRef obc
,
10598 ceph_tid_t rep_tid
)
10601 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
10603 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
10605 RepGather
*repop
= new RepGather(
10606 ctx
, rep_tid
, info
.last_complete
);
10608 repop
->start
= ceph_clock_now();
10610 repop_queue
.push_back(&repop
->queue_item
);
10613 osd
->logger
->inc(l_osd_op_wip
);
10615 dout(10) << __func__
<< ": " << *repop
<< dendl
;
10619 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
10620 eversion_t version
,
10622 ObcLockManager
&&manager
,
10624 boost::optional
<std::function
<void(void)> > &&on_complete
)
10626 RepGather
*repop
= new RepGather(
10627 std::move(manager
),
10629 std::move(on_complete
),
10631 info
.last_complete
,
10633 repop
->v
= version
;
10635 repop
->start
= ceph_clock_now();
10637 repop_queue
.push_back(&repop
->queue_item
);
10639 osd
->logger
->inc(l_osd_op_wip
);
10641 dout(10) << __func__
<< ": " << *repop
<< dendl
;
10642 return boost::intrusive_ptr
<RepGather
>(repop
);
10645 void PrimaryLogPG::remove_repop(RepGather
*repop
)
10647 dout(20) << __func__
<< " " << *repop
<< dendl
;
10649 for (auto p
= repop
->on_finish
.begin();
10650 p
!= repop
->on_finish
.end();
10651 repop
->on_finish
.erase(p
++)) {
10655 release_object_locks(
10656 repop
->lock_manager
);
10659 osd
->logger
->dec(l_osd_op_wip
);
10662 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
10664 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
10665 ceph_tid_t rep_tid
= osd
->get_tid();
10666 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
10667 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
10668 ctx
->op_t
.reset(new PGTransaction());
10669 ctx
->mtime
= ceph_clock_now();
10673 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
10675 RepGather
*repop
= new_repop(ctx
.get(), ctx
->obc
, ctx
->reqid
.tid
);
10676 dout(20) << __func__
<< " " << repop
<< dendl
;
10677 issue_repop(repop
, ctx
.get());
10679 if (hard_limit_pglog())
10680 calc_trim_to_aggressive();
10687 void PrimaryLogPG::submit_log_entries(
10688 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
10689 ObcLockManager
&&manager
,
10690 boost::optional
<std::function
<void(void)> > &&_on_complete
,
10694 dout(10) << __func__
<< " " << entries
<< dendl
;
10695 ceph_assert(is_primary());
10697 eversion_t version
;
10698 if (!entries
.empty()) {
10699 ceph_assert(entries
.rbegin()->version
>= projected_last_update
);
10700 version
= projected_last_update
= entries
.rbegin()->version
;
10703 boost::intrusive_ptr
<RepGather
> repop
;
10704 boost::optional
<std::function
<void(void)> > on_complete
;
10705 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
10709 std::move(manager
),
10711 std::move(_on_complete
));
10713 on_complete
= std::move(_on_complete
);
10716 pgbackend
->call_write_ordered(
10717 [this, entries
, repop
, on_complete
]() {
10718 ObjectStore::Transaction t
;
10719 eversion_t old_last_update
= info
.last_update
;
10720 merge_new_log_entries(entries
, t
, pg_trim_to
, min_last_complete_ondisk
);
10723 set
<pg_shard_t
> waiting_on
;
10724 for (set
<pg_shard_t
>::const_iterator i
= acting_recovery_backfill
.begin();
10725 i
!= acting_recovery_backfill
.end();
10727 pg_shard_t
peer(*i
);
10728 if (peer
== pg_whoami
) continue;
10729 ceph_assert(peer_missing
.count(peer
));
10730 ceph_assert(peer_info
.count(peer
));
10731 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
10732 ceph_assert(repop
);
10733 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
10735 spg_t(info
.pgid
.pgid
, i
->shard
),
10737 get_osdmap_epoch(),
10738 last_peering_reset
,
10741 min_last_complete_ondisk
);
10742 osd
->send_message_osd_cluster(
10743 peer
.osd
, m
, get_osdmap_epoch());
10744 waiting_on
.insert(peer
);
10746 MOSDPGLog
*m
= new MOSDPGLog(
10747 peer
.shard
, pg_whoami
.shard
,
10748 info
.last_update
.epoch
,
10749 info
, last_peering_reset
);
10750 m
->log
.log
= entries
;
10751 m
->log
.tail
= old_last_update
;
10752 m
->log
.head
= info
.last_update
;
10753 osd
->send_message_osd_cluster(
10754 peer
.osd
, m
, get_osdmap_epoch());
10757 ceph_tid_t rep_tid
= repop
->rep_tid
;
10758 waiting_on
.insert(pg_whoami
);
10759 log_entry_update_waiting_on
.insert(
10762 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
10764 struct OnComplete
: public Context
{
10765 PrimaryLogPGRef pg
;
10766 ceph_tid_t rep_tid
;
10769 PrimaryLogPGRef pg
,
10770 ceph_tid_t rep_tid
,
10772 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
10773 void finish(int) override
{
10775 if (!pg
->pg_has_reset_since(epoch
)) {
10776 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
10777 ceph_assert(it
!= pg
->log_entry_update_waiting_on
.end());
10778 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
10779 ceph_assert(it2
!= it
->second
.waiting_on
.end());
10780 it
->second
.waiting_on
.erase(it2
);
10781 if (it
->second
.waiting_on
.empty()) {
10782 pg
->repop_all_committed(it
->second
.repop
.get());
10783 pg
->log_entry_update_waiting_on
.erase(it
);
10789 t
.register_on_commit(
10790 new OnComplete
{this, rep_tid
, get_osdmap_epoch()});
10791 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
10792 ceph_assert(r
== 0);
10793 op_applied(info
.last_update
);
10796 if (hard_limit_pglog())
10797 calc_trim_to_aggressive();
10802 void PrimaryLogPG::cancel_log_updates()
10804 // get rid of all the LogUpdateCtx so their references to repops are
10806 log_entry_update_waiting_on
.clear();
10809 // -------------------------------------------------------
10811 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> *ls
)
10814 pair
<hobject_t
, ObjectContextRef
> i
;
10815 while (object_contexts
.get_next(i
.first
, &i
)) {
10816 ObjectContextRef
obc(i
.second
);
10817 get_obc_watchers(obc
, *ls
);
10822 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
10824 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
10825 obc
->watchers
.begin();
10826 j
!= obc
->watchers
.end();
10828 obj_watch_item_t owi
;
10830 owi
.obj
= obc
->obs
.oi
.soid
;
10831 owi
.wi
.addr
= j
->second
->get_peer_addr();
10832 owi
.wi
.name
= j
->second
->get_entity();
10833 owi
.wi
.cookie
= j
->second
->get_cookie();
10834 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
10836 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
10837 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
10839 pg_watchers
.push_back(owi
);
10843 void PrimaryLogPG::check_blacklisted_watchers()
10845 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl
;
10846 pair
<hobject_t
, ObjectContextRef
> i
;
10847 while (object_contexts
.get_next(i
.first
, &i
))
10848 check_blacklisted_obc_watchers(i
.second
);
10851 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc
)
10853 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
10854 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
10855 obc
->watchers
.begin();
10856 k
!= obc
->watchers
.end();
10858 //Advance iterator now so handle_watch_timeout() can erase element
10859 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
10860 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
10861 entity_addr_t ea
= j
->second
->get_peer_addr();
10862 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
10863 if (get_osdmap()->is_blacklisted(ea
)) {
10864 dout(10) << "watch: Found blacklisted watcher for " << ea
<< dendl
;
10865 ceph_assert(j
->second
->get_pg() == this);
10866 j
->second
->unregister_cb();
10867 handle_watch_timeout(j
->second
);
10872 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
10874 ceph_assert(is_active());
10875 auto it_objects
= pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
);
10876 ceph_assert((recovering
.count(obc
->obs
.oi
.soid
) ||
10877 !is_missing_object(obc
->obs
.oi
.soid
)) ||
10878 (it_objects
!= pg_log
.get_log().objects
.end() && // or this is a revert... see recover_primary()
10879 it_objects
->second
->op
==
10880 pg_log_entry_t::LOST_REVERT
&&
10881 it_objects
->second
->reverting_to
==
10882 obc
->obs
.oi
.version
));
10884 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
10885 ceph_assert(obc
->watchers
.empty());
10886 // populate unconnected_watchers
10887 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
10888 obc
->obs
.oi
.watchers
.begin();
10889 p
!= obc
->obs
.oi
.watchers
.end();
10891 utime_t expire
= info
.stats
.last_became_active
;
10892 expire
+= p
->second
.timeout_seconds
;
10893 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
10895 Watch::makeWatchRef(
10896 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
10897 p
->first
.second
, p
->second
.addr
));
10898 watch
->disconnect();
10899 obc
->watchers
.insert(
10901 make_pair(p
->first
.first
, p
->first
.second
),
10904 // Look for watchers from blacklisted clients and drop
10905 check_blacklisted_obc_watchers(obc
);
10908 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
10910 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
10911 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
10913 if (!is_active()) {
10914 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
10917 if (!obc
->obs
.exists
) {
10918 dout(10) << __func__
<< " object " << obc
->obs
.oi
.soid
<< " dne" << dendl
;
10921 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
10922 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
10923 watch
->get_delayed_cb()
10925 dout(10) << "handle_watch_timeout waiting for degraded on obj "
10926 << obc
->obs
.oi
.soid
10931 if (write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
10932 dout(10) << "handle_watch_timeout waiting for scrub on obj "
10933 << obc
->obs
.oi
.soid
10935 scrubber
.add_callback(
10936 watch
->get_delayed_cb() // This callback!
10941 OpContextUPtr ctx
= simple_opc_create(obc
);
10942 ctx
->at_version
= get_next_version();
10944 object_info_t
& oi
= ctx
->new_obs
.oi
;
10945 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
10946 watch
->get_entity()));
10948 list
<watch_disconnect_t
> watch_disconnects
= {
10949 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
10951 ctx
->register_on_success(
10952 [this, obc
, watch_disconnects
]() {
10953 complete_disconnect_watches(obc
, watch_disconnects
);
10957 PGTransaction
*t
= ctx
->op_t
.get();
10958 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
10962 osd_reqid_t(), ctx
->mtime
, 0));
10964 oi
.prior_version
= obc
->obs
.oi
.version
;
10965 oi
.version
= ctx
->at_version
;
10967 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
10968 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
10970 // apply new object state.
10971 ctx
->obc
->obs
= ctx
->new_obs
;
10973 // no ctx->delta_stats
10974 simple_opc_submit(std::move(ctx
));
10977 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
10978 SnapSetContext
*ssc
)
10980 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
10981 ceph_assert(obc
->destructor_callback
== NULL
);
10982 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
10984 obc
->obs
.exists
= false;
10987 register_snapset_context(ssc
);
10988 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
10990 populate_obc_watchers(obc
);
10994 ObjectContextRef
PrimaryLogPG::get_object_context(
10995 const hobject_t
& soid
,
10997 const map
<string
, bufferlist
> *attrs
)
10999 auto it_objects
= pg_log
.get_log().objects
.find(soid
);
11001 attrs
|| !pg_log
.get_missing().is_missing(soid
) ||
11002 // or this is a revert... see recover_primary()
11003 (it_objects
!= pg_log
.get_log().objects
.end() &&
11004 it_objects
->second
->op
==
11005 pg_log_entry_t::LOST_REVERT
));
11006 ObjectContextRef obc
= object_contexts
.lookup(soid
);
11007 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
11009 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
11010 dout(10) << __func__
<< ": found obc in cache: " << obc
11013 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
11017 auto it_oi
= attrs
->find(OI_ATTR
);
11018 ceph_assert(it_oi
!= attrs
->end());
11019 bv
= it_oi
->second
;
11021 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
11024 dout(10) << __func__
<< ": no obc for soid "
11025 << soid
<< " and !can_create"
11027 return ObjectContextRef(); // -ENOENT!
11030 dout(10) << __func__
<< ": no obc for soid "
11031 << soid
<< " but can_create"
11034 object_info_t
oi(soid
);
11035 SnapSetContext
*ssc
= get_snapset_context(
11036 soid
, true, 0, false);
11038 obc
= create_object_context(oi
, ssc
);
11039 dout(10) << __func__
<< ": " << obc
<< " " << soid
11040 << " " << obc
->rwstate
11041 << " oi: " << obc
->obs
.oi
11042 << " ssc: " << obc
->ssc
11043 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11050 bufferlist::const_iterator bliter
= bv
.begin();
11051 decode(oi
, bliter
);
11053 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
11054 return ObjectContextRef(); // -ENOENT!
11057 ceph_assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
11059 obc
= object_contexts
.lookup_or_create(oi
.soid
);
11060 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11062 obc
->obs
.exists
= true;
11064 obc
->ssc
= get_snapset_context(
11066 soid
.has_snapset() ? attrs
: 0);
11069 populate_obc_watchers(obc
);
11071 if (pool
.info
.is_erasure()) {
11073 obc
->attr_cache
= *attrs
;
11075 int r
= pgbackend
->objects_get_attrs(
11078 ceph_assert(r
== 0);
11082 dout(10) << __func__
<< ": creating obc from disk: " << obc
11086 // XXX: Caller doesn't expect this
11087 if (obc
->ssc
== NULL
) {
11088 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
11089 return ObjectContextRef(); // -ENOENT!
11092 dout(10) << __func__
<< ": " << obc
<< " " << soid
11093 << " " << obc
->rwstate
11094 << " oi: " << obc
->obs
.oi
11095 << " exists: " << (int)obc
->obs
.exists
11096 << " ssc: " << obc
->ssc
11097 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
11101 void PrimaryLogPG::context_registry_on_change()
11103 pair
<hobject_t
, ObjectContextRef
> i
;
11104 while (object_contexts
.get_next(i
.first
, &i
)) {
11105 ObjectContextRef
obc(i
.second
);
11107 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11108 obc
->watchers
.begin();
11109 j
!= obc
->watchers
.end();
11110 obc
->watchers
.erase(j
++)) {
11111 j
->second
->discard();
11119 * If we return an error, and set *pmissing, then promoting that
11122 * If we return -EAGAIN, we will always set *pmissing to the missing
11123 * object to wait for.
11125 * If we return an error but do not set *pmissing, then we know the
11126 * object does not exist.
11128 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
11129 ObjectContextRef
*pobc
,
11131 bool map_snapid_to_clone
,
11132 hobject_t
*pmissing
)
11135 ceph_assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
11137 if (oid
.snap
== CEPH_NOSNAP
) {
11138 ObjectContextRef obc
= get_object_context(oid
, can_create
);
11144 dout(10) << __func__
<< " " << oid
11145 << " @" << oid
.snap
11146 << " oi=" << obc
->obs
.oi
11153 hobject_t head
= oid
.get_head();
11156 if (!map_snapid_to_clone
&& pool
.info
.is_removed_snap(oid
.snap
)) {
11157 dout(10) << __func__
<< " snap " << oid
.snap
<< " is removed" << dendl
;
11161 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
11162 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
11163 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
11165 *pmissing
= head
; // start by getting the head
11167 put_snapset_context(ssc
);
11171 if (map_snapid_to_clone
) {
11172 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11173 << " snapset " << ssc
->snapset
11174 << " map_snapid_to_clone=true" << dendl
;
11175 if (oid
.snap
> ssc
->snapset
.seq
) {
11176 // already must be readable
11177 ObjectContextRef obc
= get_object_context(head
, false);
11178 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11179 << " snapset " << ssc
->snapset
11180 << " maps to head" << dendl
;
11182 put_snapset_context(ssc
);
11183 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
11185 vector
<snapid_t
>::const_iterator citer
= std::find(
11186 ssc
->snapset
.clones
.begin(),
11187 ssc
->snapset
.clones
.end(),
11189 if (citer
== ssc
->snapset
.clones
.end()) {
11190 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11191 << " snapset " << ssc
->snapset
11192 << " maps to nothing" << dendl
;
11193 put_snapset_context(ssc
);
11197 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11198 << " snapset " << ssc
->snapset
11199 << " maps to " << oid
<< dendl
;
11201 if (pg_log
.get_missing().is_missing(oid
)) {
11202 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11203 << " snapset " << ssc
->snapset
11204 << " " << oid
<< " is missing" << dendl
;
11207 put_snapset_context(ssc
);
11211 ObjectContextRef obc
= get_object_context(oid
, false);
11212 if (!obc
|| !obc
->obs
.exists
) {
11213 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11214 << " snapset " << ssc
->snapset
11215 << " " << oid
<< " is not present" << dendl
;
11218 put_snapset_context(ssc
);
11221 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11222 << " snapset " << ssc
->snapset
11223 << " " << oid
<< " HIT" << dendl
;
11225 put_snapset_context(ssc
);
11228 ceph_abort(); //unreachable
11231 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
11232 << " snapset " << ssc
->snapset
<< dendl
;
11235 if (oid
.snap
> ssc
->snapset
.seq
) {
11236 ObjectContextRef obc
= get_object_context(head
, false);
11237 dout(10) << __func__
<< " " << head
11238 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
11239 << " -- HIT " << obc
->obs
11244 ceph_assert(ssc
== obc
->ssc
);
11245 put_snapset_context(ssc
);
11251 // which clone would it be?
11253 while (k
< ssc
->snapset
.clones
.size() &&
11254 ssc
->snapset
.clones
[k
] < oid
.snap
)
11256 if (k
== ssc
->snapset
.clones
.size()) {
11257 dout(10) << __func__
<< " no clones with last >= oid.snap "
11258 << oid
.snap
<< " -- DNE" << dendl
;
11259 put_snapset_context(ssc
);
11262 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
11263 info
.pgid
.pool(), oid
.get_namespace());
11265 if (pg_log
.get_missing().is_missing(soid
)) {
11266 dout(20) << __func__
<< " " << soid
<< " missing, try again later"
11270 put_snapset_context(ssc
);
11274 ObjectContextRef obc
= get_object_context(soid
, false);
11275 if (!obc
|| !obc
->obs
.exists
) {
11278 put_snapset_context(ssc
);
11279 if (is_degraded_or_backfilling_object(soid
)) {
11280 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
11282 } else if (is_degraded_on_async_recovery_target(soid
)) {
11283 dout(20) << __func__
<< " clone is recovering " << soid
<< dendl
;
11286 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
11294 ceph_assert(obc
->ssc
== ssc
);
11295 put_snapset_context(ssc
);
11300 dout(20) << __func__
<< " " << soid
11301 << " snapset " << obc
->ssc
->snapset
11303 snapid_t first
, last
;
11304 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
11305 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
11306 if (p
->second
.empty()) {
11307 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
11308 ceph_assert(!cct
->_conf
->osd_debug_verify_snaps
);
11311 first
= p
->second
.back();
11312 last
= p
->second
.front();
11313 if (first
<= oid
.snap
) {
11314 dout(20) << __func__
<< " " << soid
<< " [" << first
<< "," << last
11315 << "] contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
11319 dout(20) << __func__
<< " " << soid
<< " [" << first
<< "," << last
11320 << "] does not contain " << oid
.snap
<< " -- DNE" << dendl
;
11325 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
11328 put_snapset_context(obc
->ssc
);
11331 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
11333 object_info_t
& oi
= obc
->obs
.oi
;
11335 dout(10) << __func__
<< " " << oi
.soid
<< dendl
;
11336 ceph_assert(!oi
.soid
.is_snapdir());
11338 object_stat_sum_t stat
;
11339 stat
.num_objects
++;
11341 stat
.num_objects_dirty
++;
11342 if (oi
.is_whiteout())
11343 stat
.num_whiteouts
++;
11345 stat
.num_objects_omap
++;
11346 if (oi
.is_cache_pinned())
11347 stat
.num_objects_pinned
++;
11348 if (oi
.has_manifest())
11349 stat
.num_objects_manifest
++;
11351 if (oi
.soid
.is_snap()) {
11352 stat
.num_object_clones
++;
11355 obc
->ssc
= get_snapset_context(oi
.soid
, false);
11356 ceph_assert(obc
->ssc
);
11357 stat
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(oi
.soid
.snap
);
11359 stat
.num_bytes
+= oi
.size
;
11363 pgstat
->stats
.sum
.add(stat
);
11366 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
11368 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
11369 if (obc
->is_blocked()) {
11370 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
11374 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
11375 if (p
!= waiting_for_blocked_object
.end()) {
11376 list
<OpRequestRef
>& ls
= p
->second
;
11377 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
11379 waiting_for_blocked_object
.erase(p
);
11382 map
<hobject_t
, ObjectContextRef
>::iterator i
=
11383 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
11384 if (i
!= objects_blocked_on_snap_promotion
.end()) {
11385 ceph_assert(i
->second
== obc
);
11386 objects_blocked_on_snap_promotion
.erase(i
);
11389 if (obc
->requeue_scrub_on_unblock
) {
11390 obc
->requeue_scrub_on_unblock
= false;
11395 SnapSetContext
*PrimaryLogPG::get_snapset_context(
11396 const hobject_t
& oid
,
11398 const map
<string
, bufferlist
> *attrs
,
11401 std::lock_guard
l(snapset_contexts_lock
);
11402 SnapSetContext
*ssc
;
11403 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
11404 oid
.get_snapdir());
11405 if (p
!= snapset_contexts
.end()) {
11406 if (can_create
|| p
->second
->exists
) {
11415 if (!(oid
.is_head() && !oid_existed
)) {
11416 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
11418 if (r
< 0 && !can_create
)
11421 auto it_ss
= attrs
->find(SS_ATTR
);
11422 ceph_assert(it_ss
!= attrs
->end());
11423 bv
= it_ss
->second
;
11425 ssc
= new SnapSetContext(oid
.get_snapdir());
11426 _register_snapset_context(ssc
);
11428 bufferlist::const_iterator bvp
= bv
.begin();
11430 ssc
->snapset
.decode(bvp
);
11431 } catch (buffer::error
& e
) {
11432 dout(0) << __func__
<< " Can't decode snapset: " << e
<< dendl
;
11435 ssc
->exists
= true;
11437 ssc
->exists
= false;
11445 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
11447 std::lock_guard
l(snapset_contexts_lock
);
11449 if (ssc
->ref
== 0) {
11450 if (ssc
->registered
)
11451 snapset_contexts
.erase(ssc
->oid
);
11458 * NONE - didn't pull anything
11459 * YES - pulled what the caller wanted
11460 * HEAD - needed to pull head first
11462 enum { PULL_NONE
, PULL_HEAD
, PULL_YES
};
11464 int PrimaryLogPG::recover_missing(
11465 const hobject_t
&soid
, eversion_t v
,
11467 PGBackend::RecoveryHandle
*h
)
11469 if (missing_loc
.is_unfound(soid
)) {
11470 dout(7) << __func__
<< " " << soid
11472 << " but it is unfound" << dendl
;
11476 if (missing_loc
.is_deleted(soid
)) {
11477 start_recovery_op(soid
);
11478 ceph_assert(!recovering
.count(soid
));
11479 recovering
.insert(make_pair(soid
, ObjectContextRef()));
11480 epoch_t cur_epoch
= get_osdmap_epoch();
11481 remove_missing_object(soid
, v
, new FunctionContext(
11484 if (!pg_has_reset_since(cur_epoch
)) {
11485 bool object_missing
= false;
11486 for (const auto& shard
: acting_recovery_backfill
) {
11487 if (shard
== pg_whoami
)
11489 if (peer_missing
[shard
].is_missing(soid
)) {
11490 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
11491 object_missing
= true;
11495 if (!object_missing
) {
11496 object_stat_sum_t stat_diff
;
11497 stat_diff
.num_objects_recovered
= 1;
11498 if (scrub_after_recovery
)
11499 stat_diff
.num_objects_repaired
= 1;
11500 on_global_recover(soid
, stat_diff
, true);
11502 auto recovery_handle
= pgbackend
->open_recovery_op();
11503 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
11504 pgbackend
->run_recovery_op(recovery_handle
, priority
);
11512 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
11513 ObjectContextRef obc
;
11514 ObjectContextRef head_obc
;
11515 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
11516 // do we have the head?
11517 hobject_t head
= soid
.get_head();
11518 if (pg_log
.get_missing().is_missing(head
)) {
11519 if (recovering
.count(head
)) {
11520 dout(10) << " missing but already recovering head " << head
<< dendl
;
11523 int r
= recover_missing(
11524 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
11526 if (r
!= PULL_NONE
)
11531 head_obc
= get_object_context(
11535 ceph_assert(head_obc
);
11537 start_recovery_op(soid
);
11538 ceph_assert(!recovering
.count(soid
));
11539 recovering
.insert(make_pair(soid
, obc
));
11540 int r
= pgbackend
->recover_object(
11546 // This is only a pull which shouldn't return an error
11547 ceph_assert(r
>= 0);
11551 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
11552 eversion_t v
, Context
*on_complete
)
11554 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
11555 ceph_assert(on_complete
!= nullptr);
11557 ObjectStore::Transaction t
;
11558 remove_snap_mapped_object(t
, soid
);
11560 ObjectRecoveryInfo recovery_info
;
11561 recovery_info
.soid
= soid
;
11562 recovery_info
.version
= v
;
11564 epoch_t cur_epoch
= get_osdmap_epoch();
11565 t
.register_on_complete(new FunctionContext(
11568 if (!pg_has_reset_since(cur_epoch
)) {
11569 ObjectStore::Transaction t2
;
11570 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
11571 t2
.register_on_complete(on_complete
);
11572 int r
= osd
->store
->queue_transaction(ch
, std::move(t2
), nullptr);
11573 ceph_assert(r
== 0);
11577 on_complete
->complete(-EAGAIN
);
11580 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
11581 ceph_assert(r
== 0);
11584 void PrimaryLogPG::finish_degraded_object(const hobject_t
& oid
)
11586 dout(10) << __func__
<< " " << oid
<< dendl
;
11587 if (callbacks_for_degraded_object
.count(oid
)) {
11588 list
<Context
*> contexts
;
11589 contexts
.swap(callbacks_for_degraded_object
[oid
]);
11590 callbacks_for_degraded_object
.erase(oid
);
11591 for (list
<Context
*>::iterator i
= contexts
.begin();
11592 i
!= contexts
.end();
11597 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
11599 if (i
!= objects_blocked_on_degraded_snap
.end() &&
11600 i
->second
== oid
.snap
)
11601 objects_blocked_on_degraded_snap
.erase(i
);
11604 void PrimaryLogPG::_committed_pushed_object(
11605 epoch_t epoch
, eversion_t last_complete
)
11608 if (!pg_has_reset_since(epoch
)) {
11609 dout(10) << __func__
<< " last_complete " << last_complete
<< " now ondisk" << dendl
;
11610 last_complete_ondisk
= last_complete
;
11612 if (last_complete_ondisk
== info
.last_update
) {
11613 if (!is_primary()) {
11614 // Either we are a replica or backfill target.
11615 // we are fully up to date. tell the primary!
11616 osd
->send_message_osd_cluster(
11619 get_osdmap_epoch(),
11620 spg_t(info
.pgid
.pgid
, get_primary().shard
),
11621 last_complete_ondisk
),
11622 get_osdmap_epoch());
11624 calc_min_last_complete_ondisk();
11629 dout(10) << __func__
<< " pg has changed, not touching last_complete_ondisk" << dendl
;
11635 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
11637 dout(20) << __func__
<< dendl
;
11639 dout(20) << "obc = " << *obc
<< dendl
;
11641 ceph_assert(active_pushes
>= 1);
11644 // requeue an active chunky scrub waiting on recovery ops
11645 if (!deleting
&& active_pushes
== 0
11646 && scrubber
.is_chunky_scrub_active()) {
11647 requeue_scrub(ops_blocked_by_scrub());
// Replica-side analogue of _applied_recovered_object: accounts for a
// completed push and, if a chunky replica scrub request was parked while
// recovery was in flight, re-enqueues it for processing.
// NOTE(review): garbled extraction — the enqueue call's head (around
// original lines 11662-11663) and the decrement of active_pushes are
// missing; fragments below.
11651 void PrimaryLogPG::_applied_recovered_object_replica()
11653 dout(20) << __func__
<< dendl
;
11654 ceph_assert(active_pushes
>= 1);
11657 // requeue an active chunky scrub waiting on recovery ops
11658 if (!deleting
&& active_pushes
== 0 &&
11659 scrubber
.active_rep_scrub
&& static_cast<const MOSDRepScrub
*>(
11660 scrubber
.active_rep_scrub
->get_req())->chunky
) {
11661 auto& op
= scrubber
.active_rep_scrub
;
// Re-wrap the parked scrub request as a PG op and hand it back to the
// op queue with its original cost/priority/timestamps.
11664 unique_ptr
<OpQueueItem::OpQueueable
>(new PGOpItem(info
.pgid
, op
)),
11665 op
->get_req()->get_cost(),
11666 op
->get_req()->get_priority(),
11667 op
->get_req()->get_recv_stamp(),
11668 op
->get_req()->get_source().num(),
11669 get_osdmap_epoch()));
// The queue now owns the request; drop our parked reference.
11670 scrubber
.active_rep_scrub
.reset();
// Record that a missing object `oid` has been recovered at version `v`:
// delegates to pg_log.recover_got() (which may advance info.last_complete)
// and logs the resulting complete_to position. Fragments below are from a
// garbled extraction (the if/else closers and dendl lines are missing).
11674 void PrimaryLogPG::recover_got(hobject_t oid
, eversion_t v
)
11676 dout(10) << "got missing " << oid
<< " v " << v
<< dendl
;
11677 pg_log
.recover_got(oid
, v
, info
);
// complete_to still inside the log => not everything is recovered yet.
11678 if (pg_log
.get_log().complete_to
!= pg_log
.get_log().log
.end()) {
11679 dout(10) << "last_complete now " << info
.last_complete
11680 << " log.complete_to " << pg_log
.get_log().complete_to
->version
// complete_to at end: recovery of the log range is done; last_complete
// must have caught up to last_update (except in repair, per comment).
11683 dout(10) << "last_complete now " << info
.last_complete
11684 << " log.complete_to at end" << dendl
;
11685 //below is not true in the repair case.
11686 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
11687 ceph_assert(info
.last_complete
== info
.last_update
);
11691 void PrimaryLogPG::primary_failed(const hobject_t
&soid
)
11693 list
<pg_shard_t
> fl
= { pg_whoami
};
11694 failed_push(fl
, soid
);
// Recovery of `soid` from the shards in `from` failed: release the
// recovery read lock (requeueing any ops it blocked), forget the in-flight
// recovery, and drop the failed shards as candidate locations so a later
// attempt picks a different source (or the object becomes unfound).
// NOTE(review): garbled extraction — the null-guard around obc (original
// line 11702) and braces are missing; fragments below.
11697 void PrimaryLogPG::failed_push(const list
<pg_shard_t
> &from
, const hobject_t
&soid
)
11699 dout(20) << __func__
<< ": " << soid
<< dendl
;
11700 ceph_assert(recovering
.count(soid
));
11701 auto obc
= recovering
[soid
];
// Unblock anything that was waiting on this object's recovery read lock.
11703 list
<OpRequestRef
> blocked_ops
;
11704 obc
->drop_recovery_read(&blocked_ops
);
11705 requeue_ops(blocked_ops
);
11707 recovering
.erase(soid
);
// Each shard that failed to serve the push is no longer a valid location.
11708 for (auto&& i
: from
)
11709 missing_loc
.remove_location(soid
, i
);
11710 dout(0) << __func__
<< " " << soid
<< " from shard " << from
11711 << ", reps on " << missing_loc
.get_locations(soid
)
11712 << " unfound? " << missing_loc
.is_unfound(soid
) << dendl
;
11713 finish_recovery_op(soid
); // close out this attempt,
// For a LOST_REVERT: pick the newest version of `oid` that some member of
// the acting/recovery/backfill set still has (starting from our own `have`
// in the missing item, then taking the max over peers' `have` values).
// NOTE(review): garbled extraction — the declaration of `v` (from
// pmi.have), the max-update/continue lines, the return, and all closers
// are missing; fragments below.
11716 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
11719 pg_missing_item pmi
;
11720 bool is_missing
= pg_log
.get_missing().is_missing(oid
, &pmi
);
// Caller guarantees the object is missing locally.
11721 ceph_assert(is_missing
);
11723 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
11725 ceph_assert(!acting_recovery_backfill
.empty());
// Scan every non-primary shard for the version it still has.
11726 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
11727 i
!= acting_recovery_backfill
.end();
11729 if (*i
== get_primary()) continue;
11730 pg_shard_t peer
= *i
;
// Peer has the object intact (not in its missing set): candidate source.
11731 if (!peer_missing
[peer
].is_missing(oid
)) {
11734 eversion_t h
= peer_missing
[peer
].get_items().at(oid
).have
;
11735 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
11740 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
11744 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
11746 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
11748 ceph_assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
11749 ObjectStore::Transaction t
;
11750 boost::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
11751 if (m
->pg_trim_to
!= eversion_t())
11752 op_trim_to
= m
->pg_trim_to
;
11753 if (m
->pg_roll_forward_to
!= eversion_t())
11754 op_roll_forward_to
= m
->pg_roll_forward_to
;
11756 dout(20) << __func__
<< " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
11758 append_log_entries_update_missing(m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
11759 eversion_t new_lcod
= info
.last_complete
;
11761 Context
*complete
= new FunctionContext(
11763 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
11766 if (!pg_has_reset_since(msg
->get_epoch())) {
11767 update_last_complete_ondisk(new_lcod
);
11768 MOSDPGUpdateLogMissingReply
*reply
=
11769 new MOSDPGUpdateLogMissingReply(
11770 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
11776 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
11777 msg
->get_connection()->send_message(reply
);
11782 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
11783 t
.register_on_commit(complete
);
11785 /* Hack to work around the fact that ReplicatedBackend sends
11786 * ack+commit if commit happens first
11788 * This behavior is no longer necessary, but we preserve it so old
11789 * primaries can keep their repops in order */
11790 if (pool
.info
.is_erasure()) {
11791 t
.register_on_complete(complete
);
11793 t
.register_on_commit(complete
);
11796 int tr
= osd
->store
->queue_transaction(
11800 ceph_assert(tr
== 0);
11801 op_applied(info
.last_update
);
11804 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
11806 const MOSDPGUpdateLogMissingReply
*m
=
11807 static_cast<const MOSDPGUpdateLogMissingReply
*>(
11809 dout(20) << __func__
<< " got reply from "
11810 << m
->get_from() << dendl
;
11812 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
11813 if (it
!= log_entry_update_waiting_on
.end()) {
11814 if (it
->second
.waiting_on
.count(m
->get_from())) {
11815 it
->second
.waiting_on
.erase(m
->get_from());
11816 if (m
->last_complete_ondisk
!= eversion_t()) {
11817 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
11821 << info
.pgid
<< " got reply "
11822 << *m
<< " from shard we are not waiting for "
11826 if (it
->second
.waiting_on
.empty()) {
11827 repop_all_committed(it
->second
.repop
.get());
11828 log_entry_update_waiting_on
.erase(it
);
11832 << info
.pgid
<< " got reply "
11833 << *m
<< " on unknown tid " << m
->get_tid();
11837 /* Mark all unfound objects as lost.
11839 void PrimaryLogPG::mark_all_unfound_lost(
11844 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
11845 list
<hobject_t
> oids
;
11847 dout(30) << __func__
<< ": log before:\n";
11848 pg_log
.get_log().print(*_dout
);
11851 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
11853 utime_t mtime
= ceph_clock_now();
11854 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
11855 missing_loc
.get_needs_recovery().begin();
11856 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
11857 missing_loc
.get_needs_recovery().end();
11859 ObcLockManager manager
;
11860 eversion_t v
= get_next_version();
11861 v
.epoch
= get_osdmap_epoch();
11862 uint64_t num_unfound
= missing_loc
.num_unfound();
11863 while (m
!= mend
) {
11864 const hobject_t
&oid(m
->first
);
11865 if (!missing_loc
.is_unfound(oid
)) {
11866 // We only care about unfound objects
11871 ObjectContextRef obc
;
11875 case pg_log_entry_t::LOST_MARK
:
11876 ceph_abort_msg("actually, not implemented yet!");
11879 case pg_log_entry_t::LOST_REVERT
:
11880 prev
= pick_newest_available(oid
);
11881 if (prev
> eversion_t()) {
11884 pg_log_entry_t::LOST_REVERT
, oid
, v
,
11885 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
11886 e
.reverting_to
= prev
;
11887 e
.mark_unrollbackable();
11888 log_entries
.push_back(e
);
11889 dout(10) << e
<< dendl
;
11891 // we are now missing the new version; recovery code will sort it out.
11897 case pg_log_entry_t::LOST_DELETE
:
11899 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
11900 0, osd_reqid_t(), mtime
, 0);
11901 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
11902 if (pool
.info
.require_rollback()) {
11903 e
.mod_desc
.try_rmobject(v
.version
);
11905 e
.mark_unrollbackable();
11907 } // otherwise, just do what we used to do
11908 dout(10) << e
<< dendl
;
11909 log_entries
.push_back(e
);
11910 oids
.push_back(oid
);
11912 // If context found mark object as deleted in case
11913 // of racing with new creation. This can happen if
11914 // object lost and EIO at primary.
11915 obc
= object_contexts
.lookup(oid
);
11917 obc
->obs
.exists
= false;
11929 info
.stats
.stats_invalid
= true;
11931 submit_log_entries(
11933 std::move(manager
),
11934 boost::optional
<std::function
<void(void)> >(
11935 [this, oids
, con
, num_unfound
, tid
]() {
11936 if (perform_deletes_during_peering()) {
11937 for (auto oid
: oids
) {
11938 // clear old locations - merge_new_log_entries will have
11939 // handled rebuilding missing_loc for each of these
11940 // objects if we have the RECOVERY_DELETES flag
11941 missing_loc
.recovered(oid
);
11945 if (is_recovery_unfound()) {
11946 queue_peering_event(
11948 std::make_shared
<PGPeeringEvent
>(
11949 get_osdmap_epoch(),
11950 get_osdmap_epoch(),
11952 } else if (is_backfill_unfound()) {
11953 queue_peering_event(
11955 std::make_shared
<PGPeeringEvent
>(
11956 get_osdmap_epoch(),
11957 get_osdmap_epoch(),
11958 RequestBackfill())));
11964 ss
<< "pg has " << num_unfound
11965 << " objects unfound and apparently lost marking";
11966 string rs
= ss
.str();
11967 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
11968 osd
->clog
->info() << rs
;
11970 MCommandReply
*reply
= new MCommandReply(0, rs
);
11971 reply
->set_tid(tid
);
11972 con
->send_message(reply
);
11978 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
11980 ceph_assert(repop_queue
.empty());
11984 * pg status change notification
11987 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
11989 list
<OpRequestRef
> rq
;
11991 // apply all repops
11992 while (!repop_queue
.empty()) {
11993 RepGather
*repop
= repop_queue
.front();
11994 repop_queue
.pop_front();
11995 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
11996 repop
->rep_aborted
= true;
11997 repop
->on_committed
.clear();
11998 repop
->on_success
.clear();
12002 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
12003 rq
.push_back(repop
->op
);
12004 repop
->op
= OpRequestRef();
12007 // also requeue any dups, interleaved into position
12008 auto p
= waiting_for_ondisk
.find(repop
->v
);
12009 if (p
!= waiting_for_ondisk
.end()) {
12010 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
12011 for (auto& i
: p
->second
) {
12012 rq
.push_back(std::get
<0>(i
));
12014 waiting_for_ondisk
.erase(p
);
12018 remove_repop(repop
);
12021 ceph_assert(repop_queue
.empty());
12025 if (!waiting_for_ondisk
.empty()) {
12026 for (auto& i
: waiting_for_ondisk
) {
12027 for (auto& j
: i
.second
) {
12028 derr
<< __func__
<< ": op " << *(std::get
<0>(j
)->get_req())
12029 << " waiting on " << i
.first
<< dendl
;
12032 ceph_assert(waiting_for_ondisk
.empty());
12036 waiting_for_ondisk
.clear();
12039 void PrimaryLogPG::on_flushed()
12041 ceph_assert(flushes_in_progress
> 0);
12042 flushes_in_progress
--;
12043 if (flushes_in_progress
== 0) {
12044 requeue_ops(waiting_for_flush
);
12046 if (!is_peered() || !is_primary()) {
12047 pair
<hobject_t
, ObjectContextRef
> i
;
12048 while (object_contexts
.get_next(i
.first
, &i
)) {
12049 derr
<< __func__
<< ": object " << i
.first
<< " obc still alive" << dendl
;
12051 ceph_assert(object_contexts
.empty());
// Prepare this PG for removal: mark it as fully backfillable (so a later
// re-creation starts from scratch) and roll the log forward so rollback
// state is discarded. NOTE(review): garbled extraction — the tail of the
// function (around original lines 12067-12070) is missing here.
12055 void PrimaryLogPG::on_removal(ObjectStore::Transaction
*t
)
12057 dout(10) << __func__
<< dendl
;
12059 // adjust info to backfill
12060 info
.set_last_backfill(hobject_t());
12061 pg_log
.reset_backfill();
// Apply any outstanding rollback-able log entries forward via the handler.
12065 PGLogEntryHandler rollbacker
{this, t
};
12066 pg_log
.roll_forward(&rollbacker
);
// Abort every in-progress async read by closing its OpContext. The ops are
// not requeued here; close_op_ctx releases the context's resources.
// NOTE(review): garbled extraction — the dendl terminator of the dout and
// the closing braces are missing below.
12071 void PrimaryLogPG::clear_async_reads()
12073 dout(10) << __func__
<< dendl
;
12074 for(auto& i
: in_progress_async_reads
) {
12075 dout(10) << "clear ctx: "
12076 << "OpRequestRef " << i
.first
12077 << " OpContext " << i
.second
12079 close_op_ctx(i
.second
);
12083 void PrimaryLogPG::clear_cache()
12085 object_contexts
.clear();
12088 void PrimaryLogPG::on_shutdown()
12090 dout(10) << __func__
<< dendl
;
12092 // handles queue races
12095 if (recovery_queued
) {
12096 recovery_queued
= false;
12097 osd
->clear_queued_recovery(this);
12100 clear_scrub_reserved();
12101 scrub_clear_state();
12103 unreg_next_scrub();
12105 vector
<ceph_tid_t
> tids
;
12106 cancel_copy_ops(false, &tids
);
12107 cancel_flush_ops(false, &tids
);
12108 cancel_proxy_ops(false, &tids
);
12109 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12111 apply_and_flush_repops(false);
12112 cancel_log_updates();
12113 // we must remove PGRefs, so do this this prior to release_backoffs() callers
12115 // clean up snap trim references
12116 snap_trimmer_machine
.process_event(Reset());
12118 pgbackend
->on_change();
12120 context_registry_on_change();
12121 object_contexts
.clear();
12123 clear_async_reads();
12125 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
12126 osd
->local_reserver
.cancel_reservation(info
.pgid
);
12128 clear_primary_state();
12131 if (is_primary()) {
12132 osd
->clear_ready_to_merge(this);
12136 void PrimaryLogPG::on_activate()
12139 if (needs_recovery()) {
12140 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
12141 queue_peering_event(
12143 std::make_shared
<PGPeeringEvent
>(
12144 get_osdmap_epoch(),
12145 get_osdmap_epoch(),
12147 } else if (needs_backfill()) {
12148 dout(10) << "activate queueing backfill" << dendl
;
12149 queue_peering_event(
12151 std::make_shared
<PGPeeringEvent
>(
12152 get_osdmap_epoch(),
12153 get_osdmap_epoch(),
12154 RequestBackfill())));
12156 dout(10) << "activate all replicas clean, no recovery" << dendl
;
12157 eio_errors_to_process
= false;
12158 queue_peering_event(
12160 std::make_shared
<PGPeeringEvent
>(
12161 get_osdmap_epoch(),
12162 get_osdmap_epoch(),
12163 AllReplicasRecovered())));
12166 publish_stats_to_osd();
12168 if (!backfill_targets
.empty()) {
12169 last_backfill_started
= earliest_backfill();
12170 new_backfill
= true;
12171 ceph_assert(!last_backfill_started
.is_max());
12172 dout(5) << __func__
<< ": bft=" << backfill_targets
12173 << " from " << last_backfill_started
<< dendl
;
12174 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12175 i
!= backfill_targets
.end();
12177 dout(5) << "target shard " << *i
12178 << " from " << peer_info
[*i
].last_backfill
12187 void PrimaryLogPG::_on_new_interval()
12189 dout(20) << __func__
<< " checking missing set deletes flag. missing = " << pg_log
.get_missing() << dendl
;
12190 if (!pg_log
.get_missing().may_include_deletes
&&
12191 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
)) {
12192 pg_log
.rebuild_missing_set_with_deletes(osd
->store
, ch
, info
);
12194 ceph_assert(pg_log
.get_missing().may_include_deletes
== get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
));
12197 void PrimaryLogPG::on_change(ObjectStore::Transaction
*t
)
12199 dout(10) << __func__
<< dendl
;
12201 if (hit_set
&& hit_set
->insert_count() == 0) {
12202 dout(20) << " discarding empty hit_set" << dendl
;
12206 if (recovery_queued
) {
12207 recovery_queued
= false;
12208 osd
->clear_queued_recovery(this);
12211 // requeue everything in the reverse order they should be
12213 requeue_ops(waiting_for_peered
);
12214 requeue_ops(waiting_for_flush
);
12215 requeue_ops(waiting_for_active
);
12217 clear_scrub_reserved();
12219 vector
<ceph_tid_t
> tids
;
12220 cancel_copy_ops(is_primary(), &tids
);
12221 cancel_flush_ops(is_primary(), &tids
);
12222 cancel_proxy_ops(is_primary(), &tids
);
12223 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12225 // requeue object waiters
12226 for (auto& p
: waiting_for_unreadable_object
) {
12227 release_backoffs(p
.first
);
12229 if (is_primary()) {
12230 requeue_object_waiters(waiting_for_unreadable_object
);
12232 waiting_for_unreadable_object
.clear();
12234 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
12235 p
!= waiting_for_degraded_object
.end();
12236 waiting_for_degraded_object
.erase(p
++)) {
12237 release_backoffs(p
->first
);
12239 requeue_ops(p
->second
);
12242 finish_degraded_object(p
->first
);
12245 // requeues waiting_for_scrub
12246 scrub_clear_state();
12248 for (auto p
= waiting_for_blocked_object
.begin();
12249 p
!= waiting_for_blocked_object
.end();
12250 waiting_for_blocked_object
.erase(p
++)) {
12252 requeue_ops(p
->second
);
12256 for (auto i
= callbacks_for_degraded_object
.begin();
12257 i
!= callbacks_for_degraded_object
.end();
12259 finish_degraded_object((i
++)->first
);
12261 ceph_assert(callbacks_for_degraded_object
.empty());
12263 if (is_primary()) {
12264 requeue_ops(waiting_for_cache_not_full
);
12266 waiting_for_cache_not_full
.clear();
12268 objects_blocked_on_cache_full
.clear();
12270 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
12271 in_progress_async_reads
.begin();
12272 i
!= in_progress_async_reads
.end();
12273 in_progress_async_reads
.erase(i
++)) {
12274 close_op_ctx(i
->second
);
12276 requeue_op(i
->first
);
12279 // this will requeue ops we were working on but didn't finish, and
12281 apply_and_flush_repops(is_primary());
12282 cancel_log_updates();
12284 // do this *after* apply_and_flush_repops so that we catch any newly
12285 // registered watches.
12286 context_registry_on_change();
12288 pgbackend
->on_change_cleanup(t
);
12289 scrubber
.cleanup_store(t
);
12290 pgbackend
->on_change();
12292 // clear snap_trimmer state
12293 snap_trimmer_machine
.process_event(Reset());
12295 debug_op_order
.clear();
12296 unstable_stats
.clear();
12298 // we don't want to cache object_contexts through the interval change
12299 // NOTE: we actually assert that all currently live references are dead
12300 // by the time the flush for the next interval completes.
12301 object_contexts
.clear();
12303 // should have been cleared above by finishing all of the degraded objects
12304 ceph_assert(objects_blocked_on_degraded_snap
.empty());
// Role-change hook: a PG that is no longer primary (role != 0) must not
// keep a hit set. NOTE(review): garbled extraction — the statement that
// actually clears the hit set and the closing braces are missing below.
12307 void PrimaryLogPG::on_role_change()
12309 dout(10) << __func__
<< dendl
;
12310 if (get_role() != 0 && hit_set
) {
12311 dout(10) << " clearing hit set" << dendl
;
// Pool-settings-change hook: if the cache mode moved away from writeback,
// ops that were parked waiting for the cache to drain below full must be
// requeued (they no longer need to wait). NOTE(review): garbled extraction
// — the head of the condition (original lines 12322-12323) and the
// function tail are missing below.
12316 void PrimaryLogPG::on_pool_change()
12318 dout(10) << __func__
<< dendl
;
12319 // requeue cache full waiters just in case the cache_mode is
12320 // changing away from writeback mode. note that if we are not
12321 // active the normal requeuing machinery is sufficient (and properly
12324 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
12325 !waiting_for_cache_not_full
.empty()) {
12326 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
12328 requeue_ops(waiting_for_cache_not_full
);
12329 objects_blocked_on_cache_full
.clear();
12335 // clear state. called on recovery completion AND cancellation.
12336 void PrimaryLogPG::_clear_recovery_state()
12338 missing_loc
.clear();
12339 #ifdef DEBUG_RECOVERY_OIDS
12340 recovering_oids
.clear();
12342 last_backfill_started
= hobject_t();
12343 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
12344 while (i
!= backfills_in_flight
.end()) {
12345 ceph_assert(recovering
.count(*i
));
12346 backfills_in_flight
.erase(i
++);
12349 list
<OpRequestRef
> blocked_ops
;
12350 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
12351 i
!= recovering
.end();
12352 recovering
.erase(i
++)) {
12354 i
->second
->drop_recovery_read(&blocked_ops
);
12355 requeue_ops(blocked_ops
);
12358 ceph_assert(backfills_in_flight
.empty());
12359 pending_backfill_updates
.clear();
12360 ceph_assert(recovering
.empty());
12361 pgbackend
->clear_recovery_state();
// Cancel an in-flight pull of `soid`: release its recovery read lock,
// requeue everything that was blocked on it (recovery read, degraded and
// unreadable waiters), drop the recovery bookkeeping, and reset
// last_requested so recover_primary re-examines the object.
// NOTE(review): garbled extraction — opening/closing braces are missing;
// statements below are fragments.
12364 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
12366 dout(20) << __func__
<< ": " << soid
<< dendl
;
12367 ceph_assert(recovering
.count(soid
));
12368 ObjectContextRef obc
= recovering
[soid
];
// Unblock ops parked behind the recovery read lock.
12370 list
<OpRequestRef
> blocked_ops
;
12371 obc
->drop_recovery_read(&blocked_ops
);
12372 requeue_ops(blocked_ops
);
12374 recovering
.erase(soid
);
12375 finish_recovery_op(soid
);
12376 release_backoffs(soid
);
// Ops waiting because the object was degraded can retry now.
12377 if (waiting_for_degraded_object
.count(soid
)) {
12378 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
12379 requeue_ops(waiting_for_degraded_object
[soid
]);
12380 waiting_for_degraded_object
.erase(soid
);
// Likewise for ops waiting on an unreadable object.
12382 if (waiting_for_unreadable_object
.count(soid
)) {
12383 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
12384 requeue_ops(waiting_for_unreadable_object
[soid
]);
12385 waiting_for_unreadable_object
.erase(soid
);
12387 if (is_missing_object(soid
))
12388 pg_log
.set_last_requested(0); // get recover_primary to start over
12389 finish_degraded_object(soid
);
12392 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
12395 * check that any peers we are planning to (or currently) pulling
12396 * objects from are dealt with.
12398 missing_loc
.check_recovery_sources(osdmap
);
12399 pgbackend
->check_recovery_sources(osdmap
);
12401 for (set
<pg_shard_t
>::iterator i
= peer_log_requested
.begin();
12402 i
!= peer_log_requested
.end();
12404 if (!osdmap
->is_up(i
->osd
)) {
12405 dout(10) << "peer_log_requested removing " << *i
<< dendl
;
12406 peer_log_requested
.erase(i
++);
12412 for (set
<pg_shard_t
>::iterator i
= peer_missing_requested
.begin();
12413 i
!= peer_missing_requested
.end();
12415 if (!osdmap
->is_up(i
->osd
)) {
12416 dout(10) << "peer_missing_requested removing " << *i
<< dendl
;
12417 peer_missing_requested
.erase(i
++);
12424 bool PrimaryLogPG::start_recovery_ops(
12426 ThreadPool::TPHandle
&handle
,
12427 uint64_t *ops_started
)
12429 uint64_t& started
= *ops_started
;
12431 bool work_in_progress
= false;
12432 bool recovery_started
= false;
12433 ceph_assert(is_primary());
12434 ceph_assert(is_peered());
12435 ceph_assert(!is_deleting());
12437 ceph_assert(recovery_queued
);
12438 recovery_queued
= false;
12440 if (!state_test(PG_STATE_RECOVERING
) &&
12441 !state_test(PG_STATE_BACKFILLING
)) {
12442 /* TODO: I think this case is broken and will make do_recovery()
12443 * unhappy since we're returning false */
12444 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
12445 return have_unfound();
12448 const auto &missing
= pg_log
.get_missing();
12450 unsigned int num_missing
= missing
.num_missing();
12451 uint64_t num_unfound
= get_num_unfound();
12453 if (num_missing
== 0) {
12454 info
.last_complete
= info
.last_update
;
12457 if (num_missing
== num_unfound
) {
12458 // All of the missing objects we have are unfound.
12459 // Recover the replicas.
12460 started
= recover_replicas(max
, handle
, &recovery_started
);
12463 // We still have missing objects that we should grab from replicas.
12464 started
+= recover_primary(max
, handle
);
12466 if (!started
&& num_unfound
!= get_num_unfound()) {
12467 // second chance to recovery replicas
12468 started
= recover_replicas(max
, handle
, &recovery_started
);
12471 if (started
|| recovery_started
)
12472 work_in_progress
= true;
12474 bool deferred_backfill
= false;
12475 if (recovering
.empty() &&
12476 state_test(PG_STATE_BACKFILLING
) &&
12477 !backfill_targets
.empty() && started
< max
&&
12478 missing
.num_missing() == 0 &&
12479 waiting_on_backfill
.empty()) {
12480 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
12481 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
12482 deferred_backfill
= true;
12483 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
12485 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
12486 deferred_backfill
= true;
12487 } else if (!backfill_reserved
) {
12488 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
12489 if (!backfill_reserving
) {
12490 dout(10) << "queueing RequestBackfill" << dendl
;
12491 backfill_reserving
= true;
12492 queue_peering_event(
12494 std::make_shared
<PGPeeringEvent
>(
12495 get_osdmap_epoch(),
12496 get_osdmap_epoch(),
12497 RequestBackfill())));
12499 deferred_backfill
= true;
12501 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
12505 dout(10) << " started " << started
<< dendl
;
12506 osd
->logger
->inc(l_osd_rop
, started
);
12508 if (!recovering
.empty() ||
12509 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
12510 return !work_in_progress
&& have_unfound();
12512 ceph_assert(recovering
.empty());
12513 ceph_assert(recovery_ops_active
== 0);
12515 dout(10) << __func__
<< " needs_recovery: "
12516 << missing_loc
.get_needs_recovery()
12518 dout(10) << __func__
<< " missing_loc: "
12519 << missing_loc
.get_missing_locs()
12521 int unfound
= get_num_unfound();
12523 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
12527 if (missing
.num_missing() > 0) {
12528 // this shouldn't happen!
12529 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
12530 << missing
.num_missing() << ": " << missing
.get_items();
12534 if (needs_recovery()) {
12535 // this shouldn't happen!
12536 // We already checked num_missing() so we must have missing replicas
12537 osd
->clog
->error() << info
.pgid
12538 << " Unexpected Error: recovery ending with missing replicas";
12542 if (state_test(PG_STATE_RECOVERING
)) {
12543 state_clear(PG_STATE_RECOVERING
);
12544 state_clear(PG_STATE_FORCED_RECOVERY
);
12545 if (needs_backfill()) {
12546 dout(10) << "recovery done, queuing backfill" << dendl
;
12547 queue_peering_event(
12549 std::make_shared
<PGPeeringEvent
>(
12550 get_osdmap_epoch(),
12551 get_osdmap_epoch(),
12552 RequestBackfill())));
12554 dout(10) << "recovery done, no backfill" << dendl
;
12555 eio_errors_to_process
= false;
12556 state_clear(PG_STATE_FORCED_BACKFILL
);
12557 queue_peering_event(
12559 std::make_shared
<PGPeeringEvent
>(
12560 get_osdmap_epoch(),
12561 get_osdmap_epoch(),
12562 AllReplicasRecovered())));
12564 } else { // backfilling
12565 state_clear(PG_STATE_BACKFILLING
);
12566 state_clear(PG_STATE_FORCED_BACKFILL
);
12567 state_clear(PG_STATE_FORCED_RECOVERY
);
12568 dout(10) << "recovery done, backfill done" << dendl
;
12569 eio_errors_to_process
= false;
12570 queue_peering_event(
12572 std::make_shared
<PGPeeringEvent
>(
12573 get_osdmap_epoch(),
12574 get_osdmap_epoch(),
12582 * do one recovery op.
12583 * return true if done, false if nothing left to do.
12585 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
12587 ceph_assert(is_primary());
12589 const auto &missing
= pg_log
.get_missing();
12591 dout(10) << __func__
<< " recovering " << recovering
.size()
12593 << " missing " << missing
<< dendl
;
12595 dout(25) << __func__
<< " " << missing
.get_items() << dendl
;
12598 pg_log_entry_t
*latest
= 0;
12599 unsigned started
= 0;
12602 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
12603 map
<version_t
, hobject_t
>::const_iterator p
=
12604 missing
.get_rmissing().lower_bound(pg_log
.get_log().last_requested
);
12605 while (p
!= missing
.get_rmissing().end()) {
12606 handle
.reset_tp_timeout();
12608 version_t v
= p
->first
;
12610 auto it_objects
= pg_log
.get_log().objects
.find(p
->second
);
12611 if (it_objects
!= pg_log
.get_log().objects
.end()) {
12612 latest
= it_objects
->second
;
12613 ceph_assert(latest
->is_update() || latest
->is_delete());
12614 soid
= latest
->soid
;
12619 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
12622 hobject_t head
= soid
.get_head();
12624 eversion_t need
= item
.need
;
12626 dout(10) << __func__
<< " "
12627 << soid
<< " " << item
.need
12628 << (missing
.is_missing(soid
) ? " (missing)":"")
12629 << (missing
.is_missing(head
) ? " (missing head)":"")
12630 << (recovering
.count(soid
) ? " (recovering)":"")
12631 << (recovering
.count(head
) ? " (recovering head)":"")
12635 switch (latest
->op
) {
12636 case pg_log_entry_t::CLONE
:
12638 * Handling for this special case removed for now, until we
12639 * can correctly construct an accurate SnapSet from the old
12644 case pg_log_entry_t::LOST_REVERT
:
12646 if (item
.have
== latest
->reverting_to
) {
12647 ObjectContextRef obc
= get_object_context(soid
, true);
12649 if (obc
->obs
.oi
.version
== latest
->version
) {
12650 // I'm already reverting
12651 dout(10) << " already reverting " << soid
<< dendl
;
12653 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
12654 obc
->obs
.oi
.version
= latest
->version
;
12656 ObjectStore::Transaction t
;
12658 obc
->obs
.oi
.encode(
12660 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
12661 ceph_assert(!pool
.info
.require_rollback());
12662 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
12664 recover_got(soid
, latest
->version
);
12665 missing_loc
.add_location(soid
, pg_whoami
);
12669 t
.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
12670 t
.register_on_commit(new C_OSD_CommittedPushedObject(
12672 get_osdmap_epoch(),
12673 info
.last_complete
));
12674 osd
->store
->queue_transaction(ch
, std::move(t
));
12679 * Pull the old version of the object. Update missing_loc here to have the location
12680 * of the version we want.
12682 * This doesn't use the usual missing_loc paths, but that's okay:
12683 * - if we have it locally, we hit the case above, and go from there.
12684 * - if we don't, we always pass through this case during recovery and set up the location
12686 * - this way we don't need to mangle the missing code to be general about needing an old
12689 eversion_t alternate_need
= latest
->reverting_to
;
12690 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
12692 for (map
<pg_shard_t
, pg_missing_t
>::iterator p
= peer_missing
.begin();
12693 p
!= peer_missing
.end();
12695 if (p
->second
.is_missing(soid
, need
) &&
12696 p
->second
.get_items().at(soid
).have
== alternate_need
) {
12697 missing_loc
.add_location(soid
, p
->first
);
12699 dout(10) << " will pull " << alternate_need
<< " or " << need
12700 << " from one of " << missing_loc
.get_locations(soid
)
12708 if (!recovering
.count(soid
)) {
12709 if (recovering
.count(head
)) {
12712 int r
= recover_missing(
12713 soid
, need
, get_recovery_op_priority(), h
);
12726 if (started
>= max
)
12731 // only advance last_requested if we haven't skipped anything
12733 pg_log
.set_last_requested(v
);
12736 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
12740 bool PrimaryLogPG::primary_error(
12741 const hobject_t
& soid
, eversion_t v
)
12743 pg_log
.missing_add(soid
, v
, eversion_t());
12744 pg_log
.set_last_requested(0);
12745 missing_loc
.remove_location(soid
, pg_whoami
);
12747 ceph_assert(!acting_recovery_backfill
.empty());
12748 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
12749 i
!= acting_recovery_backfill
.end();
12751 if (*i
== get_primary()) continue;
12752 pg_shard_t peer
= *i
;
12753 if (!peer_missing
[peer
].is_missing(soid
, v
)) {
12754 missing_loc
.add_location(soid
, peer
);
12755 dout(10) << info
.pgid
<< " unexpectedly missing " << soid
<< " v" << v
12756 << ", there should be a copy on shard " << peer
<< dendl
;
12761 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
<< ", unfound";
12763 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
12764 << ", will try copies on " << missing_loc
.get_locations(soid
);
12768 int PrimaryLogPG::prep_object_replica_deletes(
12769 const hobject_t
& soid
, eversion_t v
,
12770 PGBackend::RecoveryHandle
*h
,
12771 bool *work_started
)
12773 ceph_assert(is_primary());
12774 dout(10) << __func__
<< ": on " << soid
<< dendl
;
12776 ObjectContextRef obc
= get_object_context(soid
, false);
12778 if (!obc
->get_recovery_read()) {
12779 dout(20) << "replica delete delayed on " << soid
12780 << "; could not get rw_manager lock" << dendl
;
12781 *work_started
= true;
12784 dout(20) << "replica delete got recovery read lock on " << soid
12789 start_recovery_op(soid
);
12790 ceph_assert(!recovering
.count(soid
));
12792 recovering
.insert(make_pair(soid
, ObjectContextRef()));
12794 recovering
.insert(make_pair(soid
, obc
));
12796 pgbackend
->recover_delete_object(soid
, v
, h
);
12800 int PrimaryLogPG::prep_object_replica_pushes(
12801 const hobject_t
& soid
, eversion_t v
,
12802 PGBackend::RecoveryHandle
*h
,
12803 bool *work_started
)
12805 ceph_assert(is_primary());
12806 dout(10) << __func__
<< ": on " << soid
<< dendl
;
12808 // NOTE: we know we will get a valid oloc off of disk here.
12809 ObjectContextRef obc
= get_object_context(soid
, false);
12811 primary_error(soid
, v
);
12815 if (!obc
->get_recovery_read()) {
12816 dout(20) << "recovery delayed on " << soid
12817 << "; could not get rw_manager lock" << dendl
;
12818 *work_started
= true;
12821 dout(20) << "recovery got recovery read lock on " << soid
12825 start_recovery_op(soid
);
12826 ceph_assert(!recovering
.count(soid
));
12827 recovering
.insert(make_pair(soid
, obc
));
12829 /* We need this in case there is an in progress write on the object. In fact,
12830 * the only possible write is an update to the xattr due to a lost_revert --
12831 * a client write would be blocked since the object is degraded.
12832 * In almost all cases, therefore, this lock should be uncontended.
12834 int r
= pgbackend
->recover_object(
12837 ObjectContextRef(),
12838 obc
, // has snapset context
12841 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
12842 primary_failed(soid
);
12843 primary_error(soid
, v
);
12849 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
,
12850 bool *work_started
)
12852 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
12853 uint64_t started
= 0;
12855 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
12857 // this is FAR from an optimal recovery order. pretty lame, really.
12858 ceph_assert(!acting_recovery_backfill
.empty());
12859 // choose replicas to recover, replica has the shortest missing list first
12860 // so we can bring it back to normal ASAP
12861 std::vector
<std::pair
<unsigned int, pg_shard_t
>> replicas_by_num_missing
,
12862 async_by_num_missing
;
12863 replicas_by_num_missing
.reserve(acting_recovery_backfill
.size() - 1);
12864 for (auto &p
: acting_recovery_backfill
) {
12865 if (p
== get_primary()) {
12868 auto pm
= peer_missing
.find(p
);
12869 ceph_assert(pm
!= peer_missing
.end());
12870 auto nm
= pm
->second
.num_missing();
12872 if (async_recovery_targets
.count(p
)) {
12873 async_by_num_missing
.push_back(make_pair(nm
, p
));
12875 replicas_by_num_missing
.push_back(make_pair(nm
, p
));
12879 // sort by number of missing objects, in ascending order.
12880 auto func
= [](const std::pair
<unsigned int, pg_shard_t
> &lhs
,
12881 const std::pair
<unsigned int, pg_shard_t
> &rhs
) {
12882 return lhs
.first
< rhs
.first
;
12884 // acting goes first
12885 std::sort(replicas_by_num_missing
.begin(), replicas_by_num_missing
.end(), func
);
12886 // then async_recovery_targets
12887 std::sort(async_by_num_missing
.begin(), async_by_num_missing
.end(), func
);
12888 replicas_by_num_missing
.insert(replicas_by_num_missing
.end(),
12889 async_by_num_missing
.begin(), async_by_num_missing
.end());
12890 for (auto &replica
: replicas_by_num_missing
) {
12891 pg_shard_t
&peer
= replica
.second
;
12892 ceph_assert(peer
!= get_primary());
12893 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
12894 ceph_assert(pm
!= peer_missing
.end());
12895 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
12896 ceph_assert(pi
!= peer_info
.end());
12897 size_t m_sz
= pm
->second
.num_missing();
12899 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
12900 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
12903 const pg_missing_t
&m(pm
->second
);
12904 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
12905 p
!= m
.get_rmissing().end() && started
< max
;
12907 handle
.reset_tp_timeout();
12908 const hobject_t
soid(p
->second
);
12910 if (missing_loc
.is_unfound(soid
)) {
12911 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
12915 if (soid
> pi
->second
.last_backfill
) {
12916 if (!recovering
.count(soid
)) {
12917 derr
<< __func__
<< ": object " << soid
<< " last_backfill " << pi
->second
.last_backfill
<< dendl
;
12918 derr
<< __func__
<< ": object added to missing set for backfill, but "
12919 << "is not in recovering, error!" << dendl
;
12925 if (recovering
.count(soid
)) {
12926 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
12930 if (missing_loc
.is_deleted(soid
)) {
12931 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
12932 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
12933 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
, work_started
);
12937 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_head())) {
12938 dout(10) << __func__
<< ": " << soid
.get_head()
12939 << " still missing on primary" << dendl
;
12943 if (pg_log
.get_missing().is_missing(soid
)) {
12944 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
12948 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
12949 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
12950 started
+= prep_object_replica_pushes(soid
, r
->second
.need
, h
, work_started
);
12954 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
12958 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
12960 hobject_t e
= hobject_t::get_max();
12961 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
12962 i
!= backfill_targets
.end();
12964 pg_shard_t peer
= *i
;
12965 map
<pg_shard_t
, BackfillInterval
>::const_iterator iter
=
12966 peer_backfill_info
.find(peer
);
12967 ceph_assert(iter
!= peer_backfill_info
.end());
12968 if (iter
->second
.begin
< e
)
12969 e
= iter
->second
.begin
;
12974 bool PrimaryLogPG::all_peer_done() const
12976 // Primary hasn't got any more objects
12977 ceph_assert(backfill_info
.empty());
12979 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
12980 i
!= backfill_targets
.end();
12982 pg_shard_t bt
= *i
;
12983 map
<pg_shard_t
, BackfillInterval
>::const_iterator piter
=
12984 peer_backfill_info
.find(bt
);
12985 ceph_assert(piter
!= peer_backfill_info
.end());
12986 const BackfillInterval
& pbi
= piter
->second
;
12987 // See if peer has more to process
12988 if (!pbi
.extends_to_end() || !pbi
.empty())
12999 * backfilled: fully pushed to replica or present in replica's missing set (both
13000 * our copy and theirs).
13002 * All objects on a backfill_target in
13003 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13004 * objects have been actually deleted and all logically-valid objects are replicated.
13005 * There may be PG objects in this interval yet to be backfilled.
13007 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13008 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13010 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13011 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13012 * interval remain on the backfill target.
13014 * For a backfill target, all objects <= peer_info[target].last_backfill
13015 * have been backfilled to target
13017 * There *MAY* be missing/outdated objects between last_backfill_started and
13018 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13019 * io created objects since the last scan. For this reason, we call
13020 * update_range() again before continuing backfill.
13022 uint64_t PrimaryLogPG::recover_backfill(
13024 ThreadPool::TPHandle
&handle
, bool *work_started
)
13026 dout(10) << __func__
<< " (" << max
<< ")"
13027 << " bft=" << backfill_targets
13028 << " last_backfill_started " << last_backfill_started
13029 << (new_backfill
? " new_backfill":"")
13031 ceph_assert(!backfill_targets
.empty());
13033 // Initialize from prior backfill state
13034 if (new_backfill
) {
13035 // on_activate() was called prior to getting here
13036 ceph_assert(last_backfill_started
== earliest_backfill());
13037 new_backfill
= false;
13039 // initialize BackfillIntervals
13040 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13041 i
!= backfill_targets
.end();
13043 peer_backfill_info
[*i
].reset(peer_info
[*i
].last_backfill
);
13045 backfill_info
.reset(last_backfill_started
);
13047 backfills_in_flight
.clear();
13048 pending_backfill_updates
.clear();
13051 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13052 i
!= backfill_targets
.end();
13054 dout(10) << "peer osd." << *i
13055 << " info " << peer_info
[*i
]
13056 << " interval " << peer_backfill_info
[*i
].begin
13057 << "-" << peer_backfill_info
[*i
].end
13058 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
13062 // update our local interval to cope with recent changes
13063 backfill_info
.begin
= last_backfill_started
;
13064 update_range(&backfill_info
, handle
);
13067 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
13068 set
<hobject_t
> add_to_stat
;
13070 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13071 i
!= backfill_targets
.end();
13073 peer_backfill_info
[*i
].trim_to(
13074 std::max(peer_info
[*i
].last_backfill
, last_backfill_started
));
13076 backfill_info
.trim_to(last_backfill_started
);
13078 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13079 while (ops
< max
) {
13080 if (backfill_info
.begin
<= earliest_peer_backfill() &&
13081 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
13082 hobject_t next
= backfill_info
.end
;
13083 backfill_info
.reset(next
);
13084 backfill_info
.end
= hobject_t::get_max();
13085 update_range(&backfill_info
, handle
);
13086 backfill_info
.trim();
13089 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
13091 bool sent_scan
= false;
13092 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13093 i
!= backfill_targets
.end();
13095 pg_shard_t bt
= *i
;
13096 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13098 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
13099 if (pbi
.begin
<= backfill_info
.begin
&&
13100 !pbi
.extends_to_end() && pbi
.empty()) {
13101 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
13102 epoch_t e
= get_osdmap_epoch();
13103 MOSDPGScan
*m
= new MOSDPGScan(
13104 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, last_peering_reset
,
13105 spg_t(info
.pgid
.pgid
, bt
.shard
),
13106 pbi
.end
, hobject_t());
13107 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13108 ceph_assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
13109 waiting_on_backfill
.insert(bt
);
13114 // Count simultaneous scans as a single op and let those complete
13117 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13121 if (backfill_info
.empty() && all_peer_done()) {
13122 dout(10) << " reached end for both local and all peers" << dendl
;
13126 // Get object within set of peers to operate on and
13127 // the set of targets for which that object applies.
13128 hobject_t check
= earliest_peer_backfill();
13130 if (check
< backfill_info
.begin
) {
13132 set
<pg_shard_t
> check_targets
;
13133 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13134 i
!= backfill_targets
.end();
13136 pg_shard_t bt
= *i
;
13137 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13138 if (pbi
.begin
== check
)
13139 check_targets
.insert(bt
);
13141 ceph_assert(!check_targets
.empty());
13143 dout(20) << " BACKFILL removing " << check
13144 << " from peers " << check_targets
<< dendl
;
13145 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
13146 i
!= check_targets
.end();
13148 pg_shard_t bt
= *i
;
13149 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13150 ceph_assert(pbi
.begin
== check
);
13152 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
13156 last_backfill_started
= check
;
13158 // Don't increment ops here because deletions
13159 // are cheap and not replied to unlike real recovery_ops,
13160 // and we can't increment ops without requeueing ourself
13163 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
13165 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
13166 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13167 i
!= backfill_targets
.end();
13169 pg_shard_t bt
= *i
;
13170 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13171 // Find all check peers that have the wrong version
13172 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
13173 if (pbi
.objects
.begin()->second
!= obj_v
) {
13174 need_ver_targs
.push_back(bt
);
13176 keep_ver_targs
.push_back(bt
);
13179 pg_info_t
& pinfo
= peer_info
[bt
];
13181 // Only include peers that we've caught up to their backfill line
13182 // otherwise, they only appear to be missing this object
13183 // because their pbi.begin > backfill_info.begin.
13184 if (backfill_info
.begin
> pinfo
.last_backfill
)
13185 missing_targs
.push_back(bt
);
13187 skip_targs
.push_back(bt
);
13191 if (!keep_ver_targs
.empty()) {
13192 // These peers have version obj_v
13193 dout(20) << " BACKFILL keeping " << check
13194 << " with ver " << obj_v
13195 << " on peers " << keep_ver_targs
<< dendl
;
13196 //assert(!waiting_for_degraded_object.count(check));
13198 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
13199 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
13201 if (obc
->get_recovery_read()) {
13202 if (!need_ver_targs
.empty()) {
13203 dout(20) << " BACKFILL replacing " << check
13204 << " with ver " << obj_v
13205 << " to peers " << need_ver_targs
<< dendl
;
13207 if (!missing_targs
.empty()) {
13208 dout(20) << " BACKFILL pushing " << backfill_info
.begin
13209 << " with ver " << obj_v
13210 << " to peers " << missing_targs
<< dendl
;
13212 vector
<pg_shard_t
> all_push
= need_ver_targs
;
13213 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
13215 handle
.reset_tp_timeout();
13216 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
13218 *work_started
= true;
13219 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
13224 *work_started
= true;
13225 dout(20) << "backfill blocking on " << backfill_info
.begin
13226 << "; could not get rw_manager lock" << dendl
;
13230 dout(20) << "need_ver_targs=" << need_ver_targs
13231 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
13232 dout(20) << "backfill_targets=" << backfill_targets
13233 << " missing_targs=" << missing_targs
13234 << " skip_targs=" << skip_targs
<< dendl
;
13236 last_backfill_started
= backfill_info
.begin
;
13237 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
13238 backfill_info
.pop_front();
13239 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
13240 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
13241 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
13242 i
!= check_targets
.end();
13244 pg_shard_t bt
= *i
;
13245 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13251 hobject_t backfill_pos
=
13252 std::min(backfill_info
.begin
, earliest_peer_backfill());
13254 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
13255 i
!= add_to_stat
.end();
13257 ObjectContextRef obc
= get_object_context(*i
, false);
13260 add_object_context_to_pg_stat(obc
, &stat
);
13261 pending_backfill_updates
[*i
] = stat
;
13263 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
13264 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
13265 handle
.reset_tp_timeout();
13266 const hobject_t
& oid
= to_remove
[i
].get
<0>();
13267 eversion_t v
= to_remove
[i
].get
<1>();
13268 pg_shard_t peer
= to_remove
[i
].get
<2>();
13269 MOSDPGBackfillRemove
*m
;
13270 auto it
= reqs
.find(peer
);
13271 if (it
!= reqs
.end()) {
13274 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
13275 spg_t(info
.pgid
.pgid
, peer
.shard
),
13276 get_osdmap_epoch());
13278 m
->ls
.push_back(make_pair(oid
, v
));
13280 if (oid
<= last_backfill_started
)
13281 pending_backfill_updates
[oid
]; // add empty stat!
13283 for (auto p
: reqs
) {
13284 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
13285 get_osdmap_epoch());
13288 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
13290 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
13291 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
13292 i
!= backfills_in_flight
.end();
13294 dout(20) << *i
<< " is still in flight" << dendl
;
13297 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
13298 backfill_pos
: *(backfills_in_flight
.begin());
13299 hobject_t new_last_backfill
= earliest_backfill();
13300 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
13301 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
13302 pending_backfill_updates
.begin();
13303 i
!= pending_backfill_updates
.end() &&
13304 i
->first
< next_backfill_to_complete
;
13305 pending_backfill_updates
.erase(i
++)) {
13306 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
13307 ceph_assert(i
->first
> new_last_backfill
);
13308 for (set
<pg_shard_t
>::iterator j
= backfill_targets
.begin();
13309 j
!= backfill_targets
.end();
13311 pg_shard_t bt
= *j
;
13312 pg_info_t
& pinfo
= peer_info
[bt
];
13313 //Add stats to all peers that were missing object
13314 if (i
->first
> pinfo
.last_backfill
)
13315 pinfo
.stats
.add(i
->second
);
13317 new_last_backfill
= i
->first
;
13319 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
13321 ceph_assert(!pending_backfill_updates
.empty() ||
13322 new_last_backfill
== last_backfill_started
);
13323 if (pending_backfill_updates
.empty() &&
13324 backfill_pos
.is_max()) {
13325 ceph_assert(backfills_in_flight
.empty());
13326 new_last_backfill
= backfill_pos
;
13327 last_backfill_started
= backfill_pos
;
13329 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
13331 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13332 // all the backfill targets. Otherwise, we will move last_backfill up on
13333 // those targets need it and send OP_BACKFILL_PROGRESS to them.
13334 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
13335 i
!= backfill_targets
.end();
13337 pg_shard_t bt
= *i
;
13338 pg_info_t
& pinfo
= peer_info
[bt
];
13340 if (new_last_backfill
> pinfo
.last_backfill
) {
13341 pinfo
.set_last_backfill(new_last_backfill
);
13342 epoch_t e
= get_osdmap_epoch();
13343 MOSDPGBackfill
*m
= NULL
;
13344 if (pinfo
.last_backfill
.is_max()) {
13345 m
= new MOSDPGBackfill(
13346 MOSDPGBackfill::OP_BACKFILL_FINISH
,
13348 last_peering_reset
,
13349 spg_t(info
.pgid
.pgid
, bt
.shard
));
13350 // Use default priority here, must match sub_op priority
13351 /* pinfo.stats might be wrong if we did log-based recovery on the
13352 * backfilled portion in addition to continuing backfill.
13354 pinfo
.stats
= info
.stats
;
13355 start_recovery_op(hobject_t::get_max());
13357 m
= new MOSDPGBackfill(
13358 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
13360 last_peering_reset
,
13361 spg_t(info
.pgid
.pgid
, bt
.shard
));
13362 // Use default priority here, must match sub_op priority
13364 m
->last_backfill
= pinfo
.last_backfill
;
13365 m
->stats
= pinfo
.stats
;
13366 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13367 dout(10) << " peer " << bt
13368 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
13369 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
13374 *work_started
= true;
13378 int PrimaryLogPG::prep_backfill_object_push(
13379 hobject_t oid
, eversion_t v
,
13380 ObjectContextRef obc
,
13381 vector
<pg_shard_t
> peers
,
13382 PGBackend::RecoveryHandle
*h
)
13384 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
13385 ceph_assert(!peers
.empty());
13387 backfills_in_flight
.insert(oid
);
13388 for (unsigned int i
= 0 ; i
< peers
.size(); ++i
) {
13389 map
<pg_shard_t
, pg_missing_t
>::iterator bpm
= peer_missing
.find(peers
[i
]);
13390 ceph_assert(bpm
!= peer_missing
.end());
13391 bpm
->second
.add(oid
, eversion_t(), eversion_t(), false);
13394 ceph_assert(!recovering
.count(oid
));
13396 start_recovery_op(oid
);
13397 recovering
.insert(make_pair(oid
, obc
));
13399 // We need to take the read_lock here in order to flush in-progress writes
13400 int r
= pgbackend
->recover_object(
13403 ObjectContextRef(),
13407 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
13408 primary_failed(oid
);
13409 primary_error(oid
, v
);
13410 backfills_in_flight
.erase(oid
);
13411 missing_loc
.add_missing(oid
, v
, eversion_t());
13416 void PrimaryLogPG::update_range(
13417 BackfillInterval
*bi
,
13418 ThreadPool::TPHandle
&handle
)
13420 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
13421 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
13423 if (bi
->version
< info
.log_tail
) {
13424 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
13426 bi
->version
= info
.last_update
;
13427 scan_range(local_min
, local_max
, bi
, handle
);
13430 if (bi
->version
>= projected_last_update
) {
13431 dout(10) << __func__
<< ": bi is current " << dendl
;
13432 ceph_assert(bi
->version
== projected_last_update
);
13433 } else if (bi
->version
>= info
.log_tail
) {
13434 if (pg_log
.get_log().empty() && projected_log
.empty()) {
13435 /* Because we don't move log_tail on split, the log might be
13436 * empty even if log_tail != last_update. However, the only
13437 * way to get here with an empty log is if log_tail is actually
13438 * eversion_t(), because otherwise the entry which changed
13439 * last_update since the last scan would have to be present.
13441 ceph_assert(bi
->version
== eversion_t());
13445 dout(10) << __func__
<< ": bi is old, (" << bi
->version
13446 << ") can be updated with log to projected_last_update "
13447 << projected_last_update
<< dendl
;
13449 auto func
= [&](const pg_log_entry_t
&e
) {
13450 dout(10) << __func__
<< ": updating from version " << e
.version
13452 const hobject_t
&soid
= e
.soid
;
13453 if (soid
>= bi
->begin
&&
13455 if (e
.is_update()) {
13456 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
13457 << e
.version
<< dendl
;
13458 bi
->objects
.erase(e
.soid
);
13459 bi
->objects
.insert(
13463 } else if (e
.is_delete()) {
13464 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
13465 bi
->objects
.erase(e
.soid
);
13469 dout(10) << "scanning pg log first" << dendl
;
13470 pg_log
.get_log().scan_log_after(bi
->version
, func
);
13471 dout(10) << "scanning projected log" << dendl
;
13472 projected_log
.scan_log_after(bi
->version
, func
);
13473 bi
->version
= projected_last_update
;
13475 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13479 void PrimaryLogPG::scan_range(
13480 int min
, int max
, BackfillInterval
*bi
,
13481 ThreadPool::TPHandle
&handle
)
13483 ceph_assert(is_locked());
13484 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
13485 bi
->clear_objects();
13487 vector
<hobject_t
> ls
;
13489 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
13490 ceph_assert(r
>= 0);
13491 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
13492 dout(20) << ls
<< dendl
;
13494 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
13495 handle
.reset_tp_timeout();
13496 ObjectContextRef obc
;
13498 obc
= object_contexts
.lookup(*p
);
13500 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
13501 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
13504 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
13506 /* If the object does not exist here, it must have been removed
13507 * between the collection_list_partial and here. This can happen
13508 * for the first item in the range, which is usually last_backfill.
13513 ceph_assert(r
>= 0);
13514 object_info_t
oi(bl
);
13515 bi
->objects
[*p
] = oi
.version
;
13516 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
13524 * verifies that stray objects have been deleted
13526 void PrimaryLogPG::check_local()
13528 dout(10) << __func__
<< dendl
;
13530 ceph_assert(info
.last_update
>= pg_log
.get_tail()); // otherwise we need some help!
13532 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
13535 // just scan the log.
13536 set
<hobject_t
> did
;
13537 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
13538 p
!= pg_log
.get_log().log
.rend();
13540 if (did
.count(p
->soid
))
13542 did
.insert(p
->soid
);
13544 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
13545 dout(10) << " checking " << p
->soid
13546 << " at " << p
->version
<< dendl
;
13548 int r
= osd
->store
->stat(
13550 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
13552 if (r
!= -ENOENT
) {
13553 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
13554 << "deleted" << dendl
;
13555 ceph_abort_msg("erroneously present object");
13558 // ignore old(+missing) objects
13565 // ===========================
13568 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
13571 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
13572 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
13573 info
.pgid
.ps(), info
.pgid
.pool(),
13574 cct
->_conf
->osd_hit_set_namespace
);
13575 dout(20) << __func__
<< " " << hoid
<< dendl
;
13579 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
13584 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
13586 start
.gmtime(ss
) << "_";
13589 start
.localtime(ss
) << "_";
13592 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
13593 info
.pgid
.ps(), info
.pgid
.pool(),
13594 cct
->_conf
->osd_hit_set_namespace
);
13595 dout(20) << __func__
<< " " << hoid
<< dendl
;
13599 void PrimaryLogPG::hit_set_clear()
13601 dout(20) << __func__
<< dendl
;
13603 hit_set_start_stamp
= utime_t();
13606 void PrimaryLogPG::hit_set_setup()
13608 if (!is_active() ||
13614 if (is_active() && is_primary() &&
13615 (!pool
.info
.hit_set_count
||
13616 !pool
.info
.hit_set_period
||
13617 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
13620 // only primary is allowed to remove all the hit set objects
13621 hit_set_remove_all();
13625 // FIXME: discard any previous data for now
13628 // include any writes we know about from the pg log. this doesn't
13629 // capture reads, but it is better than nothing!
13630 hit_set_apply_log();
13633 void PrimaryLogPG::hit_set_remove_all()
13635 // If any archives are degraded we skip this
13636 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
13637 p
!= info
.hit_set
.history
.end();
13639 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13641 // Once we hit a degraded object just skip
13642 if (is_degraded_or_backfilling_object(aoid
))
13644 if (write_blocked_by_scrub(aoid
))
13648 if (!info
.hit_set
.history
.empty()) {
13649 list
<pg_hit_set_info_t
>::reverse_iterator p
= info
.hit_set
.history
.rbegin();
13650 ceph_assert(p
!= info
.hit_set
.history
.rend());
13651 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13652 ceph_assert(!is_degraded_or_backfilling_object(oid
));
13653 ObjectContextRef obc
= get_object_context(oid
, false);
13656 OpContextUPtr ctx
= simple_opc_create(obc
);
13657 ctx
->at_version
= get_next_version();
13658 ctx
->updated_hset_history
= info
.hit_set
;
13659 utime_t now
= ceph_clock_now();
13661 hit_set_trim(ctx
, 0);
13662 simple_opc_submit(std::move(ctx
));
13665 info
.hit_set
= pg_hit_set_history_t();
13667 agent_state
->discard_hit_sets();
13671 void PrimaryLogPG::hit_set_create()
13673 utime_t now
= ceph_clock_now();
13674 // make a copy of the params to modify
13675 HitSet::Params
params(pool
.info
.hit_set_params
);
13677 dout(20) << __func__
<< " " << params
<< dendl
;
13678 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
13679 BloomHitSet::Params
*p
=
13680 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
13682 // convert false positive rate so it holds up across the full period
13683 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
13684 if (p
->get_fpp() <= 0.0)
13685 p
->set_fpp(.01); // fpp cannot be zero!
13687 // if we don't have specified size, estimate target size based on the
13689 if (p
->target_size
== 0 && hit_set
) {
13690 utime_t dur
= now
- hit_set_start_stamp
;
13691 unsigned unique
= hit_set
->approx_unique_insert_count();
13692 dout(20) << __func__
<< " previous set had approx " << unique
13693 << " unique items over " << dur
<< " seconds" << dendl
;
13694 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
13697 if (p
->target_size
<
13698 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
13699 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
13702 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
13703 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
13705 p
->seed
= now
.sec();
13707 dout(10) << __func__
<< " target_size " << p
->target_size
13708 << " fpp " << p
->get_fpp() << dendl
;
13710 hit_set
.reset(new HitSet(params
));
13711 hit_set_start_stamp
= now
;
13715 * apply log entries to set
13717 * this would only happen after peering, to at least capture writes
13718 * during an interval that was potentially lost.
13720 bool PrimaryLogPG::hit_set_apply_log()
13725 eversion_t to
= info
.last_update
;
13726 eversion_t from
= info
.hit_set
.current_last_update
;
13728 dout(20) << __func__
<< " no update" << dendl
;
13732 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
13733 list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
13734 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> to
)
13736 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> from
) {
13737 hit_set
->insert(p
->soid
);
13744 void PrimaryLogPG::hit_set_persist()
13746 dout(10) << __func__
<< dendl
;
13748 unsigned max
= pool
.info
.hit_set_count
;
13750 utime_t now
= ceph_clock_now();
13753 // If any archives are degraded we skip this persist request
13754 // account for the additional entry being added below
13755 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
13756 p
!= info
.hit_set
.history
.end();
13758 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13760 // Once we hit a degraded object just skip further trim
13761 if (is_degraded_or_backfilling_object(aoid
))
13763 if (write_blocked_by_scrub(aoid
))
13767 // If backfill is in progress and we could possibly overlap with the
13768 // hit_set_* objects, back off. Since these all have
13769 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13770 // look just at that. This is necessary because our transactions
13771 // may include a modify of the new hit_set *and* a delete of the
13772 // old one, and this may span the backfill boundary.
13773 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
13774 p
!= backfill_targets
.end();
13776 ceph_assert(peer_info
.count(*p
));
13777 const pg_info_t
& pi
= peer_info
[*p
];
13778 if (pi
.last_backfill
== hobject_t() ||
13779 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
13780 dout(10) << __func__
<< " backfill target osd." << *p
13781 << " last_backfill has not progressed past pgid ps"
13788 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
13789 new_hset
.begin
= hit_set_start_stamp
;
13790 new_hset
.end
= now
;
13791 oid
= get_hit_set_archive_object(
13794 new_hset
.using_gmt
);
13796 // If the current object is degraded we skip this persist request
13797 if (write_blocked_by_scrub(oid
))
13801 encode(*hit_set
, bl
);
13802 dout(20) << __func__
<< " archive " << oid
<< dendl
;
13805 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
13806 uint32_t size
= agent_state
->hit_set_map
.size();
13807 if (size
>= pool
.info
.hit_set_count
) {
13808 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
13810 hit_set_in_memory_trim(size
);
13813 ObjectContextRef obc
= get_object_context(oid
, true);
13814 OpContextUPtr ctx
= simple_opc_create(obc
);
13816 ctx
->at_version
= get_next_version();
13817 ctx
->updated_hset_history
= info
.hit_set
;
13818 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
13820 updated_hit_set_hist
.current_last_update
= info
.last_update
;
13821 new_hset
.version
= ctx
->at_version
;
13823 updated_hit_set_hist
.history
.push_back(new_hset
);
13826 // fabricate an object_info_t and SnapSet
13827 obc
->obs
.oi
.version
= ctx
->at_version
;
13828 obc
->obs
.oi
.mtime
= now
;
13829 obc
->obs
.oi
.size
= bl
.length();
13830 obc
->obs
.exists
= true;
13831 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
13833 ctx
->new_obs
= obc
->obs
;
13835 ctx
->new_snapset
= obc
->ssc
->snapset
;
13837 ctx
->delta_stats
.num_objects
++;
13838 ctx
->delta_stats
.num_objects_hit_set_archive
++;
13840 ctx
->delta_stats
.num_bytes
+= bl
.length();
13841 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
13844 encode(ctx
->new_snapset
, bss
);
13845 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
13846 encode(ctx
->new_obs
.oi
, boi
,
13847 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
13849 ctx
->op_t
->create(oid
);
13851 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
13853 map
<string
, bufferlist
> attrs
;
13854 attrs
[OI_ATTR
].claim(boi
);
13855 attrs
[SS_ATTR
].claim(bss
);
13856 setattrs_maybe_cache(ctx
->obc
, ctx
->op_t
.get(), attrs
);
13857 ctx
->log
.push_back(
13859 pg_log_entry_t::MODIFY
,
13869 hit_set_trim(ctx
, max
);
13871 simple_opc_submit(std::move(ctx
));
13874 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
13876 ceph_assert(ctx
->updated_hset_history
);
13877 pg_hit_set_history_t
&updated_hit_set_hist
=
13878 *(ctx
->updated_hset_history
);
13879 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
13880 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
13881 ceph_assert(p
!= updated_hit_set_hist
.history
.end());
13882 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13884 ceph_assert(!is_degraded_or_backfilling_object(oid
));
13886 dout(20) << __func__
<< " removing " << oid
<< dendl
;
13887 ++ctx
->at_version
.version
;
13888 ctx
->log
.push_back(
13889 pg_log_entry_t(pg_log_entry_t::DELETE
,
13898 ctx
->op_t
->remove(oid
);
13899 updated_hit_set_hist
.history
.pop_front();
13901 ObjectContextRef obc
= get_object_context(oid
, false);
13903 --ctx
->delta_stats
.num_objects
;
13904 --ctx
->delta_stats
.num_objects_hit_set_archive
;
13905 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
13906 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
13910 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
13912 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
13913 agent_state
->remove_oldest_hit_set();
13918 // =======================================
13921 void PrimaryLogPG::agent_setup()
13923 ceph_assert(is_locked());
13924 if (!is_active() ||
13926 state_test(PG_STATE_PREMERGE
) ||
13927 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
13928 pool
.info
.tier_of
< 0 ||
13929 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
13933 if (!agent_state
) {
13934 agent_state
.reset(new TierAgentState
);
13936 // choose random starting position
13937 agent_state
->position
= hobject_t();
13938 agent_state
->position
.pool
= info
.pgid
.pool();
13939 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
13942 agent_state
->start
= agent_state
->position
;
13944 dout(10) << __func__
<< " allocated new state, position "
13945 << agent_state
->position
<< dendl
;
13947 dout(10) << __func__
<< " keeping existing state" << dendl
;
13950 if (info
.stats
.stats_invalid
) {
13951 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
13954 agent_choose_mode();
13957 void PrimaryLogPG::agent_clear()
13960 agent_state
.reset(NULL
);
13963 // Return false if no objects operated on since start of object hash space
13964 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
13967 if (!agent_state
) {
13968 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
13973 ceph_assert(!deleting
);
13975 if (agent_state
->is_idle()) {
13976 dout(10) << __func__
<< " idle, stopping" << dendl
;
13981 osd
->logger
->inc(l_osd_agent_wake
);
13983 dout(10) << __func__
13984 << " max " << start_max
13985 << ", flush " << agent_state
->get_flush_mode_name()
13986 << ", evict " << agent_state
->get_evict_mode_name()
13987 << ", pos " << agent_state
->position
13989 ceph_assert(is_primary());
13990 ceph_assert(is_active());
13992 agent_load_hit_sets();
13994 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
13995 ceph_assert(base_pool
);
13998 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
14000 // list some objects. this conveniently lists clones (oldest to
14001 // newest) before heads... the same order we want to flush in.
14003 // NOTE: do not flush the Sequencer. we will assume that the
14004 // listing we get back is imprecise.
14005 vector
<hobject_t
> ls
;
14007 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
14009 ceph_assert(r
>= 0);
14010 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
14012 for (vector
<hobject_t
>::iterator p
= ls
.begin();
14015 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
14016 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
14017 osd
->logger
->inc(l_osd_agent_skip
);
14020 if (is_degraded_or_backfilling_object(*p
)) {
14021 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
14022 osd
->logger
->inc(l_osd_agent_skip
);
14025 if (is_missing_object(p
->get_head())) {
14026 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
14027 osd
->logger
->inc(l_osd_agent_skip
);
14030 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
14032 // we didn't flush; we may miss something here.
14033 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
14034 osd
->logger
->inc(l_osd_agent_skip
);
14037 if (!obc
->obs
.exists
) {
14038 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
14039 osd
->logger
->inc(l_osd_agent_skip
);
14042 if (range_intersects_scrub(obc
->obs
.oi
.soid
,
14043 obc
->obs
.oi
.soid
.get_head())) {
14044 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14045 osd
->logger
->inc(l_osd_agent_skip
);
14048 if (obc
->is_blocked()) {
14049 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14050 osd
->logger
->inc(l_osd_agent_skip
);
14053 if (obc
->is_request_pending()) {
14054 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
14055 osd
->logger
->inc(l_osd_agent_skip
);
14059 // be careful flushing omap to an EC pool.
14060 if (!base_pool
->supports_omap() &&
14061 obc
->obs
.oi
.is_omap()) {
14062 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
14063 osd
->logger
->inc(l_osd_agent_skip
);
14067 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
14068 agent_maybe_evict(obc
, false))
14070 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
14071 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
14073 --agent_flush_quota
;
14075 if (started
>= start_max
) {
14076 // If finishing early, set "next" to the next object
14077 if (++p
!= ls
.end())
14083 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
14084 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
14085 agent_state
->hist_age
= 0;
14086 agent_state
->temp_hist
.decay();
14089 // Total objects operated on so far
14090 int total_started
= agent_state
->started
+ started
;
14091 bool need_delay
= false;
14093 dout(20) << __func__
<< " start pos " << agent_state
->position
14094 << " next start pos " << next
14095 << " started " << total_started
<< dendl
;
14097 // See if we've made a full pass over the object hash space
14098 // This might check at most ls_max objects a second time to notice that
14099 // we've checked every objects at least once.
14100 if (agent_state
->position
< agent_state
->start
&&
14101 next
>= agent_state
->start
) {
14102 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
14103 if (total_started
== 0)
14107 agent_state
->start
= next
;
14109 agent_state
->started
= total_started
;
14111 // See if we are starting from beginning
14113 agent_state
->position
= hobject_t();
14115 agent_state
->position
= next
;
14117 // Discard old in memory HitSets
14118 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
14121 ceph_assert(agent_state
->delaying
== false);
14126 agent_choose_mode();
14131 void PrimaryLogPG::agent_load_hit_sets()
14133 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
14137 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
14138 dout(10) << __func__
<< dendl
;
14139 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
14140 p
!= info
.hit_set
.history
.end(); ++p
) {
14141 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
14142 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
14143 << p
->end
<< dendl
;
14144 if (!pool
.info
.is_replicated()) {
14145 // FIXME: EC not supported here yet
14146 derr
<< __func__
<< " on non-replicated pool" << dendl
;
14150 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14151 if (is_unreadable_object(oid
)) {
14152 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
14156 ObjectContextRef obc
= get_object_context(oid
, false);
14158 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
14164 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
14165 ceph_assert(r
>= 0);
14167 HitSetRef
hs(new HitSet
);
14168 bufferlist::const_iterator pbl
= bl
.begin();
14170 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
14176 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
14178 if (!obc
->obs
.oi
.is_dirty()) {
14179 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
14180 osd
->logger
->inc(l_osd_agent_skip
);
14183 if (obc
->obs
.oi
.is_cache_pinned()) {
14184 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14185 osd
->logger
->inc(l_osd_agent_skip
);
14189 utime_t now
= ceph_clock_now();
14190 utime_t ob_local_mtime
;
14191 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14192 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14194 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14196 bool evict_mode_full
=
14197 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
14198 if (!evict_mode_full
&&
14199 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
14200 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
14201 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14202 osd
->logger
->inc(l_osd_agent_skip
);
14206 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
14207 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
14208 osd
->logger
->inc(l_osd_agent_skip
);
14212 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
14214 // FIXME: flush anything dirty, regardless of what distribution of
14217 hobject_t oid
= obc
->obs
.oi
.soid
;
14218 osd
->agent_start_op(oid
);
14219 // no need to capture a pg ref, can't outlive fop or ctx
14220 std::function
<void()> on_flush
= [this, oid
]() {
14221 osd
->agent_finish_op(oid
);
14224 int result
= start_flush(
14225 OpRequestRef(), obc
, false, NULL
,
14227 if (result
!= -EINPROGRESS
) {
14229 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
14230 << " with " << result
<< dendl
;
14231 osd
->logger
->inc(l_osd_agent_skip
);
14235 osd
->logger
->inc(l_osd_agent_flush
);
14239 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
14241 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
14242 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
14243 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
14246 if (!obc
->obs
.oi
.watchers
.empty()) {
14247 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
14250 if (obc
->is_blocked()) {
14251 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14254 if (obc
->obs
.oi
.is_cache_pinned()) {
14255 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14259 if (soid
.snap
== CEPH_NOSNAP
) {
14260 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
14262 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
14267 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
14268 // is this object old than cache_min_evict_age?
14269 utime_t now
= ceph_clock_now();
14270 utime_t ob_local_mtime
;
14271 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14272 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14274 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14276 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
14277 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14278 osd
->logger
->inc(l_osd_agent_skip
);
14281 // is this object old and/or cold enough?
14283 uint64_t temp_upper
= 0, temp_lower
= 0;
14285 agent_estimate_temp(soid
, &temp
);
14286 agent_state
->temp_hist
.add(temp
);
14287 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
14289 dout(20) << __func__
14290 << " temp " << temp
14291 << " pos " << temp_lower
<< "-" << temp_upper
14292 << ", evict_effort " << agent_state
->evict_effort
14294 dout(30) << "agent_state:\n";
14295 Formatter
*f
= Formatter::create("");
14296 f
->open_object_section("agent_state");
14297 agent_state
->dump(f
);
14298 f
->close_section();
14303 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
14307 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
14308 OpContextUPtr ctx
= simple_opc_create(obc
);
14310 auto null_op_req
= OpRequestRef();
14311 if (!ctx
->lock_manager
.get_lock_type(
14312 ObjectContext::RWState::RWWRITE
,
14316 close_op_ctx(ctx
.release());
14317 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
14321 osd
->agent_start_evict_op();
14322 ctx
->register_on_finish(
14324 osd
->agent_finish_evict_op();
14327 ctx
->at_version
= get_next_version();
14328 ceph_assert(ctx
->new_obs
.exists
);
14329 int r
= _delete_oid(ctx
.get(), true, false);
14330 if (obc
->obs
.oi
.is_omap())
14331 ctx
->delta_stats
.num_objects_omap
--;
14332 ctx
->delta_stats
.num_evict
++;
14333 ctx
->delta_stats
.num_evict_kb
+= shift_round_up(obc
->obs
.oi
.size
, 10);
14334 if (obc
->obs
.oi
.is_dirty())
14335 --ctx
->delta_stats
.num_objects_dirty
;
14336 ceph_assert(r
== 0);
14337 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
);
14338 simple_opc_submit(std::move(ctx
));
14339 osd
->logger
->inc(l_osd_tier_evict
);
14340 osd
->logger
->inc(l_osd_agent_evict
);
14344 void PrimaryLogPG::agent_stop()
14346 dout(20) << __func__
<< dendl
;
14347 if (agent_state
&& !agent_state
->is_idle()) {
14348 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
14349 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
14350 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
14354 void PrimaryLogPG::agent_delay()
14356 dout(20) << __func__
<< dendl
;
14357 if (agent_state
&& !agent_state
->is_idle()) {
14358 ceph_assert(agent_state
->delaying
== false);
14359 agent_state
->delaying
= true;
14360 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
14364 void PrimaryLogPG::agent_choose_mode_restart()
14366 dout(20) << __func__
<< dendl
;
14368 if (agent_state
&& agent_state
->delaying
) {
14369 agent_state
->delaying
= false;
14370 agent_choose_mode(true);
14375 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
14377 bool requeued
= false;
14378 // Let delay play out
14379 if (agent_state
->delaying
) {
14380 dout(20) << __func__
<< " " << this << " delaying, ignored" << dendl
;
14384 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
14385 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
14386 unsigned evict_effort
= 0;
14388 if (info
.stats
.stats_invalid
) {
14389 // idle; stats can't be trusted until we scrub.
14390 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
14395 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
14396 ceph_assert(divisor
> 0);
14398 // adjust (effective) user objects down based on the number
14399 // of HitSet objects, which should not count toward our total since
14400 // they cannot be flushed.
14401 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
14403 // also exclude omap objects if ec backing pool
14404 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14405 ceph_assert(base_pool
);
14406 if (!base_pool
->supports_omap())
14407 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
14409 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
14410 if (num_user_objects
> unflushable
)
14411 num_user_objects
-= unflushable
;
14413 num_user_objects
= 0;
14415 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
14416 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
14417 num_user_bytes
-= unflushable_bytes
;
14418 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
14419 num_user_bytes
+= num_overhead_bytes
;
14421 // also reduce the num_dirty by num_objects_omap
14422 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
14423 if (!base_pool
->supports_omap()) {
14424 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
14425 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
14430 dout(10) << __func__
14432 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
14434 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
14435 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
14436 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
14437 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
14438 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
14439 << " num_dirty: " << num_dirty
14440 << " num_user_objects: " << num_user_objects
14441 << " num_user_bytes: " << num_user_bytes
14442 << " num_overhead_bytes: " << num_overhead_bytes
14443 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
14444 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
14447 // get dirty, full ratios
14448 uint64_t dirty_micro
= 0;
14449 uint64_t full_micro
= 0;
14450 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
14451 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
14453 num_dirty
* avg_size
* 1000000 /
14454 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
14456 num_user_objects
* avg_size
* 1000000 /
14457 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
14459 if (pool
.info
.target_max_objects
> 0) {
14460 uint64_t dirty_objects_micro
=
14461 num_dirty
* 1000000 /
14462 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
14463 if (dirty_objects_micro
> dirty_micro
)
14464 dirty_micro
= dirty_objects_micro
;
14465 uint64_t full_objects_micro
=
14466 num_user_objects
* 1000000 /
14467 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
14468 if (full_objects_micro
> full_micro
)
14469 full_micro
= full_objects_micro
;
14471 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
14472 << " full " << ((float)full_micro
/ 1000000.0)
14476 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
14477 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
14478 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
14479 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
14480 flush_target
+= flush_slop
;
14481 flush_high_target
+= flush_slop
;
14483 flush_target
-= std::min(flush_target
, flush_slop
);
14484 flush_high_target
-= std::min(flush_high_target
, flush_slop
);
14487 if (dirty_micro
> flush_high_target
) {
14488 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
14489 } else if (dirty_micro
> flush_target
|| (!flush_target
&& num_dirty
> 0)) {
14490 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
14494 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
14495 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
14496 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
14497 evict_target
+= evict_slop
;
14499 evict_target
-= std::min(evict_target
, evict_slop
);
14501 if (full_micro
> 1000000) {
14502 // evict anything clean
14503 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
14504 evict_effort
= 1000000;
14505 } else if (full_micro
> evict_target
) {
14506 // set effort in [0..1] range based on where we are between
14507 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
14508 uint64_t over
= full_micro
- evict_target
;
14509 uint64_t span
= 1000000 - evict_target
;
14510 evict_effort
= std::max(over
* 1000000 / span
,
14511 uint64_t(1000000.0 *
14512 cct
->_conf
->osd_agent_min_evict_effort
));
14514 // quantize effort to avoid too much reordering in the agent_queue.
14515 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
14516 ceph_assert(inc
> 0);
14517 uint64_t was
= evict_effort
;
14518 evict_effort
-= evict_effort
% inc
;
14519 if (evict_effort
< inc
)
14520 evict_effort
= inc
;
14521 ceph_assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
14522 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
14527 bool old_idle
= agent_state
->is_idle();
14528 if (flush_mode
!= agent_state
->flush_mode
) {
14529 dout(5) << __func__
<< " flush_mode "
14530 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
14532 << TierAgentState::get_flush_mode_name(flush_mode
)
14534 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
14535 osd
->agent_inc_high_count();
14536 info
.stats
.stats
.sum
.num_flush_mode_high
= 1;
14537 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
14538 info
.stats
.stats
.sum
.num_flush_mode_low
= 1;
14540 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
14541 osd
->agent_dec_high_count();
14542 info
.stats
.stats
.sum
.num_flush_mode_high
= 0;
14543 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
14544 info
.stats
.stats
.sum
.num_flush_mode_low
= 0;
14546 agent_state
->flush_mode
= flush_mode
;
14548 if (evict_mode
!= agent_state
->evict_mode
) {
14549 dout(5) << __func__
<< " evict_mode "
14550 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
14552 << TierAgentState::get_evict_mode_name(evict_mode
)
14554 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
14558 requeue_ops(waiting_for_flush
);
14559 requeue_ops(waiting_for_active
);
14560 requeue_ops(waiting_for_scrub
);
14561 requeue_ops(waiting_for_cache_not_full
);
14562 objects_blocked_on_cache_full
.clear();
14565 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
14566 info
.stats
.stats
.sum
.num_evict_mode_some
= 1;
14567 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
14568 info
.stats
.stats
.sum
.num_evict_mode_full
= 1;
14570 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
14571 info
.stats
.stats
.sum
.num_evict_mode_some
= 0;
14572 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
14573 info
.stats
.stats
.sum
.num_evict_mode_full
= 0;
14575 agent_state
->evict_mode
= evict_mode
;
14577 uint64_t old_effort
= agent_state
->evict_effort
;
14578 if (evict_effort
!= agent_state
->evict_effort
) {
14579 dout(5) << __func__
<< " evict_effort "
14580 << ((float)agent_state
->evict_effort
/ 1000000.0)
14582 << ((float)evict_effort
/ 1000000.0)
14584 agent_state
->evict_effort
= evict_effort
;
14587 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14588 // (including flush). This is probably fine (they should be
14589 // correlated) but it is not precisely correct.
14590 if (agent_state
->is_idle()) {
14591 if (!restart
&& !old_idle
) {
14592 osd
->agent_disable_pg(this, old_effort
);
14595 if (restart
|| old_idle
) {
14596 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
14597 } else if (old_effort
!= agent_state
->evict_effort
) {
14598 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
14604 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
14606 ceph_assert(hit_set
);
14609 if (hit_set
->contains(oid
))
14612 int last_n
= pool
.info
.hit_set_search_last_n
;
14613 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
14614 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
14615 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
14616 if (p
->second
->contains(oid
)) {
14617 *temp
+= pool
.info
.get_grade(i
);
14623 // Dup op detection
14625 bool PrimaryLogPG::already_complete(eversion_t v
)
14627 dout(20) << __func__
<< ": " << v
<< dendl
;
14628 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
14631 dout(20) << __func__
<< ": " << **i
<< dendl
;
14632 // skip copy from temp object ops
14633 if ((*i
)->v
== eversion_t()) {
14634 dout(20) << __func__
<< ": " << **i
14635 << " version is empty" << dendl
;
14639 dout(20) << __func__
<< ": " << **i
14640 << " (*i)->v past v" << dendl
;
14643 if (!(*i
)->all_committed
) {
14644 dout(20) << __func__
<< ": " << **i
14645 << " not committed, returning false"
14650 dout(20) << __func__
<< ": returning true" << dendl
;
14654 bool PrimaryLogPG::already_ack(eversion_t v
)
14656 dout(20) << __func__
<< ": " << v
<< dendl
;
14657 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
14660 // skip copy from temp object ops
14661 if ((*i
)->v
== eversion_t()) {
14662 dout(20) << __func__
<< ": " << **i
14663 << " version is empty" << dendl
;
14667 dout(20) << __func__
<< ": " << **i
14668 << " (*i)->v past v" << dendl
;
14672 dout(20) << __func__
<< ": returning true" << dendl
;
14677 // ==========================================================================================
14681 bool PrimaryLogPG::_range_available_for_scrub(
14682 const hobject_t
&begin
, const hobject_t
&end
)
14684 pair
<hobject_t
, ObjectContextRef
> next
;
14685 next
.second
= object_contexts
.lookup(begin
);
14686 next
.first
= begin
;
14688 while (more
&& next
.first
< end
) {
14689 if (next
.second
&& next
.second
->is_blocked()) {
14690 next
.second
->requeue_scrub_on_unblock
= true;
14691 dout(10) << __func__
<< ": scrub delayed, "
14692 << next
.first
<< " is blocked"
14696 more
= object_contexts
.get_next(next
.first
, &next
);
14701 static bool doing_clones(const boost::optional
<SnapSet
> &snapset
,
14702 const vector
<snapid_t
>::reverse_iterator
&curclone
) {
14703 return snapset
&& curclone
!= snapset
.get().clones
.rend();
14706 void PrimaryLogPG::log_missing(unsigned missing
,
14707 const boost::optional
<hobject_t
> &head
,
14708 LogChannelRef clog
,
14712 bool allow_incomplete_clones
)
14715 if (allow_incomplete_clones
) {
14716 dout(20) << func
<< " " << mode
<< " " << pgid
<< " " << head
.get()
14717 << " skipped " << missing
<< " clone(s) in cache tier" << dendl
;
14719 clog
->info() << mode
<< " " << pgid
<< " " << head
.get()
14720 << " : " << missing
<< " missing clone(s)";
14724 unsigned PrimaryLogPG::process_clones_to(const boost::optional
<hobject_t
> &head
,
14725 const boost::optional
<SnapSet
> &snapset
,
14726 LogChannelRef clog
,
14729 bool allow_incomplete_clones
,
14730 boost::optional
<snapid_t
> target
,
14731 vector
<snapid_t
>::reverse_iterator
*curclone
,
14732 inconsistent_snapset_wrapper
&e
)
14735 ceph_assert(snapset
);
14736 unsigned missing
= 0;
14738 // NOTE: clones are in descending order, thus **curclone > target test here
14739 hobject_t
next_clone(head
.get());
14740 while(doing_clones(snapset
, *curclone
) && (!target
|| **curclone
> *target
)) {
14742 // it is okay to be missing one or more clones in a cache tier.
14743 // skip higher-numbered clones in the list.
14744 if (!allow_incomplete_clones
) {
14745 next_clone
.snap
= **curclone
;
14746 clog
->error() << mode
<< " " << pgid
<< " " << head
.get()
14747 << " : expected clone " << next_clone
<< " " << missing
14749 ++scrubber
.shallow_errors
;
14750 e
.set_clone_missing(next_clone
.snap
);
14752 // Clones are descending
14759 * Validate consistency of the object info and snap sets.
14761 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
14762 * the comparison of the objects is against multiple snapset.clones. There are
14763 * multiple clone lists and in between lists we expect head.
14769 * obj1 snap 1 head, unexpected obj1 snap 1
14770 * obj2 head head, match
14771 * [SnapSet clones 6 4 2 1]
14772 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14773 * obj2 snap 6 obj2 snap 6, match
14774 * obj2 snap 4 obj2 snap 4, match
14775 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
14776 * [Snapset clones 3 1]
14777 * obj3 snap 3 obj3 snap 3 match
14778 * obj3 snap 1 obj3 snap 1 match
14779 * obj4 head head, match
14780 * [Snapset clones 4]
14781 * EOL obj4 snap 4, (expected)
14783 void PrimaryLogPG::scrub_snapshot_metadata(
14784 ScrubMap
&scrubmap
,
14785 const map
<hobject_t
,
14786 pair
<boost::optional
<uint32_t>,
14787 boost::optional
<uint32_t>>> &missing_digest
)
14789 dout(10) << __func__
<< dendl
;
14791 bool repair
= state_test(PG_STATE_REPAIR
);
14792 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
14793 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
14794 boost::optional
<snapid_t
> all_clones
; // Unspecified snapid_t or boost::none
14796 // traverse in reverse order.
14797 boost::optional
<hobject_t
> head
;
14798 boost::optional
<SnapSet
> snapset
; // If initialized so will head (above)
14799 vector
<snapid_t
>::reverse_iterator curclone
; // Defined only if snapset initialized
14800 unsigned missing
= 0;
14801 inconsistent_snapset_wrapper soid_error
, head_error
;
14802 unsigned soid_error_count
= 0;
14804 for (map
<hobject_t
,ScrubMap::object
>::reverse_iterator
14805 p
= scrubmap
.objects
.rbegin(); p
!= scrubmap
.objects
.rend(); ++p
) {
14806 const hobject_t
& soid
= p
->first
;
14807 ceph_assert(!soid
.is_snapdir());
14808 soid_error
= inconsistent_snapset_wrapper
{soid
};
14809 object_stat_sum_t stat
;
14810 boost::optional
<object_info_t
> oi
;
14812 stat
.num_objects
++;
14814 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
14815 stat
.num_objects_hit_set_archive
++;
14817 if (soid
.is_snap()) {
14819 stat
.num_object_clones
++;
14823 if (p
->second
.attrs
.count(OI_ATTR
) == 0) {
14825 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14826 << " : no '" << OI_ATTR
<< "' attr";
14827 ++scrubber
.shallow_errors
;
14828 soid_error
.set_info_missing();
14831 bv
.push_back(p
->second
.attrs
[OI_ATTR
]);
14833 oi
= object_info_t(); // Initialize optional<> before decode into it
14834 oi
.get().decode(bv
);
14835 } catch (buffer::error
& e
) {
14837 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14838 << " : can't decode '" << OI_ATTR
<< "' attr " << e
.what();
14839 ++scrubber
.shallow_errors
;
14840 soid_error
.set_info_corrupted();
14841 soid_error
.set_info_missing(); // Not available too
14846 if (pgbackend
->be_get_ondisk_size(oi
->size
) != p
->second
.size
) {
14847 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14848 << " : on disk size (" << p
->second
.size
14849 << ") does not match object info size ("
14850 << oi
->size
<< ") adjusted for ondisk to ("
14851 << pgbackend
->be_get_ondisk_size(oi
->size
)
14853 soid_error
.set_size_mismatch();
14854 ++scrubber
.shallow_errors
;
14857 dout(20) << mode
<< " " << soid
<< " " << oi
.get() << dendl
;
14859 // A clone num_bytes will be added later when we have snapset
14860 if (!soid
.is_snap()) {
14861 stat
.num_bytes
+= oi
->size
;
14863 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
14864 stat
.num_bytes_hit_set_archive
+= oi
->size
;
14866 if (oi
->is_dirty())
14867 ++stat
.num_objects_dirty
;
14868 if (oi
->is_whiteout())
14869 ++stat
.num_whiteouts
;
14871 ++stat
.num_objects_omap
;
14872 if (oi
->is_cache_pinned())
14873 ++stat
.num_objects_pinned
;
14874 if (oi
->has_manifest())
14875 ++stat
.num_objects_manifest
;
14878 // Check for any problems while processing clones
14879 if (doing_clones(snapset
, curclone
)) {
14880 boost::optional
<snapid_t
> target
;
14881 // Expecting an object with snap for current head
14882 if (soid
.has_snapset() || soid
.get_head() != head
->get_head()) {
14884 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
<< " new object "
14885 << soid
<< " while processing " << head
.get() << dendl
;
14887 target
= all_clones
;
14889 ceph_assert(soid
.is_snap());
14890 target
= soid
.snap
;
14893 // Log any clones we were expecting to be there up to target
14894 // This will set missing, but will be a no-op if snap.soid == *curclone.
14895 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
14896 pool
.info
.allow_incomplete_clones(), target
, &curclone
,
14900 // Check doing_clones() again in case we ran process_clones_to()
14901 if (doing_clones(snapset
, curclone
)) {
14902 // A head would have processed all clones above
14903 // or all greater than *curclone.
14904 ceph_assert(soid
.is_snap() && *curclone
<= soid
.snap
);
14906 // After processing above clone snap should match the expected curclone
14907 expected
= (*curclone
== soid
.snap
);
14909 // If we aren't doing clones any longer, then expecting head
14910 expected
= soid
.has_snapset();
14913 // If we couldn't read the head's snapset, just ignore clones
14914 if (head
&& !snapset
) {
14915 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14916 << " : clone ignored due to missing snapset";
14918 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14919 << " : is an unexpected clone";
14921 ++scrubber
.shallow_errors
;
14922 soid_error
.set_headless();
14923 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
14924 ++soid_error_count
;
14925 if (head
&& soid
.get_head() == head
->get_head())
14926 head_error
.set_clone(soid
.snap
);
14931 if (soid
.has_snapset()) {
14934 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
, mode
,
14935 pool
.info
.allow_incomplete_clones());
14938 // Save previous head error information
14939 if (head
&& (head_error
.errors
|| soid_error_count
))
14940 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
14941 // Set this as a new head object
14944 head_error
= soid_error
;
14945 soid_error_count
= 0;
14947 dout(20) << __func__
<< " " << mode
<< " new head " << head
<< dendl
;
14949 if (p
->second
.attrs
.count(SS_ATTR
) == 0) {
14950 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14951 << " : no '" << SS_ATTR
<< "' attr";
14952 ++scrubber
.shallow_errors
;
14953 snapset
= boost::none
;
14954 head_error
.set_snapset_missing();
14957 bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
14958 auto blp
= bl
.cbegin();
14960 snapset
= SnapSet(); // Initialize optional<> before decoding into it
14961 decode(snapset
.get(), blp
);
14962 head_error
.ss_bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
14963 } catch (buffer::error
& e
) {
14964 snapset
= boost::none
;
14965 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14966 << " : can't decode '" << SS_ATTR
<< "' attr " << e
.what();
14967 ++scrubber
.shallow_errors
;
14968 head_error
.set_snapset_corrupted();
14973 // what will be next?
14974 curclone
= snapset
->clones
.rbegin();
14976 if (!snapset
->clones
.empty()) {
14977 dout(20) << " snapset " << snapset
.get() << dendl
;
14978 if (snapset
->seq
== 0) {
14979 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14980 << " : snaps.seq not set";
14981 ++scrubber
.shallow_errors
;
14982 head_error
.set_snapset_error();
14987 ceph_assert(soid
.is_snap());
14989 ceph_assert(snapset
);
14990 ceph_assert(soid
.snap
== *curclone
);
14992 dout(20) << __func__
<< " " << mode
<< " matched clone " << soid
<< dendl
;
14994 if (snapset
->clone_size
.count(soid
.snap
) == 0) {
14995 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14996 << " : is missing in clone_size";
14997 ++scrubber
.shallow_errors
;
14998 soid_error
.set_size_mismatch();
15000 if (oi
&& oi
->size
!= snapset
->clone_size
[soid
.snap
]) {
15001 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
15002 << " : size " << oi
->size
<< " != clone_size "
15003 << snapset
->clone_size
[*curclone
];
15004 ++scrubber
.shallow_errors
;
15005 soid_error
.set_size_mismatch();
15008 if (snapset
->clone_overlap
.count(soid
.snap
) == 0) {
15009 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
15010 << " : is missing in clone_overlap";
15011 ++scrubber
.shallow_errors
;
15012 soid_error
.set_size_mismatch();
15014 // This checking is based on get_clone_bytes(). The first 2 asserts
15015 // can't happen because we know we have a clone_size and
15016 // a clone_overlap. Now we check that the interval_set won't
15017 // cause the last assert.
15018 uint64_t size
= snapset
->clone_size
.find(soid
.snap
)->second
;
15019 const interval_set
<uint64_t> &overlap
=
15020 snapset
->clone_overlap
.find(soid
.snap
)->second
;
15021 bool bad_interval_set
= false;
15022 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
15023 i
!= overlap
.end(); ++i
) {
15024 if (size
< i
.get_len()) {
15025 bad_interval_set
= true;
15028 size
-= i
.get_len();
15031 if (bad_interval_set
) {
15032 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
15033 << " : bad interval_set in clone_overlap";
15034 ++scrubber
.shallow_errors
;
15035 soid_error
.set_size_mismatch();
15037 stat
.num_bytes
+= snapset
->get_clone_bytes(soid
.snap
);
15044 if (soid_error
.errors
) {
15045 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
15046 ++soid_error_count
;
15050 scrub_cstat
.add(stat
);
15053 if (doing_clones(snapset
, curclone
)) {
15054 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
15055 << " No more objects while processing " << head
.get() << dendl
;
15057 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
15058 pool
.info
.allow_incomplete_clones(), all_clones
, &curclone
,
15061 // There could be missing found by the test above or even
15062 // before dropping out of the loop for the last head.
15064 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
,
15065 mode
, pool
.info
.allow_incomplete_clones());
15067 if (head
&& (head_error
.errors
|| soid_error_count
))
15068 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
15070 for (auto p
= missing_digest
.begin(); p
!= missing_digest
.end(); ++p
) {
15071 ceph_assert(!p
->first
.is_snapdir());
15072 dout(10) << __func__
<< " recording digests for " << p
->first
<< dendl
;
15073 ObjectContextRef obc
= get_object_context(p
->first
, false);
15075 osd
->clog
->error() << info
.pgid
<< " " << mode
15076 << " cannot get object context for object "
15079 } else if (obc
->obs
.oi
.soid
!= p
->first
) {
15080 osd
->clog
->error() << info
.pgid
<< " " << mode
15082 << " : object has a valid oi attr with a mismatched name, "
15083 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
15086 OpContextUPtr ctx
= simple_opc_create(obc
);
15087 ctx
->at_version
= get_next_version();
15088 ctx
->mtime
= utime_t(); // do not update mtime
15089 if (p
->second
.first
) {
15090 ctx
->new_obs
.oi
.set_data_digest(*p
->second
.first
);
15092 ctx
->new_obs
.oi
.clear_data_digest();
15094 if (p
->second
.second
) {
15095 ctx
->new_obs
.oi
.set_omap_digest(*p
->second
.second
);
15097 ctx
->new_obs
.oi
.clear_omap_digest();
15099 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
15101 ctx
->register_on_success(
15103 dout(20) << "updating scrub digest" << dendl
;
15104 if (--scrubber
.num_digest_updates_pending
== 0) {
15109 simple_opc_submit(std::move(ctx
));
15110 ++scrubber
.num_digest_updates_pending
;
15113 dout(10) << __func__
<< " (" << mode
<< ") finish" << dendl
;
15116 void PrimaryLogPG::_scrub_clear_state()
15118 scrub_cstat
= object_stat_collection_t();
15121 void PrimaryLogPG::_scrub_finish()
15123 bool repair
= state_test(PG_STATE_REPAIR
);
15124 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
15125 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
15127 if (info
.stats
.stats_invalid
) {
15128 info
.stats
.stats
= scrub_cstat
;
15129 info
.stats
.stats_invalid
= false;
15132 agent_choose_mode();
15135 dout(10) << mode
<< " got "
15136 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
15137 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
15138 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
15139 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
15140 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
15141 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
15142 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
15143 << scrub_cstat
.sum
.num_objects_manifest
<< "/" << info
.stats
.stats
.sum
.num_objects_manifest
<< " manifest objects, "
15144 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes."
15147 if (scrub_cstat
.sum
.num_objects
!= info
.stats
.stats
.sum
.num_objects
||
15148 scrub_cstat
.sum
.num_object_clones
!= info
.stats
.stats
.sum
.num_object_clones
||
15149 (scrub_cstat
.sum
.num_objects_dirty
!= info
.stats
.stats
.sum
.num_objects_dirty
&&
15150 !info
.stats
.dirty_stats_invalid
) ||
15151 (scrub_cstat
.sum
.num_objects_omap
!= info
.stats
.stats
.sum
.num_objects_omap
&&
15152 !info
.stats
.omap_stats_invalid
) ||
15153 (scrub_cstat
.sum
.num_objects_pinned
!= info
.stats
.stats
.sum
.num_objects_pinned
&&
15154 !info
.stats
.pin_stats_invalid
) ||
15155 (scrub_cstat
.sum
.num_objects_hit_set_archive
!= info
.stats
.stats
.sum
.num_objects_hit_set_archive
&&
15156 !info
.stats
.hitset_stats_invalid
) ||
15157 (scrub_cstat
.sum
.num_bytes_hit_set_archive
!= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
&&
15158 !info
.stats
.hitset_bytes_stats_invalid
) ||
15159 (scrub_cstat
.sum
.num_objects_manifest
!= info
.stats
.stats
.sum
.num_objects_manifest
&&
15160 !info
.stats
.manifest_stats_invalid
) ||
15161 scrub_cstat
.sum
.num_whiteouts
!= info
.stats
.stats
.sum
.num_whiteouts
||
15162 scrub_cstat
.sum
.num_bytes
!= info
.stats
.stats
.sum
.num_bytes
) {
15163 osd
->clog
->error() << info
.pgid
<< " " << mode
15164 << " : stat mismatch, got "
15165 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
15166 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
15167 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
15168 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
15169 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
15170 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
15171 << scrub_cstat
.sum
.num_whiteouts
<< "/" << info
.stats
.stats
.sum
.num_whiteouts
<< " whiteouts, "
15172 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
15173 << scrub_cstat
.sum
.num_objects_manifest
<< "/" << info
.stats
.stats
.sum
.num_objects_manifest
<< " manifest objects, "
15174 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes.";
15175 ++scrubber
.shallow_errors
;
15179 info
.stats
.stats
= scrub_cstat
;
15180 info
.stats
.dirty_stats_invalid
= false;
15181 info
.stats
.omap_stats_invalid
= false;
15182 info
.stats
.hitset_stats_invalid
= false;
15183 info
.stats
.hitset_bytes_stats_invalid
= false;
15184 info
.stats
.pin_stats_invalid
= false;
15185 info
.stats
.manifest_stats_invalid
= false;
15186 publish_stats_to_osd();
15190 // Clear object context cache to get repair information
15192 object_contexts
.clear();
15195 bool PrimaryLogPG::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
15197 return osd
->check_osdmap_full(missing_on
);
15200 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpContext
*ctx
)
15202 OpRequestRef op
= ctx
->op
;
15203 // Only supports replicated pools
15204 ceph_assert(!pool
.info
.is_erasure());
15205 ceph_assert(is_primary());
15207 dout(10) << __func__
<< " " << soid
15208 << " peers osd.{" << acting_recovery_backfill
<< "}" << dendl
;
15211 block_for_clean(soid
, op
);
15215 ceph_assert(!pg_log
.get_missing().is_missing(soid
));
15216 auto& oi
= ctx
->new_obs
.oi
;
15217 eversion_t v
= oi
.version
;
15219 missing_loc
.add_missing(soid
, v
, eversion_t());
15220 if (primary_error(soid
, v
)) {
15221 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
15222 // XXX: If we knew that there is no down osd which could include this
15223 // object, it would be nice if we could return EIO here.
15224 // If a "never fail" flag was available, that could be used
15225 // for rbd to NOT return EIO until object marked lost.
15227 // Drop through to save this op in case an osd comes up with the object.
15230 // Restart the op after object becomes readable again
15231 waiting_for_unreadable_object
[soid
].push_back(op
);
15232 op
->mark_delayed("waiting for missing object");
15234 if (!eio_errors_to_process
) {
15235 eio_errors_to_process
= true;
15236 ceph_assert(is_clean());
15237 state_set(PG_STATE_REPAIR
);
15238 queue_peering_event(
15240 std::make_shared
<PGPeeringEvent
>(
15241 get_osdmap_epoch(),
15242 get_osdmap_epoch(),
15245 // A prior error must have already cleared clean state and queued recovery
15246 // or a map change has triggered re-peering.
15247 // Not inlining the recovery by calling maybe_kick_recovery(soid);
15248 dout(5) << __func__
<< ": Read error on " << soid
<< ", but already seen errors" << dendl
;
15254 /*---SnapTrimmer Logging---*/
15256 #define dout_prefix pg->gen_prefix(*_dout)
15258 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
15260 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
15263 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
15265 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
15268 /*---SnapTrimmer states---*/
15270 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15271 << "SnapTrimmer state<" << get_state_name() << ">: ")
15274 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
15276 NamedState(context
< SnapTrimmer
>().pg
, "NotTrimming")
15278 context
< SnapTrimmer
>().log_enter(state_name
);
15281 void PrimaryLogPG::NotTrimming::exit()
15283 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
15286 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
15288 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15289 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
15291 if (!(pg
->is_primary() && pg
->is_active())) {
15292 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
15293 return discard_event();
15295 if (!pg
->is_clean() ||
15296 pg
->snap_trimq
.empty()) {
15297 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
15298 return discard_event();
15300 if (pg
->scrubber
.active
) {
15301 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
15302 return transit
< WaitScrub
>();
15304 return transit
< Trimming
>();
15308 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
15310 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15311 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
15314 if (!context
< SnapTrimmer
>().can_trim()) {
15315 post_event(KickTrim());
15316 return transit
< NotTrimming
>();
15319 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
15320 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
15321 << pg
->snap_trimq
.range_start()
15323 return transit
< AwaitAsyncWork
>();
15326 /* AwaitAsyncWork */
15327 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
15329 NamedState(context
< SnapTrimmer
>().pg
, "Trimming/AwaitAsyncWork")
15331 auto *pg
= context
< SnapTrimmer
>().pg
;
15332 context
< SnapTrimmer
>().log_enter(state_name
);
15333 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
15334 pg
->state_set(PG_STATE_SNAPTRIM
);
15335 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
15336 pg
->publish_stats_to_osd();
15339 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
15341 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
15342 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
15343 auto &in_flight
= context
<Trimming
>().in_flight
;
15344 ceph_assert(in_flight
.empty());
15346 ceph_assert(pg
->is_primary() && pg
->is_active());
15347 if (!context
< SnapTrimmer
>().can_trim()) {
15348 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
15349 post_event(KickTrim());
15350 return transit
< NotTrimming
>();
15353 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
15355 vector
<hobject_t
> to_trim
;
15356 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
15357 to_trim
.reserve(max
);
15358 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
15362 if (r
!= 0 && r
!= -ENOENT
) {
15363 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
15364 << cpp_strerror(r
) << dendl
;
15365 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15366 } else if (r
== -ENOENT
) {
15368 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
15370 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
15371 << " to purged_snaps"
15373 pg
->info
.purged_snaps
.insert(snap_to_trim
);
15374 pg
->snap_trimq
.erase(snap_to_trim
);
15375 ldout(pg
->cct
, 10) << "purged_snaps now "
15376 << pg
->info
.purged_snaps
<< ", snap_trimq now "
15377 << pg
->snap_trimq
<< dendl
;
15379 ObjectStore::Transaction t
;
15380 pg
->dirty_big_info
= true;
15381 pg
->write_if_dirty(t
);
15382 int tr
= pg
->osd
->store
->queue_transaction(pg
->ch
, std::move(t
), NULL
);
15383 ceph_assert(tr
== 0);
15385 pg
->share_pg_info();
15386 post_event(KickTrim());
15387 return transit
< NotTrimming
>();
15389 ceph_assert(!to_trim
.empty());
15391 for (auto &&object
: to_trim
) {
15393 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
15395 int error
= pg
->trim_object(in_flight
.empty(), object
, &ctx
);
15397 if (error
== -ENOLCK
) {
15398 ldout(pg
->cct
, 10) << "could not get write lock on obj "
15399 << object
<< dendl
;
15401 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
15402 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
15404 if (!in_flight
.empty()) {
15405 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
15406 return transit
< WaitRepops
>();
15408 if (error
== -ENOLCK
) {
15409 ldout(pg
->cct
, 10) << "waiting for it to clear"
15411 return transit
< WaitRWLock
>();
15413 return transit
< NotTrimming
>();
15417 in_flight
.insert(object
);
15418 ctx
->register_on_success(
15419 [pg
, object
, &in_flight
]() {
15420 ceph_assert(in_flight
.find(object
) != in_flight
.end());
15421 in_flight
.erase(object
);
15422 if (in_flight
.empty()) {
15423 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
15424 pg
->snap_trimmer_machine
.process_event(Reset());
15426 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
15431 pg
->simple_opc_submit(std::move(ctx
));
15434 return transit
< WaitRepops
>();
15437 void PrimaryLogPG::setattr_maybe_cache(
15438 ObjectContextRef obc
,
15443 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
15446 void PrimaryLogPG::setattrs_maybe_cache(
15447 ObjectContextRef obc
,
15449 map
<string
, bufferlist
> &attrs
)
15451 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
15454 void PrimaryLogPG::rmattr_maybe_cache(
15455 ObjectContextRef obc
,
15459 t
->rmattr(obc
->obs
.oi
.soid
, key
);
15462 int PrimaryLogPG::getattr_maybe_cache(
15463 ObjectContextRef obc
,
15467 if (pool
.info
.is_erasure()) {
15468 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
15469 if (i
!= obc
->attr_cache
.end()) {
15477 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
15480 int PrimaryLogPG::getattrs_maybe_cache(
15481 ObjectContextRef obc
,
15482 map
<string
, bufferlist
> *out
)
15486 if (pool
.info
.is_erasure()) {
15487 *out
= obc
->attr_cache
;
15489 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
15491 map
<string
, bufferlist
> tmp
;
15492 for (map
<string
, bufferlist
>::iterator i
= out
->begin();
15495 if (i
->first
.size() > 1 && i
->first
[0] == '_')
15496 tmp
[i
->first
.substr(1, i
->first
.size())].claim(i
->second
);
15502 bool PrimaryLogPG::check_failsafe_full() {
15503 return osd
->check_failsafe_full(get_dpp());
15506 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
15507 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
15509 #ifdef PG_DEBUG_REFS
15510 uint64_t get_with_id(PrimaryLogPG
*pg
) { return pg
->get_with_id(); }
15511 void put_with_id(PrimaryLogPG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
15514 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
15515 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }