1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include "boost/tuple/tuple.hpp"
19 #include "boost/intrusive_ptr.hpp"
21 #include "PrimaryLogPG.h"
23 #include "OpRequest.h"
24 #include "ScrubStore.h"
26 #include "objclass/objclass.h"
28 #include "common/errno.h"
29 #include "common/scrub_types.h"
30 #include "common/perf_counters.h"
32 #include "messages/MOSDOp.h"
33 #include "messages/MOSDBackoff.h"
34 #include "messages/MOSDSubOp.h"
35 #include "messages/MOSDSubOpReply.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDRepScrub.h"
39 #include "messages/MOSDPGBackfill.h"
40 #include "messages/MOSDPGBackfillRemove.h"
41 #include "messages/MOSDPGUpdateLogMissing.h"
42 #include "messages/MOSDPGUpdateLogMissingReply.h"
43 #include "messages/MCommandReply.h"
44 #include "messages/MOSDScrubReserve.h"
45 #include "mds/inode_backtrace.h" // Ugh
46 #include "common/EventTrace.h"
48 #include "common/config.h"
49 #include "include/compat.h"
50 #include "mon/MonClient.h"
51 #include "osdc/Objecter.h"
52 #include "json_spirit/json_spirit_value.h"
53 #include "json_spirit/json_spirit_reader.h"
54 #include "include/assert.h" // json_spirit clobbers it
55 #include "include/rados/rados_types.hpp"
58 #include "tracing/osd.h"
60 #define tracepoint(...)
63 #define dout_context cct
64 #define dout_subsys ceph_subsys_osd
65 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
67 #define dout_prefix _prefix(_dout, this)
69 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
70 return *_dout
<< pg
->gen_prefix();
79 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
81 PGLSFilter::PGLSFilter() : cct(nullptr)
85 PGLSFilter::~PGLSFilter()
89 struct PrimaryLogPG::C_OSD_OnApplied
: Context
{
97 : pg(pg
), epoch(epoch
), v(v
) {}
98 void finish(int) override
{
100 if (!pg
->pg_has_reset_since(epoch
))
107 * The CopyCallback class defines an interface for completions to the
108 * copy_start code. Users of the copy infrastructure must implement
109 * one and give an instance of the class to start_copy.
111 * The implementer is responsible for making sure that the CopyCallback
112 * can associate itself with the correct copy operation.
114 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
118 * results.get<0>() is the return code: 0 for success; -ECANCELED if
119 * the operation was cancelled by the local OSD; -errno for other issues.
120 * results.get<1>() is a pointer to a CopyResults object, which you are
121 * responsible for deleting.
123 void finish(CopyCallbackResults results_
) override
= 0;
126 /// Provide the final size of the copied object to the CopyCallback
127 ~CopyCallback() override
{}
130 template <typename T
>
131 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
133 unique_ptr
<GenContext
<T
>> c
;
136 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
137 : pg(pg
), c(c
), e(e
) {}
138 void finish(T t
) override
{
140 if (pg
->pg_has_reset_since(e
))
143 c
.release()->complete(t
);
148 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
149 GenContext
<ThreadPool::TPHandle
&> *c
) {
150 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
151 this, c
, get_osdmap()->get_epoch());
154 class PrimaryLogPG::BlessedContext
: public Context
{
156 unique_ptr
<Context
> c
;
159 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
160 : pg(pg
), c(c
), e(e
) {}
161 void finish(int r
) override
{
163 if (pg
->pg_has_reset_since(e
))
166 c
.release()->complete(r
);
172 Context
*PrimaryLogPG::bless_context(Context
*c
) {
173 return new BlessedContext(this, c
, get_osdmap()->get_epoch());
176 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
180 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
182 void finish(int r
) override
{
183 pg
->object_context_destructor_callback(obc
);
187 class PrimaryLogPG::C_OSD_OndiskWriteUnlock
: public Context
{
188 ObjectContextRef obc
, obc2
, obc3
;
190 C_OSD_OndiskWriteUnlock(
192 ObjectContextRef o2
= ObjectContextRef(),
193 ObjectContextRef o3
= ObjectContextRef()) : obc(o
), obc2(o2
), obc3(o3
) {}
194 void finish(int r
) override
{
195 obc
->ondisk_write_unlock();
197 obc2
->ondisk_write_unlock();
199 obc3
->ondisk_write_unlock();
203 struct OnReadComplete
: public Context
{
205 PrimaryLogPG::OpContext
*opcontext
;
208 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
209 void finish(int r
) override
{
210 opcontext
->finish_read(pg
);
212 ~OnReadComplete() override
{}
215 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
217 ObjectContextRef obc
;
219 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
221 void finish(int r
) override
{
222 pg
->_applied_recovered_object(obc
);
226 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
229 eversion_t last_complete
;
231 C_OSD_CommittedPushedObject(
232 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
233 pg(p
), epoch(epoch
), last_complete(lc
) {
235 void finish(int r
) override
{
236 pg
->_committed_pushed_object(epoch
, last_complete
);
240 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
243 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
245 void finish(int r
) override
{
246 pg
->_applied_recovered_object_replica();
251 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
254 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
255 pair
<bufferlist
*, Context
*> > > in
;
256 in
.swap(pending_async_reads
);
257 pg
->pgbackend
->objects_read_async(
260 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
262 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
264 assert(inflightreads
> 0);
266 if (async_reads_complete()) {
267 assert(pg
->in_progress_async_reads
.size());
268 assert(pg
->in_progress_async_reads
.front().second
== this);
269 pg
->in_progress_async_reads
.pop_front();
271 // Restart the op context now that all reads have been
272 // completed. Read failures will be handled by the op finisher
273 pg
->execute_ctx(this);
277 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
279 PrimaryLogPG::CopyResults
*results
= nullptr;
280 PrimaryLogPG::OpContext
*ctx
;
283 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
284 : ctx(ctx
), osd_op(osd_op
) {
286 ~CopyFromCallback() override
{}
288 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
289 results
= results_
.get
<1>();
290 int r
= results_
.get
<0>();
292 // for finish_copyfrom
293 ctx
->user_at_version
= results
->user_version
;
296 ctx
->pg
->execute_ctx(ctx
);
298 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
300 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
301 } else if (results
->should_requeue
) {
303 ctx
->pg
->requeue_op(ctx
->op
);
305 ctx
->pg
->close_op_ctx(ctx
);
309 bool is_temp_obj_used() {
310 return results
->started_temp_obj
;
312 uint64_t get_data_size() {
313 return results
->object_size
;
317 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
318 CopyFromCallback
*copy_from_callback
;
320 CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
321 : copy_from_callback(copy_from_callback
) {
324 int execute() override
{
325 // instance will be destructed after this method completes
326 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
331 // ======================
332 // PGBackend::Listener
334 void PrimaryLogPG::on_local_recover(
335 const hobject_t
&hoid
,
336 const ObjectRecoveryInfo
&_recovery_info
,
337 ObjectContextRef obc
,
339 ObjectStore::Transaction
*t
342 dout(10) << __func__
<< ": " << hoid
<< dendl
;
344 ObjectRecoveryInfo
recovery_info(_recovery_info
);
345 clear_object_snap_mapping(t
, hoid
);
346 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
347 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
349 dout(20) << " snapset " << recovery_info
.ss
350 << " legacy_snaps " << recovery_info
.oi
.legacy_snaps
<< dendl
;
352 if (recovery_info
.ss
.is_legacy() ||
353 recovery_info
.ss
.seq
== 0 /* jewel osd doesn't populate this */) {
354 assert(recovery_info
.oi
.legacy_snaps
.size());
355 snaps
.insert(recovery_info
.oi
.legacy_snaps
.begin(),
356 recovery_info
.oi
.legacy_snaps
.end());
358 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
359 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
360 snaps
.insert(p
->second
.begin(), p
->second
.end());
362 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
367 dout(20) << " snaps " << snaps
<< dendl
;
374 if (!is_delete
&& pg_log
.get_missing().is_missing(recovery_info
.soid
) &&
375 pg_log
.get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
376 assert(is_primary());
377 const pg_log_entry_t
*latest
= pg_log
.get_log().objects
.find(recovery_info
.soid
)->second
;
378 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
379 latest
->reverting_to
== recovery_info
.version
) {
380 dout(10) << " got old revert version " << recovery_info
.version
381 << " for " << *latest
<< dendl
;
382 recovery_info
.version
= latest
->version
;
383 // update the attr to the revert event version
384 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
385 recovery_info
.oi
.version
= latest
->version
;
387 ::encode(recovery_info
.oi
, bl
,
388 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
389 assert(!pool
.info
.require_rollback());
390 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
392 obc
->attr_cache
[OI_ATTR
] = bl
;
396 // keep track of active pushes for scrub
399 if (recovery_info
.version
> pg_log
.get_can_rollback_to()) {
400 /* This can only happen during a repair, and even then, it would
401 * be one heck of a race. If we are repairing the object, the
402 * write in question must be fully committed, so it's not valid
403 * to roll it back anyway (and we'll be rolled forward shortly
405 PGLogEntryHandler h
{this, t
};
406 pg_log
.roll_forward_to(recovery_info
.version
, &h
);
408 recover_got(recovery_info
.soid
, recovery_info
.version
);
412 obc
->obs
.exists
= true;
413 obc
->ondisk_write_lock();
415 bool got
= obc
->get_recovery_read();
418 assert(recovering
.count(obc
->obs
.oi
.soid
));
419 recovering
[obc
->obs
.oi
.soid
] = obc
;
420 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
421 t
->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc
));
424 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
426 publish_stats_to_osd();
427 assert(missing_loc
.needs_recovery(hoid
));
429 missing_loc
.add_location(hoid
, pg_whoami
);
430 release_backoffs(hoid
);
431 if (!is_unreadable_object(hoid
)) {
432 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
433 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
434 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
435 requeue_ops(unreadable_object_entry
->second
);
436 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
440 t
->register_on_applied(
441 new C_OSD_AppliedRecoveredObjectReplica(this));
445 t
->register_on_commit(
446 new C_OSD_CommittedPushedObject(
448 get_osdmap()->get_epoch(),
449 info
.last_complete
));
456 void PrimaryLogPG::on_global_recover(
457 const hobject_t
&soid
,
458 const object_stat_sum_t
&stat_diff
,
461 info
.stats
.stats
.sum
.add(stat_diff
);
462 missing_loc
.recovered(soid
);
463 publish_stats_to_osd();
464 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
465 map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.find(soid
);
466 assert(i
!= recovering
.end());
469 // recover missing won't have had an obc, but it gets filled in
470 // during on_local_recover
472 list
<OpRequestRef
> requeue_list
;
473 i
->second
->drop_recovery_read(&requeue_list
);
474 requeue_ops(requeue_list
);
477 backfills_in_flight
.erase(soid
);
480 finish_recovery_op(soid
);
481 release_backoffs(soid
);
482 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
483 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
484 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
485 requeue_ops(degraded_object_entry
->second
);
486 waiting_for_degraded_object
.erase(degraded_object_entry
);
488 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
489 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
490 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
491 requeue_ops(unreadable_object_entry
->second
);
492 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
494 finish_degraded_object(soid
);
497 void PrimaryLogPG::on_peer_recover(
499 const hobject_t
&soid
,
500 const ObjectRecoveryInfo
&recovery_info
)
502 publish_stats_to_osd();
504 peer_missing
[peer
].got(soid
, recovery_info
.version
);
507 void PrimaryLogPG::begin_peer_recover(
509 const hobject_t soid
)
511 peer_missing
[peer
].revise_have(soid
, eversion_t());
514 void PrimaryLogPG::schedule_recovery_work(
515 GenContext
<ThreadPool::TPHandle
&> *c
)
517 osd
->recovery_gen_wq
.queue(c
);
520 void PrimaryLogPG::send_message_osd_cluster(
521 int peer
, Message
*m
, epoch_t from_epoch
)
523 osd
->send_message_osd_cluster(peer
, m
, from_epoch
);
526 void PrimaryLogPG::send_message_osd_cluster(
527 Message
*m
, Connection
*con
)
529 osd
->send_message_osd_cluster(m
, con
);
532 void PrimaryLogPG::send_message_osd_cluster(
533 Message
*m
, const ConnectionRef
& con
)
535 osd
->send_message_osd_cluster(m
, con
);
538 void PrimaryLogPG::on_primary_error(
539 const hobject_t
&oid
,
542 dout(0) << __func__
<< ": oid " << oid
<< " version " << v
<< dendl
;
544 primary_error(oid
, v
);
545 backfill_add_missing(oid
, v
);
548 void PrimaryLogPG::backfill_add_missing(
549 const hobject_t
&oid
,
552 dout(0) << __func__
<< ": oid " << oid
<< " version " << v
<< dendl
;
553 backfills_in_flight
.erase(oid
);
554 missing_loc
.add_missing(oid
, v
, eversion_t());
557 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
558 int peer
, epoch_t from_epoch
)
560 return osd
->get_con_osd_cluster(peer
, from_epoch
);
563 PerfCounters
*PrimaryLogPG::get_logger()
569 // ====================
572 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
574 return pg_log
.get_missing().get_items().count(soid
);
577 void PrimaryLogPG::maybe_kick_recovery(
578 const hobject_t
&soid
)
581 if (!missing_loc
.needs_recovery(soid
, &v
))
584 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
585 if (p
!= recovering
.end()) {
586 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
587 } else if (missing_loc
.is_unfound(soid
)) {
588 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
590 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
591 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
592 if (is_missing_object(soid
)) {
593 recover_missing(soid
, v
, cct
->_conf
->osd_client_op_priority
, h
);
594 } else if (missing_loc
.is_deleted(soid
)) {
595 prep_object_replica_deletes(soid
, v
, h
);
597 prep_object_replica_pushes(soid
, v
, h
);
599 pgbackend
->run_recovery_op(h
, cct
->_conf
->osd_client_op_priority
);
603 void PrimaryLogPG::wait_for_unreadable_object(
604 const hobject_t
& soid
, OpRequestRef op
)
606 assert(is_unreadable_object(soid
));
607 maybe_kick_recovery(soid
);
608 waiting_for_unreadable_object
[soid
].push_back(op
);
609 op
->mark_delayed("waiting for missing object");
612 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
614 /* The conditions below may clear (on_local_recover, before we queue
615 * the transaction) before we actually requeue the degraded waiters
616 * in on_global_recover after the transaction completes.
618 if (waiting_for_degraded_object
.count(soid
))
620 if (pg_log
.get_missing().get_items().count(soid
))
622 assert(!actingbackfill
.empty());
623 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
624 i
!= actingbackfill
.end();
626 if (*i
== get_primary()) continue;
627 pg_shard_t peer
= *i
;
628 auto peer_missing_entry
= peer_missing
.find(peer
);
629 if (peer_missing_entry
!= peer_missing
.end() &&
630 peer_missing_entry
->second
.get_items().count(soid
))
633 // Object is degraded if after last_backfill AND
634 // we are backfilling it
635 if (is_backfill_targets(peer
) &&
636 peer_info
[peer
].last_backfill
<= soid
&&
637 last_backfill_started
>= soid
&&
638 backfills_in_flight
.count(soid
))
644 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
646 assert(is_degraded_or_backfilling_object(soid
));
648 maybe_kick_recovery(soid
);
649 waiting_for_degraded_object
[soid
].push_back(op
);
650 op
->mark_delayed("waiting for degraded object");
653 void PrimaryLogPG::block_write_on_full_cache(
654 const hobject_t
& _oid
, OpRequestRef op
)
656 const hobject_t oid
= _oid
.get_head();
657 dout(20) << __func__
<< ": blocking object " << oid
658 << " on full cache" << dendl
;
659 objects_blocked_on_cache_full
.insert(oid
);
660 waiting_for_cache_not_full
.push_back(op
);
661 op
->mark_delayed("waiting for cache not full");
664 void PrimaryLogPG::block_for_clean(
665 const hobject_t
& oid
, OpRequestRef op
)
667 dout(20) << __func__
<< ": blocking object " << oid
668 << " on primary repair" << dendl
;
669 waiting_for_clean_to_primary_repair
.push_back(op
);
670 op
->mark_delayed("waiting for clean to repair");
673 void PrimaryLogPG::block_write_on_snap_rollback(
674 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
676 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
677 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
678 // otherwise, we'd have blocked in do_op
679 assert(oid
.is_head());
680 assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
681 objects_blocked_on_snap_promotion
[oid
] = obc
;
682 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
685 void PrimaryLogPG::block_write_on_degraded_snap(
686 const hobject_t
& snap
, OpRequestRef op
)
688 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
689 << " on degraded snap " << snap
<< dendl
;
690 // otherwise, we'd have blocked in do_op
691 assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
692 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
693 wait_for_degraded_object(snap
, op
);
696 bool PrimaryLogPG::maybe_await_blocked_snapset(
697 const hobject_t
&hoid
,
700 ObjectContextRef obc
;
701 obc
= object_contexts
.lookup(hoid
.get_head());
703 if (obc
->is_blocked()) {
704 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
710 obc
= object_contexts
.lookup(hoid
.get_snapdir());
712 if (obc
->is_blocked()) {
713 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
722 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
724 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
725 waiting_for_blocked_object
[soid
].push_back(op
);
726 op
->mark_delayed("waiting for blocked object");
729 void PrimaryLogPG::maybe_force_recovery()
731 // no force if not in degraded/recovery/backfill states
732 if (!is_degraded() &&
733 !state_test(PG_STATE_RECOVERING
|
734 PG_STATE_RECOVERY_WAIT
|
735 PG_STATE_BACKFILLING
|
736 PG_STATE_BACKFILL_WAIT
|
737 PG_STATE_BACKFILL_TOOFULL
))
740 if (pg_log
.get_log().approx_size() <
741 cct
->_conf
->osd_max_pg_log_entries
*
742 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
745 // find the oldest missing object
746 version_t min_version
= 0;
748 if (!pg_log
.get_missing().get_items().empty()) {
749 min_version
= pg_log
.get_missing().get_rmissing().begin()->first
;
750 soid
= pg_log
.get_missing().get_rmissing().begin()->second
;
752 assert(!actingbackfill
.empty());
753 for (set
<pg_shard_t
>::iterator it
= actingbackfill
.begin();
754 it
!= actingbackfill
.end();
756 if (*it
== get_primary()) continue;
757 pg_shard_t peer
= *it
;
758 if (peer_missing
.count(peer
) &&
759 !peer_missing
[peer
].get_items().empty() &&
760 min_version
> peer_missing
[peer
].get_rmissing().begin()->first
) {
761 min_version
= peer_missing
[peer
].get_rmissing().begin()->first
;
762 soid
= peer_missing
[peer
].get_rmissing().begin()->second
;
767 if (soid
!= hobject_t())
768 maybe_kick_recovery(soid
);
771 class PGLSPlainFilter
: public PGLSFilter
{
774 int init(bufferlist::iterator
¶ms
) override
777 ::decode(xattr
, params
);
778 ::decode(val
, params
);
779 } catch (buffer::error
&e
) {
785 ~PGLSPlainFilter() override
{}
786 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
787 bufferlist
& outdata
) override
;
790 class PGLSParentFilter
: public PGLSFilter
{
791 inodeno_t parent_ino
;
794 PGLSParentFilter(CephContext
* cct
) : cct(cct
) {
797 int init(bufferlist::iterator
¶ms
) override
800 ::decode(parent_ino
, params
);
801 } catch (buffer::error
&e
) {
804 generic_dout(0) << "parent_ino=" << parent_ino
<< dendl
;
808 ~PGLSParentFilter() override
{}
809 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
810 bufferlist
& outdata
) override
;
813 bool PGLSParentFilter::filter(const hobject_t
&obj
,
814 bufferlist
& xattr_data
, bufferlist
& outdata
)
816 bufferlist::iterator iter
= xattr_data
.begin();
817 inode_backtrace_t bt
;
819 generic_dout(0) << "PGLSParentFilter::filter" << dendl
;
823 vector
<inode_backpointer_t
>::iterator vi
;
824 for (vi
= bt
.ancestors
.begin(); vi
!= bt
.ancestors
.end(); ++vi
) {
825 generic_dout(0) << "vi->dirino=" << vi
->dirino
<< " parent_ino=" << parent_ino
<< dendl
;
826 if (vi
->dirino
== parent_ino
) {
827 ::encode(*vi
, outdata
);
835 bool PGLSPlainFilter::filter(const hobject_t
&obj
,
836 bufferlist
& xattr_data
, bufferlist
& outdata
)
838 if (val
.size() != xattr_data
.length())
841 if (memcmp(val
.c_str(), xattr_data
.c_str(), val
.size()))
847 bool PrimaryLogPG::pgls_filter(PGLSFilter
*filter
, hobject_t
& sobj
, bufferlist
& outdata
)
851 // If filter has expressed an interest in an xattr, load it.
852 if (!filter
->get_xattr().empty()) {
853 int ret
= pgbackend
->objects_get_attr(
857 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
->get_xattr() << ") returned " << ret
<< dendl
;
859 if (ret
!= -ENODATA
|| filter
->reject_empty_xattr()) {
865 return filter
->filter(sobj
, bl
, outdata
);
868 int PrimaryLogPG::get_pgls_filter(bufferlist::iterator
& iter
, PGLSFilter
**pfilter
)
874 ::decode(type
, iter
);
876 catch (buffer::error
& e
) {
880 if (type
.compare("parent") == 0) {
881 filter
= new PGLSParentFilter(cct
);
882 } else if (type
.compare("plain") == 0) {
883 filter
= new PGLSPlainFilter();
885 std::size_t dot
= type
.find(".");
886 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
890 const std::string class_name
= type
.substr(0, dot
);
891 const std::string filter_name
= type
.substr(dot
+ 1);
892 ClassHandler::ClassData
*cls
= NULL
;
893 int r
= osd
->class_handler
->open_class(class_name
, &cls
);
895 derr
<< "Error opening class '" << class_name
<< "': "
896 << cpp_strerror(r
) << dendl
;
897 if (r
!= -EPERM
) // propogate permission error
904 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
905 if (class_filter
== NULL
) {
906 derr
<< "Error finding filter '" << filter_name
<< "' in class "
907 << class_name
<< dendl
;
910 filter
= class_filter
->fn();
912 // Object classes are obliged to return us something, but let's
913 // give an error rather than asserting out.
914 derr
<< "Buggy class " << class_name
<< " failed to construct "
915 "filter " << filter_name
<< dendl
;
921 int r
= filter
->init(iter
);
923 derr
<< "Error initializing filter " << type
<< ": "
924 << cpp_strerror(r
) << dendl
;
928 // Successfully constructed and initialized, return it.
935 // ==========================================================
937 int PrimaryLogPG::do_command(
945 const auto &missing
= pg_log
.get_missing();
949 cmd_getval(cct
, cmdmap
, "format", format
);
950 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json"));
953 cmd_getval(cct
, cmdmap
, "cmd", command
);
954 if (command
== "query") {
955 f
->open_object_section("pg");
956 f
->dump_string("state", pg_state_string(get_state()));
957 f
->dump_stream("snap_trimq") << snap_trimq
;
958 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
959 f
->dump_unsigned("epoch", get_osdmap()->get_epoch());
960 f
->open_array_section("up");
961 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
962 f
->dump_unsigned("osd", *p
);
964 f
->open_array_section("acting");
965 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
966 f
->dump_unsigned("osd", *p
);
968 if (!backfill_targets
.empty()) {
969 f
->open_array_section("backfill_targets");
970 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
971 p
!= backfill_targets
.end();
973 f
->dump_stream("shard") << *p
;
976 if (!actingbackfill
.empty()) {
977 f
->open_array_section("actingbackfill");
978 for (set
<pg_shard_t
>::iterator p
= actingbackfill
.begin();
979 p
!= actingbackfill
.end();
981 f
->dump_stream("shard") << *p
;
984 f
->open_object_section("info");
985 _update_calc_stats();
989 f
->open_array_section("peer_info");
990 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
991 p
!= peer_info
.end();
993 f
->open_object_section("info");
994 f
->dump_stream("peer") << p
->first
;
995 p
->second
.dump(f
.get());
1000 f
->open_array_section("recovery_state");
1001 handle_query_state(f
.get());
1004 f
->open_object_section("agent_state");
1006 agent_state
->dump(f
.get());
1013 else if (command
== "mark_unfound_lost") {
1015 cmd_getval(cct
, cmdmap
, "mulcmd", mulcmd
);
1017 if (mulcmd
== "revert") {
1018 if (pool
.info
.ec_pool()) {
1019 ss
<< "mode must be 'delete' for ec pool";
1022 mode
= pg_log_entry_t::LOST_REVERT
;
1023 } else if (mulcmd
== "delete") {
1024 mode
= pg_log_entry_t::LOST_DELETE
;
1026 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1029 assert(mode
== pg_log_entry_t::LOST_REVERT
||
1030 mode
== pg_log_entry_t::LOST_DELETE
);
1032 if (!is_primary()) {
1033 ss
<< "not primary";
1037 uint64_t unfound
= missing_loc
.num_unfound();
1039 ss
<< "pg has no unfound objects";
1040 return 0; // make command idempotent
1043 if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1044 ss
<< "pg has " << unfound
1045 << " unfound objects but we haven't probed all sources, not marking lost";
1049 mark_all_unfound_lost(mode
, con
, tid
);
1052 else if (command
== "list_missing") {
1055 if (cmd_getval(cct
, cmdmap
, "offset", offset_json
)) {
1056 json_spirit::Value v
;
1058 if (!json_spirit::read(offset_json
, v
))
1059 throw std::runtime_error("bad json");
1061 } catch (std::runtime_error
& e
) {
1062 ss
<< "error parsing offset: " << e
.what();
1066 f
->open_object_section("missing");
1068 f
->open_object_section("offset");
1069 offset
.dump(f
.get());
1072 f
->dump_int("num_missing", missing
.num_missing());
1073 f
->dump_int("num_unfound", get_num_unfound());
1074 const map
<hobject_t
, pg_missing_item
> &needs_recovery_map
=
1075 missing_loc
.get_needs_recovery();
1076 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1077 needs_recovery_map
.upper_bound(offset
);
1079 f
->open_array_section("objects");
1081 for (; p
!= needs_recovery_map
.end() && num
< cct
->_conf
->osd_command_max_records
; ++p
) {
1082 if (missing_loc
.is_unfound(p
->first
)) {
1083 f
->open_object_section("object");
1085 f
->open_object_section("oid");
1086 p
->first
.dump(f
.get());
1089 p
->second
.dump(f
.get()); // have, need keys
1091 f
->open_array_section("locations");
1092 for (set
<pg_shard_t
>::iterator r
=
1093 missing_loc
.get_locations(p
->first
).begin();
1094 r
!= missing_loc
.get_locations(p
->first
).end();
1096 f
->dump_stream("shard") << *r
;
1105 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1111 ss
<< "unknown pg command " << prefix
;
1115 // ==========================================================
1117 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1119 // NOTE: this is non-const because we modify the OSDOp.outdata in
1121 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1122 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1123 dout(10) << "do_pg_op " << *m
<< dendl
;
1128 string cname
, mname
;
1129 PGLSFilter
*filter
= NULL
;
1130 bufferlist filter_out
;
1132 snapid_t snapid
= m
->get_snapid();
1134 vector
<OSDOp
> ops
= m
->ops
;
1136 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1138 bufferlist::iterator bp
= p
->indata
.begin();
1140 case CEPH_OSD_OP_PGNLS_FILTER
:
1142 ::decode(cname
, bp
);
1143 ::decode(mname
, bp
);
1145 catch (const buffer::error
& e
) {
1146 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1154 result
= get_pgls_filter(bp
, &filter
);
1162 case CEPH_OSD_OP_PGNLS
:
1163 if (snapid
!= CEPH_NOSNAP
) {
1167 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1168 dout(10) << " pgnls pg=" << m
->get_pg()
1169 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1170 << " != " << info
.pgid
<< dendl
;
1173 unsigned list_size
= MIN(cct
->_conf
->osd_max_pgls
, p
->op
.pgls
.count
);
1175 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1176 // read into a buffer
1177 vector
<hobject_t
> sentries
;
1178 pg_nls_response_t response
;
1180 ::decode(response
.handle
, bp
);
1182 catch (const buffer::error
& e
) {
1183 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1189 hobject_t lower_bound
= response
.handle
;
1190 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1191 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1192 dout(10) << " pgnls lower_bound " << lower_bound
1193 << " pg_end " << pg_end
<< dendl
;
1194 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1195 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1196 // this should only happen with a buggy client.
1197 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1203 hobject_t current
= lower_bound
;
1205 int r
= pgbackend
->objects_list_partial(
1216 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1217 pg_log
.get_missing().get_items().lower_bound(current
);
1218 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1219 hobject_t _max
= hobject_t::get_max();
1221 const hobject_t
&mcand
=
1222 missing_iter
== pg_log
.get_missing().get_items().end() ?
1224 missing_iter
->first
;
1225 const hobject_t
&lcand
=
1226 ls_iter
== sentries
.end() ?
1230 hobject_t candidate
;
1231 if (mcand
== lcand
) {
1233 if (!mcand
.is_max()) {
1237 } else if (mcand
< lcand
) {
1239 assert(!mcand
.is_max());
1243 assert(!lcand
.is_max());
1247 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1248 << " vs lower bound 0x" << lower_bound
.get_hash() << dendl
;
1250 if (candidate
>= next
) {
1254 if (response
.entries
.size() == list_size
) {
1259 // skip snapdir objects
1260 if (candidate
.snap
== CEPH_SNAPDIR
)
1263 if (candidate
.snap
!= CEPH_NOSNAP
)
1266 // skip internal namespace
1267 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1270 if (missing_loc
.is_deleted(candidate
))
1273 // skip wrong namespace
1274 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1275 candidate
.get_namespace() != m
->get_hobj().nspace
)
1278 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1281 dout(20) << "pgnls item 0x" << std::hex
1282 << candidate
.get_hash()
1283 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1285 << candidate
.oid
.name
<< dendl
;
1287 librados::ListObjectImpl item
;
1288 item
.nspace
= candidate
.get_namespace();
1289 item
.oid
= candidate
.oid
.name
;
1290 item
.locator
= candidate
.get_key();
1291 response
.entries
.push_back(item
);
1294 if (next
.is_max() &&
1295 missing_iter
== pg_log
.get_missing().get_items().end() &&
1296 ls_iter
== sentries
.end()) {
1299 // Set response.handle to the start of the next PG according
1300 // to the object sort order.
1301 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1303 response
.handle
= next
;
1305 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1306 ::encode(response
, osd_op
.outdata
);
1308 ::encode(filter_out
, osd_op
.outdata
);
1309 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1310 << osd_op
.outdata
.length() << dendl
;
1314 case CEPH_OSD_OP_PGLS_FILTER
:
1316 ::decode(cname
, bp
);
1317 ::decode(mname
, bp
);
1319 catch (const buffer::error
& e
) {
1320 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1328 result
= get_pgls_filter(bp
, &filter
);
1336 case CEPH_OSD_OP_PGLS
:
1337 if (snapid
!= CEPH_NOSNAP
) {
1341 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1342 dout(10) << " pgls pg=" << m
->get_pg()
1343 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1344 << " != " << info
.pgid
<< dendl
;
1347 unsigned list_size
= MIN(cct
->_conf
->osd_max_pgls
, p
->op
.pgls
.count
);
1349 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1350 // read into a buffer
1351 vector
<hobject_t
> sentries
;
1352 pg_ls_response_t response
;
1354 ::decode(response
.handle
, bp
);
1356 catch (const buffer::error
& e
) {
1357 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1363 hobject_t current
= response
.handle
;
1365 int r
= pgbackend
->objects_list_partial(
1376 assert(snapid
== CEPH_NOSNAP
|| pg_log
.get_missing().get_items().empty());
1378 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1379 pg_log
.get_missing().get_items().lower_bound(current
);
1380 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1381 hobject_t _max
= hobject_t::get_max();
1383 const hobject_t
&mcand
=
1384 missing_iter
== pg_log
.get_missing().get_items().end() ?
1386 missing_iter
->first
;
1387 const hobject_t
&lcand
=
1388 ls_iter
== sentries
.end() ?
1392 hobject_t candidate
;
1393 if (mcand
== lcand
) {
1395 if (!mcand
.is_max()) {
1399 } else if (mcand
< lcand
) {
1401 assert(!mcand
.is_max());
1405 assert(!lcand
.is_max());
1409 if (candidate
>= next
) {
1413 if (response
.entries
.size() == list_size
) {
1418 // skip snapdir objects
1419 if (candidate
.snap
== CEPH_SNAPDIR
)
1422 if (candidate
.snap
!= CEPH_NOSNAP
)
1425 // skip wrong namespace
1426 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1429 if (missing_loc
.is_deleted(candidate
))
1432 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1435 response
.entries
.push_back(make_pair(candidate
.oid
,
1436 candidate
.get_key()));
1438 if (next
.is_max() &&
1439 missing_iter
== pg_log
.get_missing().get_items().end() &&
1440 ls_iter
== sentries
.end()) {
1443 response
.handle
= next
;
1444 ::encode(response
, osd_op
.outdata
);
1446 ::encode(filter_out
, osd_op
.outdata
);
1447 dout(10) << " pgls result=" << result
<< " outdata.length()="
1448 << osd_op
.outdata
.length() << dendl
;
1452 case CEPH_OSD_OP_PG_HITSET_LS
:
1454 list
< pair
<utime_t
,utime_t
> > ls
;
1455 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1456 p
!= info
.hit_set
.history
.end();
1458 ls
.push_back(make_pair(p
->begin
, p
->end
));
1460 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1461 ::encode(ls
, osd_op
.outdata
);
1465 case CEPH_OSD_OP_PG_HITSET_GET
:
1467 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1468 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1469 // read the current in-memory HitSet, not the version we've
1475 ::encode(*hit_set
, osd_op
.outdata
);
1476 result
= osd_op
.outdata
.length();
1478 // read an archived HitSet.
1480 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1481 p
!= info
.hit_set
.history
.end();
1483 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1484 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1488 if (oid
== hobject_t()) {
1492 if (!pool
.info
.is_replicated()) {
1493 // FIXME: EC not supported yet
1494 result
= -EOPNOTSUPP
;
1497 if (is_unreadable_object(oid
)) {
1498 wait_for_unreadable_object(oid
, op
);
1502 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1507 case CEPH_OSD_OP_SCRUBLS
:
1508 result
= do_scrub_ls(m
, &osd_op
);
1521 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(),
1522 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1524 reply
->claim_op_out_data(ops
);
1525 reply
->set_result(result
);
1526 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1527 osd
->send_message_osd_client(reply
, m
->get_connection());
1531 int PrimaryLogPG::do_scrub_ls(MOSDOp
*m
, OSDOp
*osd_op
)
1533 if (m
->get_pg() != info
.pgid
.pgid
) {
1534 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1535 return -EINVAL
; // hmm?
1537 auto bp
= osd_op
->indata
.begin();
1541 } catch (buffer::error
&) {
1542 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1546 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1547 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1549 } else if (!scrubber
.store
) {
1551 } else if (arg
.get_snapsets
) {
1552 result
.vals
= scrubber
.store
->get_snap_errors(osd
->store
,
1557 result
.vals
= scrubber
.store
->get_object_errors(osd
->store
,
1562 ::encode(result
, osd_op
->outdata
);
1566 void PrimaryLogPG::calc_trim_to()
1568 size_t target
= cct
->_conf
->osd_min_pg_log_entries
;
1569 if (is_degraded() ||
1570 state_test(PG_STATE_RECOVERING
|
1571 PG_STATE_RECOVERY_WAIT
|
1572 PG_STATE_BACKFILLING
|
1573 PG_STATE_BACKFILL_WAIT
|
1574 PG_STATE_BACKFILL_TOOFULL
)) {
1575 target
= cct
->_conf
->osd_max_pg_log_entries
;
1578 eversion_t limit
= MIN(
1579 min_last_complete_ondisk
,
1580 pg_log
.get_can_rollback_to());
1581 if (limit
!= eversion_t() &&
1582 limit
!= pg_trim_to
&&
1583 pg_log
.get_log().approx_size() > target
) {
1584 size_t num_to_trim
= MIN(pg_log
.get_log().approx_size() - target
,
1585 cct
->_conf
->osd_pg_log_trim_max
);
1586 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
&&
1587 cct
->_conf
->osd_pg_log_trim_max
>= cct
->_conf
->osd_pg_log_trim_min
) {
1590 list
<pg_log_entry_t
>::const_iterator it
= pg_log
.get_log().log
.begin();
1591 eversion_t new_trim_to
;
1592 for (size_t i
= 0; i
< num_to_trim
; ++i
) {
1593 new_trim_to
= it
->version
;
1595 if (new_trim_to
> limit
) {
1596 new_trim_to
= limit
;
1597 dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl
;
1601 dout(10) << "calc_trim_to " << pg_trim_to
<< " -> " << new_trim_to
<< dendl
;
1602 pg_trim_to
= new_trim_to
;
1603 assert(pg_trim_to
<= pg_log
.get_head());
1604 assert(pg_trim_to
<= min_last_complete_ondisk
);
1608 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1609 const PGPool
&_pool
, spg_t p
) :
1610 PG(o
, curmap
, _pool
, p
),
1612 PGBackend::build_pg_backend(
1613 _pool
.info
, curmap
, this, coll_t(p
), ch
, o
->store
, cct
)),
1614 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1615 snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
1616 new_backfill(false),
1618 snap_trimmer_machine(this)
1620 missing_loc
.set_backend_predicates(
1621 pgbackend
->get_is_readable_predicate(),
1622 pgbackend
->get_is_recoverable_predicate());
1623 snap_trimmer_machine
.initiate();
1626 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1629 if (oloc
.key
.empty())
1630 src_oloc
.key
= oid
.name
;
1633 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1635 const MOSDBackoff
*m
= static_cast<const MOSDBackoff
*>(op
->get_req());
1636 SessionRef session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1639 session
->put(); // get_priv takes a ref, and so does the SessionRef
1640 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1641 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1642 if (begin
< m
->begin
) {
1648 dout(10) << __func__
<< " backoff ack id " << m
->id
1649 << " [" << begin
<< "," << end
<< ")" << dendl
;
1650 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1653 void PrimaryLogPG::do_request(
1655 ThreadPool::TPHandle
&handle
)
1657 if (op
->osd_trace
) {
1658 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1659 op
->pg_trace
.event("do request");
1661 // make sure we have a new enough map
1662 auto p
= waiting_for_map
.find(op
->get_source());
1663 if (p
!= waiting_for_map
.end()) {
1664 // preserve ordering
1665 dout(20) << __func__
<< " waiting_for_map "
1666 << p
->first
<< " not empty, queueing" << dendl
;
1667 p
->second
.push_back(op
);
1668 op
->mark_delayed("waiting_for_map not empty");
1671 if (!have_same_or_newer_map(op
->min_epoch
)) {
1672 dout(20) << __func__
<< " min " << op
->min_epoch
1673 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1674 waiting_for_map
[op
->get_source()].push_back(op
);
1675 op
->mark_delayed("op must wait for map");
1676 osd
->request_osdmap_update(op
->min_epoch
);
1680 if (can_discard_request(op
)) {
1685 const Message
*m
= op
->get_req();
1686 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1687 SessionRef session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1690 session
->put(); // get_priv takes a ref, and so does the SessionRef
1692 if (op
->get_req()->get_type() == CEPH_MSG_OSD_OP
) {
1693 if (session
->check_backoff(cct
, info
.pgid
,
1694 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1701 (!is_active() && is_peered());
1702 if (g_conf
->osd_backoff_on_peering
&& !backoff
) {
1708 add_pg_backoff(session
);
1712 // pg backoff acks at pg-level
1713 if (op
->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF
) {
1714 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1715 if (ba
->begin
!= ba
->end
) {
1723 // Delay unless PGBackend says it's ok
1724 if (pgbackend
->can_handle_while_inactive(op
)) {
1725 bool handled
= pgbackend
->handle_message(op
);
1729 waiting_for_peered
.push_back(op
);
1730 op
->mark_delayed("waiting for peered");
1735 if (flushes_in_progress
> 0) {
1736 dout(20) << flushes_in_progress
1737 << " flushes_in_progress pending "
1738 << "waiting for flush on " << op
<< dendl
;
1739 waiting_for_flush
.push_back(op
);
1740 op
->mark_delayed("waiting for flush");
1744 assert(is_peered() && flushes_in_progress
== 0);
1745 if (pgbackend
->handle_message(op
))
1748 switch (op
->get_req()->get_type()) {
1749 case CEPH_MSG_OSD_OP
:
1750 case CEPH_MSG_OSD_BACKOFF
:
1752 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1753 waiting_for_active
.push_back(op
);
1754 op
->mark_delayed("waiting for active");
1757 switch (op
->get_req()->get_type()) {
1758 case CEPH_MSG_OSD_OP
:
1759 // verify client features
1760 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1761 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1762 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1767 case CEPH_MSG_OSD_BACKOFF
:
1768 // object-level backoff acks handled in osdop context
1778 case MSG_OSD_SUBOPREPLY
:
1779 do_sub_op_reply(op
);
1782 case MSG_OSD_PG_SCAN
:
1783 do_scan(op
, handle
);
1786 case MSG_OSD_PG_BACKFILL
:
1790 case MSG_OSD_PG_BACKFILL_REMOVE
:
1791 do_backfill_remove(op
);
1794 case MSG_OSD_SCRUB_RESERVE
:
1796 const MOSDScrubReserve
*m
=
1797 static_cast<const MOSDScrubReserve
*>(op
->get_req());
1799 case MOSDScrubReserve::REQUEST
:
1800 handle_scrub_reserve_request(op
);
1802 case MOSDScrubReserve::GRANT
:
1803 handle_scrub_reserve_grant(op
, m
->from
);
1805 case MOSDScrubReserve::REJECT
:
1806 handle_scrub_reserve_reject(op
, m
->from
);
1808 case MOSDScrubReserve::RELEASE
:
1809 handle_scrub_reserve_release(op
);
1815 case MSG_OSD_REP_SCRUB
:
1816 replica_scrub(op
, handle
);
1819 case MSG_OSD_REP_SCRUBMAP
:
1820 do_replica_scrub_map(op
);
1823 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1824 do_update_log_missing(op
);
1827 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1828 do_update_log_missing_reply(op
);
1832 assert(0 == "bad message type in do_request");
1836 hobject_t
PrimaryLogPG::earliest_backfill() const
1838 hobject_t e
= hobject_t::get_max();
1839 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
1840 i
!= backfill_targets
.end();
1843 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(bt
);
1844 assert(iter
!= peer_info
.end());
1845 if (iter
->second
.last_backfill
< e
)
1846 e
= iter
->second
.last_backfill
;
1851 /** do_op - do an op
1852 * pg lock will be held (if multithreaded)
1853 * osd_lock NOT held.
1855 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1858 // NOTE: take a non-const pointer here; we must be careful not to
1859 // change anything that will break other reads on m (operator<<).
1860 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1861 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1862 if (m
->finish_decode()) {
1863 op
->reset_desc(); // for TrackedOp
1867 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1869 hobject_t head
= m
->get_hobj();
1870 head
.snap
= CEPH_NOSNAP
;
1872 if (!info
.pgid
.pgid
.contains(
1873 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1874 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1875 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1876 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1877 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1879 assert(!cct
->_conf
->osd_debug_misdirected_ops
);
1884 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
1887 session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1888 if (!session
.get()) {
1889 dout(10) << __func__
<< " no session" << dendl
;
1892 session
->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1894 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
1899 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
1901 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
1902 osd
->reply_op_error(op
, -EINVAL
);
1906 if (op
->rmw_flags
== 0) {
1907 int r
= osd
->osd
->init_op_flags(op
);
1909 osd
->reply_op_error(op
, r
);
1914 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
1915 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
1917 !(op
->may_write() || op
->may_cache())) {
1918 // balanced reads; any replica will do
1919 if (!(is_primary() || is_replica())) {
1920 osd
->handle_misdirected_op(this, op
);
1924 // normal case; must be primary
1925 if (!is_primary()) {
1926 osd
->handle_misdirected_op(this, op
);
1931 if (!op_has_sufficient_caps(op
)) {
1932 osd
->reply_op_error(op
, -EPERM
);
1936 if (op
->includes_pg_op()) {
1937 return do_pg_op(op
);
1940 // object name too long?
1941 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
1942 dout(4) << "do_op name is longer than "
1943 << cct
->_conf
->osd_max_object_name_len
1944 << " bytes" << dendl
;
1945 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1948 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
1949 dout(4) << "do_op locator is longer than "
1950 << cct
->_conf
->osd_max_object_name_len
1951 << " bytes" << dendl
;
1952 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1955 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
1956 dout(4) << "do_op namespace is longer than "
1957 << cct
->_conf
->osd_max_object_namespace_len
1958 << " bytes" << dendl
;
1959 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1963 if (int r
= osd
->store
->validate_hobject_key(head
)) {
1964 dout(4) << "do_op object " << head
<< " invalid for backing store: "
1966 osd
->reply_op_error(op
, r
);
1971 if (get_osdmap()->is_blacklisted(m
->get_source_addr())) {
1972 dout(10) << "do_op " << m
->get_source_addr() << " is blacklisted" << dendl
;
1973 osd
->reply_op_error(op
, -EBLACKLISTED
);
1977 // order this op as a write?
1978 bool write_ordered
= op
->rwordered();
1980 // discard due to cluster full transition? (we discard any op that
1981 // originates before the cluster or pool is marked full; the client
1982 // will resend after the full flag is removed or if they expect the
1983 // op to succeed despite being full). The except is FULL_FORCE and
1984 // FULL_TRY ops, which there is no reason to discard because they
1985 // bypass all full checks anyway. If this op isn't write or
1986 // read-ordered, we skip.
1987 // FIXME: we exclude mds writes for now.
1988 if (write_ordered
&& !(m
->get_source().is_mds() ||
1989 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
1990 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
1991 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
1992 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
1996 // mds should have stopped writing before this point.
1997 // We can't allow OSD to become non-startable even if mds
1998 // could be writing as part of file removals.
2000 if (write_ordered
&& osd
->check_failsafe_full(ss
)) {
2001 dout(10) << __func__
<< " fail-safe full check failed, dropping request"
2006 int64_t poolid
= get_pgid().pool();
2007 if (op
->may_write()) {
2009 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
2015 if (m
->get_snapid() != CEPH_NOSNAP
) {
2016 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
2017 osd
->reply_op_error(op
, -EINVAL
);
2022 if (cct
->_conf
->osd_max_write_size
&&
2023 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
2024 // journal can't hold commit!
2025 derr
<< "do_op msg data len " << m
->get_data_len()
2026 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
2027 << " on " << *m
<< dendl
;
2028 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
2033 dout(10) << "do_op " << *m
2034 << (op
->may_write() ? " may_write" : "")
2035 << (op
->may_read() ? " may_read" : "")
2036 << (op
->may_cache() ? " may_cache" : "")
2037 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2038 << " flags " << ceph_osd_flag_string(m
->get_flags())
2042 if (is_unreadable_object(head
)) {
2043 if (!is_primary()) {
2044 osd
->reply_op_error(op
, -EAGAIN
);
2048 (g_conf
->osd_backoff_on_degraded
||
2049 (g_conf
->osd_backoff_on_unfound
&& missing_loc
.is_unfound(head
)))) {
2050 add_backoff(session
, head
, head
);
2051 maybe_kick_recovery(head
);
2053 wait_for_unreadable_object(head
, op
);
2059 if (write_ordered
&& is_degraded_or_backfilling_object(head
)) {
2060 if (can_backoff
&& g_conf
->osd_backoff_on_degraded
) {
2061 add_backoff(session
, head
, head
);
2062 maybe_kick_recovery(head
);
2064 wait_for_degraded_object(head
, op
);
2069 if (write_ordered
&& scrubber
.is_chunky_scrub_active() &&
2070 write_blocked_by_scrub(head
)) {
2071 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2072 waiting_for_scrub
.push_back(op
);
2073 op
->mark_delayed("waiting for scrub");
2078 map
<hobject_t
, snapid_t
>::iterator blocked_iter
=
2079 objects_blocked_on_degraded_snap
.find(head
);
2080 if (write_ordered
&& blocked_iter
!= objects_blocked_on_degraded_snap
.end()) {
2081 hobject_t
to_wait_on(head
);
2082 to_wait_on
.snap
= blocked_iter
->second
;
2083 wait_for_degraded_object(to_wait_on
, op
);
2086 map
<hobject_t
, ObjectContextRef
>::iterator blocked_snap_promote_iter
=
2087 objects_blocked_on_snap_promotion
.find(head
);
2088 if (write_ordered
&&
2089 blocked_snap_promote_iter
!= objects_blocked_on_snap_promotion
.end()) {
2090 wait_for_blocked_object(
2091 blocked_snap_promote_iter
->second
->obs
.oi
.soid
,
2095 if (write_ordered
&& objects_blocked_on_cache_full
.count(head
)) {
2096 block_write_on_full_cache(head
, op
);
2101 hobject_t snapdir
= head
.get_snapdir();
2103 if (is_unreadable_object(snapdir
)) {
2104 wait_for_unreadable_object(snapdir
, op
);
2109 if (write_ordered
&& is_degraded_or_backfilling_object(snapdir
)) {
2110 wait_for_degraded_object(snapdir
, op
);
2115 if (op
->may_write() || op
->may_cache()) {
2116 // warning: we will get back *a* request for this reqid, but not
2117 // necessarily the most recent. this happens with flush and
2118 // promote ops, but we can't possible have both in our log where
2119 // the original request is still not stable on disk, so for our
2120 // purposes here it doesn't matter which one we get.
2122 version_t user_version
;
2123 int return_code
= 0;
2124 bool got
= check_in_progress_op(
2125 m
->get_reqid(), &version
, &user_version
, &return_code
);
2127 dout(3) << __func__
<< " dup " << m
->get_reqid()
2128 << " version " << version
<< dendl
;
2129 if (already_complete(version
)) {
2130 osd
->reply_op_error(op
, return_code
, version
, user_version
);
2132 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2133 // always queue ondisk waiters, so that we can requeue if needed
2134 waiting_for_ondisk
[version
].push_back(make_pair(op
, user_version
));
2135 op
->mark_delayed("waiting for ondisk");
2141 ObjectContextRef obc
;
2142 bool can_create
= op
->may_write() || op
->may_cache();
2143 hobject_t missing_oid
;
2144 const hobject_t
& oid
= m
->get_hobj();
2146 // io blocked on obc?
2147 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2148 maybe_await_blocked_snapset(oid
, op
)) {
2152 int r
= find_object_context(
2153 oid
, &obc
, can_create
,
2154 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2158 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2159 // we have to wait for the object.
2161 // missing the specific snap we need; requeue and wait.
2162 assert(!op
->may_write()); // only happens on a read/cache
2163 wait_for_unreadable_object(missing_oid
, op
);
2166 } else if (r
== 0) {
2167 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2168 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2169 << " is unreadable, waiting" << dendl
;
2170 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2174 // degraded object? (the check above was for head; this could be a clone)
2175 if (write_ordered
&&
2176 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2177 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2178 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2179 << " is degraded, waiting" << dendl
;
2180 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2185 bool in_hit_set
= false;
2188 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2191 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2194 if (!op
->hitset_inserted
) {
2195 hit_set
->insert(oid
);
2196 op
->hitset_inserted
= true;
2197 if (hit_set
->is_full() ||
2198 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2205 if (agent_choose_mode(false, op
))
2209 if (obc
.get() && obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2210 if (maybe_handle_manifest(op
,
2216 if (maybe_handle_cache(op
,
2225 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2226 // copy the reqids for copy get on ENOENT
2228 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2229 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2232 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2233 if (op
->may_write() &&
2234 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2235 record_write_error(op
, oid
, nullptr, r
);
2237 osd
->reply_op_error(op
, r
);
2242 // make sure locator is consistent
2243 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2244 if (m
->get_object_locator() != oloc
) {
2245 dout(10) << " provided locator " << m
->get_object_locator()
2246 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2247 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2248 << " on object " << oloc
2252 // io blocked on obc?
2253 if (obc
->is_blocked() &&
2254 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2255 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2259 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2261 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2264 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2265 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
&&
2266 m
->get_snapid() != CEPH_SNAPDIR
) {
2267 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2268 osd
->reply_op_error(op
, -EINVAL
);
2273 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2275 if (!obc
->obs
.exists
)
2276 ctx
->snapset_obc
= get_object_context(obc
->obs
.oi
.soid
.get_snapdir(), false);
2278 /* Due to obc caching, we might have a cached non-existent snapset_obc
2279 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2280 * do_op pipeline make decisions based on whether snapset_obc is
2283 if (ctx
->snapset_obc
&& !ctx
->snapset_obc
->obs
.exists
)
2284 ctx
->snapset_obc
= ObjectContextRef();
2286 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2287 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2288 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2289 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2291 // verify there is in fact a flush in progress
2292 // FIXME: we could make this a stronger test.
2293 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2294 if (p
== flush_ops
.end()) {
2295 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2296 reply_ctx(ctx
, -EINVAL
);
2299 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2300 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2301 op
->mark_delayed("waiting for rw locks");
2305 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2308 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2310 if (op
->may_write() &&
2311 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2312 record_write_error(op
, oid
, nullptr, r
);
2314 osd
->reply_op_error(op
, r
);
2319 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2320 ctx
->ignore_cache
= true;
2323 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2324 // This object is lost. Reading from it returns an error.
2325 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2326 << " is lost" << dendl
;
2327 reply_ctx(ctx
, -ENFILE
);
2330 if (!op
->may_write() &&
2332 (!obc
->obs
.exists
||
2333 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2334 obc
->obs
.oi
.is_whiteout()))) {
2335 // copy the reqids for copy get on ENOENT
2336 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2337 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2341 reply_ctx(ctx
, -ENOENT
);
2348 utime_t prepare_latency
= ceph_clock_now();
2349 prepare_latency
-= op
->get_dequeued_time();
2350 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2351 if (op
->may_read() && op
->may_write()) {
2352 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2353 } else if (op
->may_read()) {
2354 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2355 } else if (op
->may_write() || op
->may_cache()) {
2356 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2359 // force recovery of the oldest missing object if too many logs
2360 maybe_force_recovery();
2363 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2366 ObjectContextRef obc
)
2368 if (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2369 CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2370 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2371 return cache_result_t::NOOP
;
2375 dout(10) << __func__
<< " " << obc
->obs
.oi
<< " "
2376 << (obc
->obs
.exists
? "exists" : "DNE")
2379 // if it is write-ordered and blocked, stop now
2380 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2381 // we're already doing something with this object
2382 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2383 return cache_result_t::NOOP
;
2386 vector
<OSDOp
> ops
= static_cast<const MOSDOp
*>(op
->get_req())->ops
;
2387 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2389 ceph_osd_op
& op
= osd_op
.op
;
2390 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
) {
2391 return cache_result_t::NOOP
;
2395 switch (obc
->obs
.oi
.manifest
.type
) {
2396 case object_manifest_t::TYPE_REDIRECT
:
2397 if (op
->may_write() || write_ordered
) {
2398 do_proxy_write(op
, obc
->obs
.oi
.soid
, obc
);
2400 do_proxy_read(op
, obc
);
2402 return cache_result_t::HANDLED_PROXY
;
2403 case object_manifest_t::TYPE_CHUNKED
:
2405 assert(0 == "unrecognized manifest type");
2408 return cache_result_t::NOOP
;
2411 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2412 MOSDOpReply
*orig_reply
, int r
)
2414 dout(20) << __func__
<< " r=" << r
<< dendl
;
2415 assert(op
->may_write());
2416 const osd_reqid_t
&reqid
= static_cast<const MOSDOp
*>(op
->get_req())->get_reqid();
2417 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2418 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2419 get_next_version(), eversion_t(), 0,
2420 reqid
, utime_t(), r
));
2425 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2430 MOSDOpReply
*orig_reply
,
2433 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2436 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2437 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2438 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
2439 MOSDOpReply
*reply
= orig_reply
.detach();
2440 if (reply
== nullptr) {
2441 reply
= new MOSDOpReply(m
, r
, pg
->get_osdmap()->get_epoch(),
2444 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2445 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2449 ObcLockManager lock_manager
;
2452 std::move(lock_manager
),
2453 boost::optional
<std::function
<void(void)> >(
2454 OnComplete(this, op
, orig_reply
, r
)),
2459 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2462 ObjectContextRef obc
,
2463 int r
, hobject_t missing_oid
,
2466 ObjectContextRef
*promote_obc
)
2468 // return quickly if caching is not enabled
2469 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2470 return cache_result_t::NOOP
;
2474 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2475 (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2476 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2477 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2478 return cache_result_t::NOOP
;
2481 must_promote
= must_promote
|| op
->need_promote();
2484 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2485 << (obc
->obs
.exists
? "exists" : "DNE")
2486 << " missing_oid " << missing_oid
2487 << " must_promote " << (int)must_promote
2488 << " in_hit_set " << (int)in_hit_set
2491 dout(25) << __func__
<< " (no obc)"
2492 << " missing_oid " << missing_oid
2493 << " must_promote " << (int)must_promote
2494 << " in_hit_set " << (int)in_hit_set
2497 // if it is write-ordered and blocked, stop now
2498 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2499 // we're already doing something with this object
2500 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2501 return cache_result_t::NOOP
;
2504 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2505 // we know this object is logically absent (e.g., an undefined clone)
2506 return cache_result_t::NOOP
;
2509 if (obc
.get() && obc
->obs
.exists
) {
2510 osd
->logger
->inc(l_osd_op_cache_hit
);
2511 return cache_result_t::NOOP
;
2513 if (!is_primary()) {
2514 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2515 osd
->reply_op_error(op
, -EAGAIN
);
2516 return cache_result_t::REPLIED_WITH_EAGAIN
;
2519 if (missing_oid
== hobject_t() && obc
.get()) {
2520 missing_oid
= obc
->obs
.oi
.soid
;
2523 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2524 const object_locator_t oloc
= m
->get_object_locator();
2526 if (op
->need_skip_handle_cache()) {
2527 return cache_result_t::NOOP
;
2530 // older versions do not proxy the feature bits.
2531 bool can_proxy_write
= get_osdmap()->get_up_osd_features() &
2532 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES
;
2533 OpRequestRef promote_op
;
2535 switch (pool
.info
.cache_mode
) {
2536 case pg_pool_t::CACHEMODE_WRITEBACK
:
2538 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2539 if (!op
->may_write() && !op
->may_cache() &&
2540 !write_ordered
&& !must_promote
) {
2541 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2543 return cache_result_t::HANDLED_PROXY
;
2545 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2546 block_write_on_full_cache(missing_oid
, op
);
2547 return cache_result_t::BLOCKED_FULL
;
2550 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2551 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2552 return cache_result_t::BLOCKED_PROMOTE
;
2555 if (op
->may_write() || op
->may_cache()) {
2556 if (can_proxy_write
) {
2557 do_proxy_write(op
, missing_oid
);
2559 // promote if can't proxy the write
2560 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2561 return cache_result_t::BLOCKED_PROMOTE
;
2565 if (!op
->need_skip_promote() &&
2566 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2567 pool
.info
.min_write_recency_for_promote
,
2570 return cache_result_t::BLOCKED_PROMOTE
;
2572 return cache_result_t::HANDLED_PROXY
;
2576 // Avoid duplicate promotion
2577 if (obc
.get() && obc
->is_blocked()) {
2580 return cache_result_t::BLOCKED_PROMOTE
;
2584 if (!op
->need_skip_promote()) {
2585 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2586 pool
.info
.min_read_recency_for_promote
,
2587 promote_op
, promote_obc
);
2590 return cache_result_t::HANDLED_PROXY
;
2592 assert(0 == "unreachable");
2593 return cache_result_t::NOOP
;
2595 case pg_pool_t::CACHEMODE_FORWARD
:
2596 // FIXME: this mode allows requests to be reordered.
2597 do_cache_redirect(op
);
2598 return cache_result_t::HANDLED_REDIRECT
;
2600 case pg_pool_t::CACHEMODE_READONLY
:
2601 // TODO: clean this case up
2602 if (!obc
.get() && r
== -ENOENT
) {
2603 // we don't have the object and op's a read
2604 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2605 return cache_result_t::BLOCKED_PROMOTE
;
2607 if (!r
) { // it must be a write
2608 do_cache_redirect(op
);
2609 return cache_result_t::HANDLED_REDIRECT
;
2611 // crap, there was a failure of some kind
2612 return cache_result_t::NOOP
;
2614 case pg_pool_t::CACHEMODE_READFORWARD
:
2615 // Do writeback to the cache tier for writes
2616 if (op
->may_write() || write_ordered
|| must_promote
) {
2618 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2619 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2620 block_write_on_full_cache(missing_oid
, op
);
2621 return cache_result_t::BLOCKED_FULL
;
2623 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2624 return cache_result_t::BLOCKED_PROMOTE
;
2627 // If it is a read, we can read, we need to forward it
2628 do_cache_redirect(op
);
2629 return cache_result_t::HANDLED_REDIRECT
;
2631 case pg_pool_t::CACHEMODE_PROXY
:
2632 if (!must_promote
) {
2633 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2634 if (can_proxy_write
) {
2635 do_proxy_write(op
, missing_oid
);
2636 return cache_result_t::HANDLED_PROXY
;
2640 return cache_result_t::HANDLED_PROXY
;
2643 // ugh, we're forced to promote.
2645 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2646 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2647 block_write_on_full_cache(missing_oid
, op
);
2648 return cache_result_t::BLOCKED_FULL
;
2650 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2651 return cache_result_t::BLOCKED_PROMOTE
;
2653 case pg_pool_t::CACHEMODE_READPROXY
:
2654 // Do writeback to the cache tier for writes
2655 if (op
->may_write() || write_ordered
|| must_promote
) {
2657 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2658 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2659 block_write_on_full_cache(missing_oid
, op
);
2660 return cache_result_t::BLOCKED_FULL
;
2662 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2663 return cache_result_t::BLOCKED_PROMOTE
;
2666 // If it is a read, we can read, we need to proxy it
2668 return cache_result_t::HANDLED_PROXY
;
2671 assert(0 == "unrecognized cache_mode");
2673 return cache_result_t::NOOP
;
2676 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2677 const hobject_t
& missing_oid
,
2678 const object_locator_t
& oloc
,
2681 OpRequestRef promote_op
,
2682 ObjectContextRef
*promote_obc
)
2684 dout(20) << __func__
<< " missing_oid " << missing_oid
2685 << " in_hit_set " << in_hit_set
<< dendl
;
2691 // Check if in the current hit set
2701 unsigned count
= (int)in_hit_set
;
2703 // Check if in other hit sets
2704 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
2705 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2706 agent_state
->hit_set_map
.rbegin();
2707 itor
!= agent_state
->hit_set_map
.rend();
2709 if (!itor
->second
->contains(oid
)) {
2713 if (count
>= recency
) {
2718 if (count
>= recency
) {
2721 return false; // not promoting
2726 if (osd
->promote_throttle()) {
2727 dout(10) << __func__
<< " promote throttled" << dendl
;
2730 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
2734 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2736 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2737 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
2738 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
,
2739 get_osdmap()->get_epoch(), flags
, false);
2740 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2741 reply
->set_redirect(redir
);
2742 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2744 m
->get_connection()->send_message(reply
);
2748 struct C_ProxyRead
: public Context
{
2751 epoch_t last_peering_reset
;
2753 PrimaryLogPG::ProxyReadOpRef prdop
;
2755 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2756 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2757 : pg(p
), oid(o
), last_peering_reset(lpr
),
2758 tid(0), prdop(prd
), start(ceph_clock_now())
2760 void finish(int r
) override
{
2761 if (prdop
->canceled
)
2764 if (prdop
->canceled
) {
2768 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2769 pg
->finish_proxy_read(oid
, tid
, r
);
2770 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2776 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
2778 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2779 // stash the result in the request's OSDOp vector
2780 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2781 object_locator_t oloc
;
2783 /* extensible tier */
2784 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2785 switch (obc
->obs
.oi
.manifest
.type
) {
2786 case object_manifest_t::TYPE_REDIRECT
:
2787 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2788 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2790 case object_manifest_t::TYPE_CHUNKED
:
2792 assert(0 == "unrecognized manifest type");
2796 soid
= m
->get_hobj();
2797 oloc
= object_locator_t(m
->get_object_locator());
2798 oloc
.pool
= pool
.info
.tier_of
;
2800 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
2802 // pass through some original flags that make sense.
2803 // - leave out redirection and balancing flags since we are
2804 // already proxying through the primary
2805 // - leave off read/write/exec flags that are derived from the op
2806 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
2807 CEPH_OSD_FLAG_ORDERSNAP
|
2808 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
2809 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
2811 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
2813 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
2815 ObjectOperation obj_op
;
2816 obj_op
.dup(prdop
->ops
);
2818 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
2819 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
2820 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
2821 ceph_osd_op op
= obj_op
.ops
[i
].op
;
2823 case CEPH_OSD_OP_READ
:
2824 case CEPH_OSD_OP_SYNC_READ
:
2825 case CEPH_OSD_OP_SPARSE_READ
:
2826 case CEPH_OSD_OP_CHECKSUM
:
2827 case CEPH_OSD_OP_CMPEXT
:
2828 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
2829 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
2834 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
2836 ceph_tid_t tid
= osd
->objecter
->read(
2837 soid
.oid
, oloc
, obj_op
,
2838 m
->get_snapid(), NULL
,
2839 flags
, new C_OnFinisher(fin
, &osd
->objecter_finisher
),
2840 &prdop
->user_version
,
2841 &prdop
->data_offset
,
2844 prdop
->objecter_tid
= tid
;
2845 proxyread_ops
[tid
] = prdop
;
2846 in_progress_proxy_ops
[soid
].push_back(op
);
2849 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
2851 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2852 << " " << cpp_strerror(r
) << dendl
;
2854 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
2855 if (p
== proxyread_ops
.end()) {
2856 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
2859 ProxyReadOpRef prdop
= p
->second
;
2860 if (tid
!= prdop
->objecter_tid
) {
2861 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
2862 << " tid " << prdop
->objecter_tid
<< dendl
;
2865 if (oid
!= prdop
->soid
) {
2866 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
2867 << " soid " << prdop
->soid
<< dendl
;
2870 proxyread_ops
.erase(tid
);
2872 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
2873 if (q
== in_progress_proxy_ops
.end()) {
2874 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
2877 assert(q
->second
.size());
2878 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
2881 assert(it
!= q
->second
.end());
2882 OpRequestRef op
= *it
;
2883 q
->second
.erase(it
);
2884 if (q
->second
.size() == 0) {
2885 in_progress_proxy_ops
.erase(oid
);
2888 osd
->logger
->inc(l_osd_tier_proxy_read
);
2890 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2891 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
2892 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, false);
2893 ctx
->user_at_version
= prdop
->user_version
;
2894 ctx
->data_off
= prdop
->data_offset
;
2895 ctx
->ignore_log_op_stats
= true;
2896 complete_read_ctx(r
, ctx
);
2899 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
2901 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
2902 if (p
== in_progress_proxy_ops
.end())
2905 list
<OpRequestRef
>& ls
= p
->second
;
2906 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
2908 in_progress_proxy_ops
.erase(p
);
2911 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
2912 vector
<ceph_tid_t
> *tids
)
2914 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
2915 prdop
->canceled
= true;
2917 // cancel objecter op, if we can
2918 if (prdop
->objecter_tid
) {
2919 tids
->push_back(prdop
->objecter_tid
);
2920 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
2921 prdop
->ops
[i
].outdata
.clear();
2923 proxyread_ops
.erase(prdop
->objecter_tid
);
2924 prdop
->objecter_tid
= 0;
2928 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
2930 dout(10) << __func__
<< dendl
;
2932 // cancel proxy reads
2933 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
2934 while (p
!= proxyread_ops
.end()) {
2935 cancel_proxy_read((p
++)->second
, tids
);
2938 // cancel proxy writes
2939 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
2940 while (q
!= proxywrite_ops
.end()) {
2941 cancel_proxy_write((q
++)->second
, tids
);
2945 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
2946 in_progress_proxy_ops
.begin();
2947 while (p
!= in_progress_proxy_ops
.end()) {
2948 list
<OpRequestRef
>& ls
= p
->second
;
2949 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
2950 << " requests" << dendl
;
2952 in_progress_proxy_ops
.erase(p
++);
2955 in_progress_proxy_ops
.clear();
2959 struct C_ProxyWrite_Commit
: public Context
{
2962 epoch_t last_peering_reset
;
2964 PrimaryLogPG::ProxyWriteOpRef pwop
;
2965 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2966 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
2967 : pg(p
), oid(o
), last_peering_reset(lpr
),
2970 void finish(int r
) override
{
2974 if (pwop
->canceled
) {
2978 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2979 pg
->finish_proxy_write(oid
, tid
, r
);
2985 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, const hobject_t
& missing_oid
, ObjectContextRef obc
)
2987 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2988 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2989 object_locator_t oloc
;
2990 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
2992 /* extensible tier */
2993 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2994 switch (obc
->obs
.oi
.manifest
.type
) {
2995 case object_manifest_t::TYPE_REDIRECT
:
2996 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2997 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2999 case object_manifest_t::TYPE_CHUNKED
:
3001 assert(0 == "unrecognized manifest type");
3005 soid
= m
->get_hobj();
3006 oloc
= object_locator_t(m
->get_object_locator());
3007 oloc
.pool
= pool
.info
.tier_of
;
3010 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3011 if (!(op
->may_write() || op
->may_cache())) {
3012 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3014 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3016 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3017 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3018 pwop
->mtime
= m
->get_mtime();
3020 ObjectOperation obj_op
;
3021 obj_op
.dup(pwop
->ops
);
3023 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3024 this, soid
, get_last_peering_reset(), pwop
);
3025 ceph_tid_t tid
= osd
->objecter
->mutate(
3026 soid
.oid
, oloc
, obj_op
, snapc
,
3027 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3028 flags
, new C_OnFinisher(fin
, &osd
->objecter_finisher
),
3029 &pwop
->user_version
, pwop
->reqid
);
3031 pwop
->objecter_tid
= tid
;
3032 proxywrite_ops
[tid
] = pwop
;
3033 in_progress_proxy_ops
[soid
].push_back(op
);
3036 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
3038 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3039 << " " << cpp_strerror(r
) << dendl
;
3041 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
3042 if (p
== proxywrite_ops
.end()) {
3043 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
3046 ProxyWriteOpRef pwop
= p
->second
;
3047 assert(tid
== pwop
->objecter_tid
);
3048 assert(oid
== pwop
->soid
);
3050 proxywrite_ops
.erase(tid
);
3052 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3053 if (q
== in_progress_proxy_ops
.end()) {
3054 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3059 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3060 assert(in_progress_op
.size());
3061 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3062 in_progress_op
.end(),
3064 assert(it
!= in_progress_op
.end());
3065 in_progress_op
.erase(it
);
3066 if (in_progress_op
.size() == 0) {
3067 in_progress_proxy_ops
.erase(oid
);
3070 osd
->logger
->inc(l_osd_tier_proxy_write
);
3072 const MOSDOp
*m
= static_cast<const MOSDOp
*>(pwop
->op
->get_req());
3075 if (!pwop
->sent_reply
) {
3077 MOSDOpReply
*reply
= pwop
->ctx
->reply
;
3079 pwop
->ctx
->reply
= NULL
;
3081 reply
= new MOSDOpReply(m
, r
, get_osdmap()->get_epoch(), 0, true);
3082 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3084 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3085 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3086 osd
->send_message_osd_client(reply
, m
->get_connection());
3087 pwop
->sent_reply
= true;
3088 pwop
->ctx
->op
->mark_commit_sent();
3095 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
,
3096 vector
<ceph_tid_t
> *tids
)
3098 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3099 pwop
->canceled
= true;
3101 // cancel objecter op, if we can
3102 if (pwop
->objecter_tid
) {
3103 tids
->push_back(pwop
->objecter_tid
);
3106 proxywrite_ops
.erase(pwop
->objecter_tid
);
3107 pwop
->objecter_tid
= 0;
3111 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3112 ObjectContextRef obc
;
3116 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3119 start(ceph_clock_now()) {}
3121 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
3122 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3123 int r
= results
.get
<0>();
3124 pg
->finish_promote(r
, results_data
, obc
);
3125 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
3129 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
3130 const hobject_t
& missing_oid
,
3131 const object_locator_t
& oloc
,
3133 ObjectContextRef
*promote_obc
)
3135 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
3136 assert(hoid
!= hobject_t());
3137 if (write_blocked_by_scrub(hoid
)) {
3138 dout(10) << __func__
<< " " << hoid
3139 << " blocked by scrub" << dendl
;
3141 waiting_for_scrub
.push_back(op
);
3142 op
->mark_delayed("waiting for scrub");
3143 dout(10) << __func__
<< " " << hoid
3144 << " placing op in waiting_for_scrub" << dendl
;
3146 dout(10) << __func__
<< " " << hoid
3147 << " no op, dropping on the floor" << dendl
;
3151 if (!obc
) { // we need to create an ObjectContext
3152 assert(missing_oid
!= hobject_t());
3153 obc
= get_object_context(missing_oid
, true);
3159 * Before promote complete, if there are proxy-reads for the object,
3160 * for this case we don't use DONTNEED.
3162 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
3163 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
3164 if (q
== in_progress_proxy_ops
.end()) {
3165 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
3168 PromoteCallback
*cb
= new PromoteCallback(obc
, this);
3169 object_locator_t my_oloc
= oloc
;
3170 my_oloc
.pool
= pool
.info
.tier_of
;
3172 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
3173 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
3174 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
3175 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
3176 start_copy(cb
, obc
, obc
->obs
.oi
.soid
, my_oloc
, 0, flags
,
3177 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
3178 src_fadvise_flags
, 0);
3180 assert(obc
->is_blocked());
3183 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
3184 info
.stats
.stats
.sum
.num_promote
++;
3187 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
3190 dout(10) << __func__
<< " " << ctx
<< dendl
;
3191 ctx
->reset_obs(ctx
->obc
);
3192 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
3193 OpRequestRef op
= ctx
->op
;
3194 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3195 ObjectContextRef obc
= ctx
->obc
;
3196 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
3198 // this method must be idempotent since we may call it several times
3199 // before we finally apply the resulting transaction.
3200 ctx
->op_t
.reset(new PGTransaction
);
3202 if (op
->may_write() || op
->may_cache()) {
3204 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
3205 pool
.info
.is_pool_snaps_mode()) {
3207 ctx
->snapc
= pool
.snapc
;
3209 // client specified snapc
3210 ctx
->snapc
.seq
= m
->get_snap_seq();
3211 ctx
->snapc
.snaps
= m
->get_snaps();
3212 filter_snapc(ctx
->snapc
.snaps
);
3214 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
3215 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
3216 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
3217 << " < snapset seq " << obc
->ssc
->snapset
.seq
3218 << " on " << obc
->obs
.oi
.soid
<< dendl
;
3219 reply_ctx(ctx
, -EOLDSNAPC
);
3224 ctx
->at_version
= get_next_version();
3225 ctx
->mtime
= m
->get_mtime();
3227 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
3228 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
3229 << " snapc " << ctx
->snapc
3230 << " snapset " << obc
->ssc
->snapset
3233 dout(10) << __func__
<< " " << soid
<< " " << *ctx
->ops
3234 << " ov " << obc
->obs
.oi
.version
3238 if (!ctx
->user_at_version
)
3239 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
3240 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
3242 if (op
->may_read()) {
3243 dout(10) << " taking ondisk_read_lock" << dendl
;
3244 obc
->ondisk_read_lock();
3249 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3251 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
3252 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3255 int result
= prepare_transaction(ctx
);
3259 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3261 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
3262 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3265 if (op
->may_read()) {
3266 dout(10) << " dropping ondisk_read_lock" << dendl
;
3267 obc
->ondisk_read_unlock();
3270 bool pending_async_reads
= !ctx
->pending_async_reads
.empty();
3271 if (result
== -EINPROGRESS
|| pending_async_reads
) {
3273 if (pending_async_reads
) {
3274 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
3275 ctx
->start_async_reads(this);
3280 if (result
== -EAGAIN
) {
3281 // clean up after the ctx
3286 bool successful_write
= !ctx
->op_t
->empty() && op
->may_write() && result
>= 0;
3287 // prepare the reply
3288 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0,
3291 // Write operations aren't allowed to return a data payload because
3292 // we can't do so reliably. If the client has to resend the request
3293 // and it has already been applied, we will return 0 with no
3294 // payload. Non-deterministic behavior is no good. However, it is
3295 // possible to construct an operation that does a read, does a guard
3296 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3297 // with the write, or return a CMPXATTR and the read value.
3298 if (successful_write
) {
3299 // write. normalize the result code.
3300 dout(20) << " zeroing write result code " << result
<< dendl
;
3303 ctx
->reply
->set_result(result
);
3306 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
3307 // finish side-effects
3309 do_osd_op_effects(ctx
, m
->get_connection());
3311 complete_read_ctx(result
, ctx
);
3315 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
3317 assert(op
->may_write() || op
->may_cache());
3322 // verify that we are doing this in order?
3323 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
3324 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
3325 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
3326 ceph_tid_t t
= m
->get_tid();
3327 client_t n
= m
->get_source().num();
3328 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
3329 if (p
== cm
.end()) {
3330 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
3333 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
3334 if (p
->second
> t
) {
3335 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
3336 assert(0 == "out of order op");
3342 if (ctx
->update_log_only
) {
3344 do_osd_op_effects(ctx
, m
->get_connection());
3346 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
3347 // save just what we need from ctx
3348 MOSDOpReply
*reply
= ctx
->reply
;
3349 ctx
->reply
= nullptr;
3350 reply
->claim_op_out_data(*ctx
->ops
);
3351 reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
3354 if (result
== -ENOENT
) {
3355 reply
->set_enoent_reply_versions(info
.last_update
,
3356 info
.last_user_version
);
3358 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3359 // append to pg log for dup detection - don't save buffers for now
3360 record_write_error(op
, soid
, reply
, result
);
3364 // no need to capture PG ref, repop cancel will handle that
3365 // Can capture the ctx by pointer, it's owned by the repop
3366 ctx
->register_on_commit(
3372 if (m
&& !ctx
->sent_reply
) {
3373 MOSDOpReply
*reply
= ctx
->reply
;
3375 ctx
->reply
= nullptr;
3377 reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, true);
3378 reply
->set_reply_versions(ctx
->at_version
,
3379 ctx
->user_at_version
);
3381 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3382 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
3383 osd
->send_message_osd_client(reply
, m
->get_connection());
3384 ctx
->sent_reply
= true;
3385 ctx
->op
->mark_commit_sent();
3388 ctx
->register_on_success(
3392 ctx
->op
? ctx
->op
->get_req()->get_connection() :
3395 ctx
->register_on_finish(
3400 // issue replica writes
3401 ceph_tid_t rep_tid
= osd
->get_tid();
3403 RepGather
*repop
= new_repop(ctx
, obc
, rep_tid
);
3405 issue_repop(repop
, ctx
);
3410 void PrimaryLogPG::close_op_ctx(OpContext
*ctx
) {
3411 release_object_locks(ctx
->lock_manager
);
3415 for (auto p
= ctx
->on_finish
.begin(); p
!= ctx
->on_finish
.end();
3416 ctx
->on_finish
.erase(p
++)) {
3422 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
3425 osd
->reply_op_error(ctx
->op
, r
);
3429 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
, eversion_t v
, version_t uv
)
3432 osd
->reply_op_error(ctx
->op
, r
, v
, uv
);
3436 void PrimaryLogPG::log_op_stats(OpContext
*ctx
)
3438 OpRequestRef op
= ctx
->op
;
3439 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3441 utime_t now
= ceph_clock_now();
3442 utime_t latency
= now
;
3443 latency
-= ctx
->op
->get_req()->get_recv_stamp();
3444 utime_t process_latency
= now
;
3445 process_latency
-= ctx
->op
->get_dequeued_time();
3447 uint64_t inb
= ctx
->bytes_written
;
3448 uint64_t outb
= ctx
->bytes_read
;
3450 osd
->logger
->inc(l_osd_op
);
3452 osd
->logger
->inc(l_osd_op_outb
, outb
);
3453 osd
->logger
->inc(l_osd_op_inb
, inb
);
3454 osd
->logger
->tinc(l_osd_op_lat
, latency
);
3455 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
3457 if (op
->may_read() && op
->may_write()) {
3458 osd
->logger
->inc(l_osd_op_rw
);
3459 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
3460 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
3461 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
3462 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
3463 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
3464 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
3465 } else if (op
->may_read()) {
3466 osd
->logger
->inc(l_osd_op_r
);
3467 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
3468 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
3469 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
3470 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
3471 } else if (op
->may_write() || op
->may_cache()) {
3472 osd
->logger
->inc(l_osd_op_w
);
3473 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
3474 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
3475 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
3476 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
3480 dout(15) << "log_op_stats " << *m
3483 << " lat " << latency
<< dendl
;
3486 void PrimaryLogPG::do_sub_op(OpRequestRef op
)
3488 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
3489 assert(have_same_or_newer_map(m
->map_epoch
));
3490 assert(m
->get_type() == MSG_OSD_SUBOP
);
3491 dout(15) << "do_sub_op " << *op
->get_req() << dendl
;
3494 waiting_for_peered
.push_back(op
);
3495 op
->mark_delayed("waiting for active");
3499 const OSDOp
*first
= NULL
;
3500 if (m
->ops
.size() >= 1) {
3505 switch (first
->op
.op
) {
3506 case CEPH_OSD_OP_DELETE
:
3509 case CEPH_OSD_OP_SCRUB_RESERVE
:
3510 handle_scrub_reserve_request(op
);
3512 case CEPH_OSD_OP_SCRUB_UNRESERVE
:
3513 handle_scrub_reserve_release(op
);
3515 case CEPH_OSD_OP_SCRUB_MAP
:
3516 sub_op_scrub_map(op
);
3522 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op
)
3524 const MOSDSubOpReply
*r
= static_cast<const MOSDSubOpReply
*>(op
->get_req());
3525 assert(r
->get_type() == MSG_OSD_SUBOPREPLY
);
3526 if (r
->ops
.size() >= 1) {
3527 const OSDOp
& first
= r
->ops
[0];
3528 switch (first
.op
.op
) {
3529 case CEPH_OSD_OP_SCRUB_RESERVE
:
3531 pg_shard_t from
= r
->from
;
3532 bufferlist::iterator p
= const_cast<bufferlist
&>(r
->get_data()).begin();
3534 ::decode(reserved
, p
);
3536 handle_scrub_reserve_grant(op
, from
);
3538 handle_scrub_reserve_reject(op
, from
);
3546 void PrimaryLogPG::do_scan(
3548 ThreadPool::TPHandle
&handle
)
3550 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
3551 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
3552 dout(10) << "do_scan " << *m
<< dendl
;
3557 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
3560 if (osd
->check_backfill_full(ss
)) {
3561 dout(1) << __func__
<< ": Canceling backfill, " << ss
.str() << dendl
;
3562 queue_peering_event(
3564 std::make_shared
<CephPeeringEvt
>(
3565 get_osdmap()->get_epoch(),
3566 get_osdmap()->get_epoch(),
3567 BackfillTooFull())));
3571 BackfillInterval bi
;
3572 bi
.begin
= m
->begin
;
3573 // No need to flush, there won't be any in progress writes occuring
3576 cct
->_conf
->osd_backfill_scan_min
,
3577 cct
->_conf
->osd_backfill_scan_max
,
3580 MOSDPGScan
*reply
= new MOSDPGScan(
3581 MOSDPGScan::OP_SCAN_DIGEST
,
3583 get_osdmap()->get_epoch(), m
->query_epoch
,
3584 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
3585 ::encode(bi
.objects
, reply
->get_data());
3586 osd
->send_message_osd_cluster(reply
, m
->get_connection());
3590 case MOSDPGScan::OP_SCAN_DIGEST
:
3592 pg_shard_t from
= m
->from
;
3594 // Check that from is in backfill_targets vector
3595 assert(is_backfill_targets(from
));
3597 BackfillInterval
& bi
= peer_backfill_info
[from
];
3598 bi
.begin
= m
->begin
;
3600 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3602 // take care to preserve ordering!
3604 ::decode_noclear(bi
.objects
, p
);
3606 if (waiting_on_backfill
.erase(from
)) {
3607 if (waiting_on_backfill
.empty()) {
3608 assert(peer_backfill_info
.size() == backfill_targets
.size());
3609 finish_recovery_op(hobject_t::get_max());
3612 // we canceled backfill for a while due to a too full, and this
3613 // is an extra response from a non-too-full peer
3620 void PrimaryLogPG::do_backfill(OpRequestRef op
)
3622 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
3623 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
3624 dout(10) << "do_backfill " << *m
<< dendl
;
3629 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
3631 assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
3633 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
3634 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
3635 get_osdmap()->get_epoch(),
3637 spg_t(info
.pgid
.pgid
, get_primary().shard
));
3638 reply
->set_priority(get_recovery_op_priority());
3639 osd
->send_message_osd_cluster(reply
, m
->get_connection());
3640 queue_peering_event(
3642 std::make_shared
<CephPeeringEvt
>(
3643 get_osdmap()->get_epoch(),
3644 get_osdmap()->get_epoch(),
3649 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
3651 assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
3653 info
.set_last_backfill(m
->last_backfill
);
3654 info
.stats
= m
->stats
;
3656 ObjectStore::Transaction t
;
3659 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3664 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
3666 assert(is_primary());
3667 assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
3668 finish_recovery_op(hobject_t::get_max());
3674 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
3676 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
3678 assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
3679 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
3683 ObjectStore::Transaction t
;
3684 for (auto& p
: m
->ls
) {
3685 remove_snap_mapped_object(t
, p
.first
);
3687 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3691 int PrimaryLogPG::trim_object(
3692 bool first
, const hobject_t
&coid
, PrimaryLogPG::OpContextUPtr
*ctxp
)
3697 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
3698 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
3699 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
3700 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
3705 coid
.oid
, coid
.get_key(),
3706 obc
->ssc
->snapset
.head_exists
? CEPH_NOSNAP
:CEPH_SNAPDIR
, coid
.get_hash(),
3707 info
.pgid
.pool(), coid
.get_namespace());
3708 ObjectContextRef snapset_obc
= get_object_context(snapoid
, false);
3710 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
3711 << " repair needed, no snapset obc for " << snapoid
;
3715 SnapSet
& snapset
= obc
->ssc
->snapset
;
3717 bool legacy
= snapset
.is_legacy() ||
3718 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
3720 object_info_t
&coi
= obc
->obs
.oi
;
3721 set
<snapid_t
> old_snaps
;
3723 old_snaps
.insert(coi
.legacy_snaps
.begin(), coi
.legacy_snaps
.end());
3725 auto p
= snapset
.clone_snaps
.find(coid
.snap
);
3726 if (p
== snapset
.clone_snaps
.end()) {
3727 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
3728 << " for object " << coid
<< "\n";
3731 old_snaps
.insert(snapset
.clone_snaps
[coid
.snap
].begin(),
3732 snapset
.clone_snaps
[coid
.snap
].end());
3734 if (old_snaps
.empty()) {
3735 osd
->clog
->error() << "No object info snaps for object " << coid
;
3739 dout(10) << coid
<< " old_snaps " << old_snaps
3740 << " old snapset " << snapset
<< dendl
;
3741 if (snapset
.seq
== 0) {
3742 osd
->clog
->error() << "No snapset.seq for object " << coid
;
3746 set
<snapid_t
> new_snaps
;
3747 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
3748 i
!= old_snaps
.end();
3750 if (!pool
.info
.is_removed_snap(*i
))
3751 new_snaps
.insert(*i
);
3754 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
3756 if (new_snaps
.empty()) {
3757 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
3758 if (p
== snapset
.clones
.end()) {
3759 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
3764 OpContextUPtr ctx
= simple_opc_create(obc
);
3765 ctx
->snapset_obc
= snapset_obc
;
3767 if (!ctx
->lock_manager
.get_snaptrimmer_write(
3771 close_op_ctx(ctx
.release());
3772 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
3776 if (!ctx
->lock_manager
.get_snaptrimmer_write(
3780 close_op_ctx(ctx
.release());
3781 dout(10) << __func__
<< ": Unable to get a wlock on " << snapoid
<< dendl
;
3785 ctx
->at_version
= get_next_version();
3787 PGTransaction
*t
= ctx
->op_t
.get();
3789 if (new_snaps
.empty()) {
3791 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
3792 << new_snaps
<< " ... deleting" << dendl
;
3795 assert(p
!= snapset
.clones
.end());
3797 snapid_t last
= coid
.snap
;
3798 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
3800 if (p
!= snapset
.clones
.begin()) {
3801 // not the oldest... merge overlap into next older clone
3802 vector
<snapid_t
>::iterator n
= p
- 1;
3803 hobject_t prev_coid
= coid
;
3804 prev_coid
.snap
= *n
;
3805 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
3807 if (adjust_prev_bytes
)
3808 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
3810 snapset
.clone_overlap
[*n
].intersection_of(
3811 snapset
.clone_overlap
[*p
]);
3813 if (adjust_prev_bytes
)
3814 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
3816 ctx
->delta_stats
.num_objects
--;
3818 ctx
->delta_stats
.num_objects_dirty
--;
3820 ctx
->delta_stats
.num_objects_omap
--;
3821 if (coi
.is_whiteout()) {
3822 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
3823 ctx
->delta_stats
.num_whiteouts
--;
3825 ctx
->delta_stats
.num_object_clones
--;
3826 if (coi
.is_cache_pinned())
3827 ctx
->delta_stats
.num_objects_pinned
--;
3828 obc
->obs
.exists
= false;
3830 snapset
.clones
.erase(p
);
3831 snapset
.clone_overlap
.erase(last
);
3832 snapset
.clone_size
.erase(last
);
3833 snapset
.clone_snaps
.erase(last
);
3837 pg_log_entry_t::DELETE
,
3840 ctx
->obs
->oi
.version
,
3852 coi
= object_info_t(coid
);
3854 ctx
->at_version
.version
++;
3856 // save adjusted snaps for this object
3857 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
3859 coi
.legacy_snaps
= vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
3861 snapset
.clone_snaps
[coid
.snap
] = vector
<snapid_t
>(new_snaps
.rbegin(),
3863 // we still do a 'modify' event on this object just to trigger a
3864 // snapmapper.update ... :(
3867 coi
.prior_version
= coi
.version
;
3868 coi
.version
= ctx
->at_version
;
3870 ::encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
3871 t
->setattr(coid
, OI_ATTR
, bl
);
3875 pg_log_entry_t::MODIFY
,
3884 ctx
->at_version
.version
++;
3892 // save head snapset
3893 dout(10) << coid
<< " new snapset " << snapset
<< " on "
3894 << snapset_obc
->obs
.oi
<< dendl
;
3895 if (snapset
.clones
.empty() &&
3896 (!snapset
.head_exists
||
3897 (snapset_obc
->obs
.oi
.is_whiteout() &&
3898 !(snapset_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
3899 !snapset_obc
->obs
.oi
.is_cache_pinned()))) {
3900 // NOTE: this arguably constitutes minor interference with the
3901 // tiering agent if this is a cache tier since a snap trim event
3902 // is effectively evicting a whiteout we might otherwise want to
3904 dout(10) << coid
<< " removing " << snapoid
<< dendl
;
3907 pg_log_entry_t::DELETE
,
3910 ctx
->snapset_obc
->obs
.oi
.version
,
3916 if (snapoid
.is_head()) {
3917 derr
<< "removing snap head" << dendl
;
3918 object_info_t
& oi
= ctx
->snapset_obc
->obs
.oi
;
3919 ctx
->delta_stats
.num_objects
--;
3920 if (oi
.is_dirty()) {
3921 ctx
->delta_stats
.num_objects_dirty
--;
3924 ctx
->delta_stats
.num_objects_omap
--;
3925 if (oi
.is_whiteout()) {
3926 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
3927 ctx
->delta_stats
.num_whiteouts
--;
3929 if (oi
.is_cache_pinned()) {
3930 ctx
->delta_stats
.num_objects_pinned
--;
3933 ctx
->snapset_obc
->obs
.exists
= false;
3934 ctx
->snapset_obc
->obs
.oi
= object_info_t(snapoid
);
3937 dout(10) << coid
<< " filtering snapset on " << snapoid
<< dendl
;
3938 snapset
.filter(pool
.info
);
3939 dout(10) << coid
<< " writing updated snapset on " << snapoid
3940 << ", snapset is " << snapset
<< dendl
;
3943 pg_log_entry_t::MODIFY
,
3946 ctx
->snapset_obc
->obs
.oi
.version
,
3953 ctx
->snapset_obc
->obs
.oi
.prior_version
=
3954 ctx
->snapset_obc
->obs
.oi
.version
;
3955 ctx
->snapset_obc
->obs
.oi
.version
= ctx
->at_version
;
3957 map
<string
, bufferlist
> attrs
;
3959 ::encode(snapset
, bl
);
3960 attrs
[SS_ATTR
].claim(bl
);
3963 ::encode(ctx
->snapset_obc
->obs
.oi
, bl
,
3964 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
3965 attrs
[OI_ATTR
].claim(bl
);
3966 t
->setattrs(snapoid
, attrs
);
3969 *ctxp
= std::move(ctx
);
3973 void PrimaryLogPG::kick_snap_trim()
3975 assert(is_active());
3976 assert(is_primary());
3977 if (is_clean() && !snap_trimq
.empty()) {
3978 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
3979 snap_trimmer_machine
.process_event(KickTrim());
3983 void PrimaryLogPG::snap_trimmer_scrub_complete()
3985 if (is_primary() && is_active() && is_clean()) {
3986 assert(!snap_trimq
.empty());
3987 snap_trimmer_machine
.process_event(ScrubComplete());
3991 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
3993 if (deleting
|| pg_has_reset_since(queued
)) {
3997 assert(is_primary());
3999 dout(10) << "snap_trimmer posting" << dendl
;
4000 snap_trimmer_machine
.process_event(DoSnapWork());
4001 dout(10) << "snap_trimmer complete" << dendl
;
4005 int PrimaryLogPG::do_xattr_cmp_u64(int op
, __u64 v1
, bufferlist
& xattr
)
4009 string
v2s(xattr
.c_str(), xattr
.length());
4011 v2
= strtoull(v2s
.c_str(), NULL
, 10);
4015 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
4018 case CEPH_OSD_CMPXATTR_OP_EQ
:
4020 case CEPH_OSD_CMPXATTR_OP_NE
:
4022 case CEPH_OSD_CMPXATTR_OP_GT
:
4024 case CEPH_OSD_CMPXATTR_OP_GTE
:
4026 case CEPH_OSD_CMPXATTR_OP_LT
:
4028 case CEPH_OSD_CMPXATTR_OP_LTE
:
4035 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4037 string
v2s(xattr
.c_str(), xattr
.length());
4039 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
4042 case CEPH_OSD_CMPXATTR_OP_EQ
:
4043 return (v1s
.compare(v2s
) == 0);
4044 case CEPH_OSD_CMPXATTR_OP_NE
:
4045 return (v1s
.compare(v2s
) != 0);
4046 case CEPH_OSD_CMPXATTR_OP_GT
:
4047 return (v1s
.compare(v2s
) > 0);
4048 case CEPH_OSD_CMPXATTR_OP_GTE
:
4049 return (v1s
.compare(v2s
) >= 0);
4050 case CEPH_OSD_CMPXATTR_OP_LT
:
4051 return (v1s
.compare(v2s
) < 0);
4052 case CEPH_OSD_CMPXATTR_OP_LTE
:
4053 return (v1s
.compare(v2s
) <= 0);
4059 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4061 ceph_osd_op
& op
= osd_op
.op
;
4062 vector
<OSDOp
> write_ops(1);
4063 OSDOp
& write_op
= write_ops
[0];
4064 uint64_t write_length
= op
.writesame
.length
;
4070 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4073 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4074 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4078 while (write_length
) {
4079 write_op
.indata
.append(osd_op
.indata
);
4080 write_length
-= op
.writesame
.data_length
;
4083 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
4084 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
4085 write_op
.op
.extent
.length
= op
.writesame
.length
;
4086 result
= do_osd_ops(ctx
, write_ops
);
4088 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
4093 // ========================================================================
4094 // low level osd ops
4096 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
4098 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
4099 bufferlist header
, vals
;
4100 int r
= _get_tmap(ctx
, &header
, &vals
);
4102 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
4107 vector
<OSDOp
> ops(3);
4109 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
4110 ops
[0].op
.extent
.offset
= 0;
4111 ops
[0].op
.extent
.length
= 0;
4113 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
4114 ops
[1].indata
.claim(header
);
4116 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
4117 ops
[2].indata
.claim(vals
);
4119 return do_osd_ops(ctx
, ops
);
4122 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::iterator
& bp
, OSDOp
& osd_op
,
4127 map
<string
, bufferlist
> m
;
4129 bufferlist::iterator p
= bl
.begin();
4130 ::decode(header
, p
);
4142 case CEPH_OSD_TMAP_SET
: // insert key
4150 case CEPH_OSD_TMAP_RM
: // remove key
4152 if (!m
.count(key
)) {
4157 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
4161 case CEPH_OSD_TMAP_HDR
: // update header
4163 ::decode(header
, bp
);
4173 ::encode(header
, obl
);
4177 vector
<OSDOp
> nops(1);
4178 OSDOp
& newop
= nops
[0];
4179 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4180 newop
.op
.extent
.offset
= 0;
4181 newop
.op
.extent
.length
= obl
.length();
4183 do_osd_ops(ctx
, nops
);
4184 osd_op
.outdata
.claim(newop
.outdata
);
4188 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::iterator
& bp
, OSDOp
& osd_op
)
4190 bufferlist::iterator orig_bp
= bp
;
4193 dout(10) << "tmapup is a no-op" << dendl
;
4195 // read the whole object
4196 vector
<OSDOp
> nops(1);
4197 OSDOp
& newop
= nops
[0];
4198 newop
.op
.op
= CEPH_OSD_OP_READ
;
4199 newop
.op
.extent
.offset
= 0;
4200 newop
.op
.extent
.length
= 0;
4201 result
= do_osd_ops(ctx
, nops
);
4203 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
4205 dout(30) << " starting is \n";
4206 newop
.outdata
.hexdump(*_dout
);
4209 bufferlist::iterator ip
= newop
.outdata
.begin();
4212 dout(30) << "the update command is: \n";
4213 osd_op
.indata
.hexdump(*_dout
);
4219 if (newop
.outdata
.length()) {
4220 ::decode(header
, ip
);
4221 ::decode(nkeys
, ip
);
4223 dout(10) << "tmapup header " << header
.length() << dendl
;
4225 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
4227 ::decode(header
, bp
);
4228 dout(10) << "tmapup new header " << header
.length() << dendl
;
4231 ::encode(header
, obl
);
4233 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
4236 bufferlist newkeydata
;
4237 string nextkey
, last_in_key
;
4239 bool have_next
= false;
4242 ::decode(nextkey
, ip
);
4243 ::decode(nextval
, ip
);
4245 while (!bp
.end() && !result
) {
4252 catch (buffer::error
& e
) {
4255 if (key
< last_in_key
) {
4256 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
4257 << "', falling back to an inefficient (unsorted) update" << dendl
;
4259 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
4263 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
4265 // skip existing intervening keys
4266 bool key_exists
= false;
4267 while (have_next
&& !key_exists
) {
4268 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
4271 if (nextkey
< key
) {
4273 ::encode(nextkey
, newkeydata
);
4274 ::encode(nextval
, newkeydata
);
4275 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4277 // don't copy; discard old value. and stop.
4278 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
4283 ::decode(nextkey
, ip
);
4284 ::decode(nextval
, ip
);
4290 if (op
== CEPH_OSD_TMAP_SET
) {
4295 catch (buffer::error
& e
) {
4298 ::encode(key
, newkeydata
);
4299 ::encode(val
, newkeydata
);
4300 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
4302 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
4310 catch (buffer::error
& e
) {
4313 ::encode(key
, newkeydata
);
4314 ::encode(val
, newkeydata
);
4315 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
4317 } else if (op
== CEPH_OSD_TMAP_RM
) {
4322 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
4325 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
4332 ::encode(nextkey
, newkeydata
);
4333 ::encode(nextval
, newkeydata
);
4334 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4338 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
4339 dout(20) << " keep trailing " << rest
.length()
4340 << " at " << newkeydata
.length() << dendl
;
4341 newkeydata
.claim_append(rest
);
4344 // encode final key count + key data
4345 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
4346 ::encode(nkeys
, obl
);
4347 obl
.claim_append(newkeydata
);
4350 dout(30) << " final is \n";
4351 obl
.hexdump(*_dout
);
4355 bufferlist::iterator tp
= obl
.begin();
4358 map
<string
,bufferlist
> d
;
4361 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
4366 dout(20) << "tmapput write " << obl
.length() << dendl
;
4367 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4368 newop
.op
.extent
.offset
= 0;
4369 newop
.op
.extent
.length
= obl
.length();
4371 do_osd_ops(ctx
, nops
);
4372 osd_op
.outdata
.claim(newop
.outdata
);
// Validate an (offset, length) extent against the configured maximum
// object size. Returns 0 when the extent fits, -EFBIG otherwise.
static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
{
  if (offset >= max ||
      length > max ||
      offset + length > max)
    return -EFBIG;

  return 0;
}
4388 struct FillInVerifyExtent
: public Context
{
4391 bufferlist
*outdatap
;
4392 boost::optional
<uint32_t> maybe_crc
;
4397 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
4398 boost::optional
<uint32_t> mc
, uint64_t size
,
4399 OSDService
*osd
, hobject_t soid
, __le32 flags
) :
4400 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
4401 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
4402 void finish(int len
) override
{
4410 // whole object? can we verify the checksum?
4411 if (maybe_crc
&& *r
== size
) {
4412 uint32_t crc
= outdatap
->crc32c(-1);
4413 if (maybe_crc
!= crc
) {
4414 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
4415 << " != expected 0x" << *maybe_crc
4416 << std::dec
<< " on " << soid
;
4417 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
4426 struct ToSparseReadResult
: public Context
{
4428 bufferlist
* data_bl
;
4429 uint64_t data_offset
;
4431 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
4433 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
4434 void finish(int r
) override
{
4442 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
4443 ::encode(extents
, outdata
);
4444 ::encode_destructively(*data_bl
, outdata
);
4445 data_bl
->swap(outdata
);
// Join the keys of a string-keyed map into a comma-separated string
// (debug/assert-message helper).
template<typename V>
static std::string list_keys(const std::map<std::string, V>& m) {
  std::string s;
  for (typename std::map<std::string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
    if (!s.empty()) {
      s.push_back(',');
    }
    s.append(itr->first);
  }
  return s;
}
// Join the keys of an arbitrary associative container into a
// comma-separated string (debug/assert-message helper).
// NOTE(review): loop body was lost in extraction; reconstructed by
// analogy with list_keys above — confirm against upstream.
template<typename T>
static std::string list_entries(const T& m) {
  std::string s;
  for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
    if (!s.empty()) {
      s.push_back(',');
    }
    std::ostringstream oss;
    oss << itr->first;
    s.append(oss.str());
  }
  return s;
}
4473 void PrimaryLogPG::maybe_create_new_object(
4475 bool ignore_transaction
)
4477 ObjectState
& obs
= ctx
->new_obs
;
4479 ctx
->delta_stats
.num_objects
++;
4481 assert(!obs
.oi
.is_whiteout());
4482 obs
.oi
.new_object();
4483 if (!ignore_transaction
)
4484 ctx
->op_t
->create(obs
.oi
.soid
);
4485 } else if (obs
.oi
.is_whiteout()) {
4486 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
4487 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
4488 --ctx
->delta_stats
.num_whiteouts
;
4492 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
4495 ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
4498 int execute() override
{
4503 struct C_ChecksumRead
: public Context
{
4504 PrimaryLogPG
*primary_log_pg
;
4506 Checksummer::CSumType csum_type
;
4507 bufferlist init_value_bl
;
4508 ceph_le64 read_length
;
4510 Context
*fill_extent_ctx
;
4512 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
4513 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
4514 boost::optional
<uint32_t> maybe_crc
, uint64_t size
,
4515 OSDService
*osd
, hobject_t soid
, __le32 flags
)
4516 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
4517 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
4518 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
4519 &read_bl
, maybe_crc
, size
,
4520 osd
, soid
, flags
)) {
4522 ~C_ChecksumRead() override
{
4523 delete fill_extent_ctx
;
4526 void finish(int r
) override
{
4527 fill_extent_ctx
->complete(r
);
4528 fill_extent_ctx
= nullptr;
4530 if (osd_op
.rval
>= 0) {
4531 bufferlist::iterator init_value_bl_it
= init_value_bl
.begin();
4532 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
4533 &init_value_bl_it
, read_bl
);
4538 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
4539 bufferlist::iterator
*bl_it
)
4541 dout(20) << __func__
<< dendl
;
4542 bool skip_data_digest
=
4543 (osd
->store
->has_builtin_csum() && g_conf
->osd_skip_data_digest
) ||
4544 g_conf
->osd_distrust_data_digest
;
4546 auto& op
= osd_op
.op
;
4547 if (op
.checksum
.chunk_size
> 0) {
4548 if (op
.checksum
.length
== 0) {
4549 dout(10) << __func__
<< ": length required when chunk size provided"
4553 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
4554 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
4559 auto& oi
= ctx
->new_obs
.oi
;
4560 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
4561 // zeroed offset+length implies checksum whole object
4562 op
.checksum
.length
= oi
.size
;
4563 } else if (op
.checksum
.offset
+ op
.checksum
.length
> oi
.size
) {
4567 Checksummer::CSumType csum_type
;
4568 switch (op
.checksum
.type
) {
4569 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
4570 csum_type
= Checksummer::CSUM_XXHASH32
;
4572 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
4573 csum_type
= Checksummer::CSUM_XXHASH64
;
4575 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
4576 csum_type
= Checksummer::CSUM_CRC32C
;
4579 dout(10) << __func__
<< ": unknown crc type ("
4580 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
4584 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
4585 if (bl_it
->get_remaining() < csum_init_value_size
) {
4586 dout(10) << __func__
<< ": init value not provided" << dendl
;
4590 bufferlist init_value_bl
;
4591 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
4592 csum_init_value_size
);
4593 bl_it
->advance(csum_init_value_size
);
4595 if (pool
.info
.require_rollback() && op
.checksum
.length
> 0) {
4596 // If there is a data digest and it is possible we are reading
4597 // entire object, pass the digest.
4598 boost::optional
<uint32_t> maybe_crc
;
4599 if (!skip_data_digest
&&
4600 oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
4601 op
.checksum
.length
>= oi
.size
) {
4602 maybe_crc
= oi
.data_digest
;
4606 auto& soid
= oi
.soid
;
4607 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
4608 std::move(init_value_bl
), maybe_crc
,
4609 oi
.size
, osd
, soid
, op
.flags
);
4611 ctx
->pending_async_reads
.push_back({
4612 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
4613 {&checksum_ctx
->read_bl
, checksum_ctx
}});
4615 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
4616 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
4617 new ReadFinisher(osd_op
));
4618 return -EINPROGRESS
;
4622 std::vector
<OSDOp
> read_ops(1);
4623 auto& read_op
= read_ops
[0];
4624 if (op
.checksum
.length
> 0) {
4625 read_op
.op
.op
= CEPH_OSD_OP_READ
;
4626 read_op
.op
.flags
= op
.flags
;
4627 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
4628 read_op
.op
.extent
.length
= op
.checksum
.length
;
4629 read_op
.op
.extent
.truncate_size
= 0;
4630 read_op
.op
.extent
.truncate_seq
= 0;
4632 int r
= do_osd_ops(ctx
, read_ops
);
4634 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
4639 bufferlist::iterator init_value_bl_it
= init_value_bl
.begin();
4640 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
4644 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
4645 Checksummer::CSumType csum_type
,
4646 bufferlist::iterator
*init_value_bl_it
,
4647 const bufferlist
&read_bl
) {
4648 dout(20) << __func__
<< dendl
;
4650 auto& op
= osd_op
.op
;
4652 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
4653 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
4654 << op
.checksum
.length
<< dendl
;
4658 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
4659 op
.checksum
.chunk_size
: read_bl
.length());
4660 uint32_t csum_count
= (csum_chunk_size
> 0 ?
4661 read_bl
.length() / csum_chunk_size
: 0);
4664 bufferptr csum_data
;
4665 if (csum_count
> 0) {
4666 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
4667 csum_data
= buffer::create(csum_value_size
* csum_count
);
4669 csum
.append(csum_data
);
4671 switch (csum_type
) {
4672 case Checksummer::CSUM_XXHASH32
:
4674 Checksummer::xxhash32::init_value_t init_value
;
4675 ::decode(init_value
, *init_value_bl_it
);
4676 Checksummer::calculate
<Checksummer::xxhash32
>(
4677 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4681 case Checksummer::CSUM_XXHASH64
:
4683 Checksummer::xxhash64::init_value_t init_value
;
4684 ::decode(init_value
, *init_value_bl_it
);
4685 Checksummer::calculate
<Checksummer::xxhash64
>(
4686 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4690 case Checksummer::CSUM_CRC32C
:
4692 Checksummer::crc32c::init_value_t init_value
;
4693 ::decode(init_value
, *init_value_bl_it
);
4694 Checksummer::calculate
<Checksummer::crc32c
>(
4695 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4704 ::encode(csum_count
, osd_op
.outdata
);
4705 osd_op
.outdata
.claim_append(csum
);
4709 struct C_ExtentCmpRead
: public Context
{
4710 PrimaryLogPG
*primary_log_pg
;
4712 ceph_le64 read_length
;
4714 Context
*fill_extent_ctx
;
4716 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
4717 boost::optional
<uint32_t> maybe_crc
, uint64_t size
,
4718 OSDService
*osd
, hobject_t soid
, __le32 flags
)
4719 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
4720 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
4721 &read_bl
, maybe_crc
, size
,
4722 osd
, soid
, flags
)) {
4724 ~C_ExtentCmpRead() override
{
4725 delete fill_extent_ctx
;
4728 void finish(int r
) override
{
4732 delete fill_extent_ctx
;
4734 fill_extent_ctx
->complete(r
);
4736 fill_extent_ctx
= nullptr;
4738 if (osd_op
.rval
>= 0) {
4739 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
4744 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
4746 dout(20) << __func__
<< dendl
;
4747 ceph_osd_op
& op
= osd_op
.op
;
4748 bool skip_data_digest
=
4749 (osd
->store
->has_builtin_csum() && g_conf
->osd_skip_data_digest
) ||
4750 g_conf
->osd_distrust_data_digest
;
4752 auto& oi
= ctx
->new_obs
.oi
;
4753 uint64_t size
= oi
.size
;
4754 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
4755 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
4756 size
= op
.extent
.truncate_size
;
4759 if (op
.extent
.offset
>= size
) {
4760 op
.extent
.length
= 0;
4761 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
4762 op
.extent
.length
= size
- op
.extent
.offset
;
4765 if (op
.extent
.length
== 0) {
4766 dout(20) << __func__
<< " zero length extent" << dendl
;
4767 return finish_extent_cmp(osd_op
, bufferlist
{});
4768 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
4769 dout(20) << __func__
<< " object DNE" << dendl
;
4770 return finish_extent_cmp(osd_op
, {});
4771 } else if (pool
.info
.require_rollback()) {
4772 // If there is a data digest and it is possible we are reading
4773 // entire object, pass the digest.
4774 boost::optional
<uint32_t> maybe_crc
;
4775 if (!skip_data_digest
&&
4776 oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
4777 op
.checksum
.length
>= oi
.size
) {
4778 maybe_crc
= oi
.data_digest
;
4782 auto& soid
= oi
.soid
;
4783 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
4784 osd
, soid
, op
.flags
);
4785 ctx
->pending_async_reads
.push_back({
4786 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
4787 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
4789 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
4791 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
4792 new ReadFinisher(osd_op
));
4793 return -EINPROGRESS
;
4797 vector
<OSDOp
> read_ops(1);
4798 OSDOp
& read_op
= read_ops
[0];
4800 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
4801 read_op
.op
.extent
.offset
= op
.extent
.offset
;
4802 read_op
.op
.extent
.length
= op
.extent
.length
;
4803 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
4804 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
4806 int result
= do_osd_ops(ctx
, read_ops
);
4808 derr
<< __func__
<< " failed " << result
<< dendl
;
4811 return finish_extent_cmp(osd_op
, read_op
.outdata
);
4814 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
4816 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
4817 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
4818 if (osd_op
.indata
[idx
] != read_byte
) {
4819 return (-MAX_ERRNO
- idx
);
4826 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
4827 dout(20) << __func__
<< dendl
;
4828 auto& op
= osd_op
.op
;
4829 auto& oi
= ctx
->new_obs
.oi
;
4830 auto& soid
= oi
.soid
;
4831 __u32 seq
= oi
.truncate_seq
;
4832 uint64_t size
= oi
.size
;
4833 bool trimmed_read
= false;
4834 bool skip_data_digest
=
4835 (osd
->store
->has_builtin_csum() && g_conf
->osd_skip_data_digest
) ||
4836 g_conf
->osd_distrust_data_digest
;
4838 // are we beyond truncate_size?
4839 if ( (seq
< op
.extent
.truncate_seq
) &&
4840 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) )
4841 size
= op
.extent
.truncate_size
;
4843 if (op
.extent
.length
== 0) //length is zero mean read the whole object
4844 op
.extent
.length
= size
;
4846 if (op
.extent
.offset
>= size
) {
4847 op
.extent
.length
= 0;
4848 trimmed_read
= true;
4849 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
4850 op
.extent
.length
= size
- op
.extent
.offset
;
4851 trimmed_read
= true;
4854 // read into a buffer
4856 if (trimmed_read
&& op
.extent
.length
== 0) {
4857 // read size was trimmed to zero and it is expected to do nothing
4858 // a read operation of 0 bytes does *not* do nothing, this is why
4859 // the trimmed_read boolean is needed
4860 } else if (pool
.info
.require_rollback()) {
4861 boost::optional
<uint32_t> maybe_crc
;
4862 // If there is a data digest and it is possible we are reading
4863 // entire object, pass the digest. FillInVerifyExtent will
4864 // will check the oi.size again.
4865 if (!skip_data_digest
&&
4866 oi
.is_data_digest() && op
.extent
.offset
== 0 &&
4867 op
.extent
.length
>= oi
.size
)
4868 maybe_crc
= oi
.data_digest
;
4869 ctx
->pending_async_reads
.push_back(
4871 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
4872 make_pair(&osd_op
.outdata
,
4873 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
4874 &osd_op
.outdata
, maybe_crc
, oi
.size
,
4875 osd
, soid
, op
.flags
))));
4876 dout(10) << " async_read noted for " << soid
<< dendl
;
4878 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
4879 new ReadFinisher(osd_op
));
4881 int r
= pgbackend
->objects_read_sync(
4882 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
4883 // whole object? can we verify the checksum?
4884 if (!skip_data_digest
&& r
>= 0 && op
.extent
.offset
== 0 &&
4885 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
4886 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
4887 if (oi
.data_digest
!= crc
) {
4888 osd
->clog
->error() << info
.pgid
<< std::hex
4889 << " full-object read crc 0x" << crc
4890 << " != expected 0x" << oi
.data_digest
4891 << std::dec
<< " on " << soid
;
4892 r
= -EIO
; // try repair later
4896 r
= rep_repair_primary_object(soid
, ctx
->op
);
4899 op
.extent
.length
= r
;
4902 op
.extent
.length
= 0;
4904 dout(10) << " read got " << r
<< " / " << op
.extent
.length
4905 << " bytes from obj " << soid
<< dendl
;
4908 // XXX the op.extent.length is the requested length for async read
4909 // On error this length is changed to 0 after the error comes back.
4910 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(op
.extent
.length
, 10);
4911 ctx
->delta_stats
.num_rd
++;
// Handle CEPH_OSD_OP_SPARSE_READ: return the object's allocated extents
// (as an encoded map<offset,length>) followed by the data for those extents.
// For EC pools the sparse read is translated into an ordinary async read;
// for replicated pools the extent map comes from ObjectStore::fiemap() and
// each extent is read synchronously.
// NOTE(review): this excerpt is a fragmented extraction -- several original
// source lines (error returns, loop/branch bodies, closing braces) are
// missing from view, so the comments below describe only what the visible
// lines establish.
4915 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
4916 dout(20) << __func__
<< dendl
;
// Aliases into the raw op fields and the (possibly updated) object info.
4917 auto& op
= osd_op
.op
;
4918 auto& oi
= ctx
->new_obs
.oi
;
4919 auto& soid
= oi
.soid
;
// Skip CRC verification when the backing store already checksums data and
// the operator opted out, or when stored digests are explicitly distrusted.
4920 bool skip_data_digest
=
4921 (osd
->store
->has_builtin_csum() && g_conf
->osd_skip_data_digest
) ||
4922 g_conf
->osd_distrust_data_digest
;
// Truncation sequences are not supported for sparse reads; presumably the
// (missing) lines that follow reject the op -- TODO confirm against the
// full source.
4924 if (op
.extent
.truncate_seq
) {
4925 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
// EC pools cannot service a sparse read directly: clamp the requested
// range to the object size and queue a normal async read whose result is
// converted back into sparse-read wire format by ToSparseReadResult.
4930 if (pool
.info
.ec_pool()) {
4931 // translate sparse read to a normal one if not supported
4932 uint64_t offset
= op
.extent
.offset
;
4933 uint64_t length
= op
.extent
.length
;
// Read entirely past EOF: the (missing) body presumably short-circuits
// with an empty result -- TODO confirm; otherwise clamp length to EOF.
4934 if (offset
> oi
.size
) {
4936 } else if (offset
+ length
> oi
.size
) {
4937 length
= oi
.size
- offset
;
4941 ctx
->pending_async_reads
.push_back(
4943 boost::make_tuple(offset
, length
, op
.flags
),
4946 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
4947 &op
.extent
.length
))));
4948 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
// Register a finisher so the op completes once the async read lands.
4950 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
4951 new ReadFinisher(osd_op
));
// Empty result path: encode an empty extent map so the client still gets
// a well-formed sparse-read reply.
4953 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
4954 map
<uint64_t, uint64_t> extents
;
4955 ::encode(extents
, osd_op
.outdata
);
4958 // read into a buffer
// Replicated-pool path: ask the object store for the allocated extents
// covering the requested range.
4959 map
<uint64_t, uint64_t> m
;
4960 uint32_t total_read
= 0;
4961 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
4963 op
.extent
.offset
, op
.extent
.length
, m
);
// Walk the extent map in offset order; `last` tracks the end of the
// previous extent so gaps (holes) between extents can be detected.
4968 map
<uint64_t, uint64_t>::iterator miter
;
4970 uint64_t last
= op
.extent
.offset
;
4971 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
// Optional debug check: read each hole and verify it is all zeros; any
// data found there indicates fiemap/store inconsistency and is logged.
4973 if (cct
->_conf
->osd_verify_sparse_read_holes
&&
4974 last
< miter
->first
) {
4976 uint64_t len
= miter
->first
- last
;
4977 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
4979 osd
->clog
->error() << coll
<< " " << soid
4980 << " sparse-read failed to read: "
4982 } else if (!t
.is_zero()) {
4983 osd
->clog
->error() << coll
<< " " << soid
4984 << " sparse-read found data in hole "
4985 << last
<< "~" << len
;
// Read this extent's data; on a read error the (missing) surrounding
// lines apparently trigger a primary-object repair -- TODO confirm.
4990 r
= pgbackend
->objects_read_sync(soid
, miter
->first
, miter
->second
,
4993 r
= rep_repair_primary_object(soid
, ctx
->op
);
4999 // this is usually happen when we get extent that exceeds the actual file
// Short read: extent metadata extends past the actual on-disk data.
5001 if (r
< (int)miter
->second
)
5004 dout(10) << "sparse-read " << miter
->first
<< "@" << miter
->second
// Append the extent's bytes and advance `last` by what was actually read.
5006 data_bl
.claim_append(tmpbl
);
5007 last
= miter
->first
+ r
;
5014 // verify trailing hole?
// Same zero-verification for any hole between the final extent and
// min(requested end, EOF).
5015 if (cct
->_conf
->osd_verify_sparse_read_holes
) {
5016 uint64_t end
= MIN(op
.extent
.offset
+ op
.extent
.length
, oi
.size
);
5019 uint64_t len
= end
- last
;
5020 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
5022 osd
->clog
->error() << coll
<< " " << soid
5023 << " sparse-read failed to read: " << r
;
5024 } else if (!t
.is_zero()) {
5025 osd
->clog
->error() << coll
<< " " << soid
5026 << " sparse-read found data in hole "
5027 << last
<< "~" << len
;
5032 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5033 // Maybe at first, there is no much whole objects. With continued use, more
5034 // and more whole object exist. So from this point, for spare-read add
5035 // checksum make sense.
// If the sparse read happened to cover the whole object and a stored data
// digest exists, verify it; a mismatch is logged and repair is initiated.
5036 if (!skip_data_digest
&&
5037 total_read
== oi
.size
&& oi
.is_data_digest()) {
5038 uint32_t crc
= data_bl
.crc32c(-1);
5039 if (oi
.data_digest
!= crc
) {
5040 osd
->clog
->error() << info
.pgid
<< std::hex
5041 << " full-object read crc 0x" << crc
5042 << " != expected 0x" << oi
.data_digest
5043 << std::dec
<< " on " << soid
;
5044 r
= rep_repair_primary_object(soid
, ctx
->op
);
// Report the total bytes returned, then encode the (possibly adjusted)
// extent map followed by the concatenated extent data.
5051 op
.extent
.length
= total_read
;
5053 ::encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5054 ::encode_destructively(data_bl
, osd_op
.outdata
);
5056 dout(10) << " sparse_read got " << total_read
<< " bytes from object "
// Account the read in the PG's delta stats (KiB rounded up, plus one op).
5060 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(op
.extent
.length
, 10);
5061 ctx
->delta_stats
.num_rd
++;
5065 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5068 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5069 ObjectState
& obs
= ctx
->new_obs
;
5070 object_info_t
& oi
= obs
.oi
;
5071 const hobject_t
& soid
= oi
.soid
;
5072 bool skip_data_digest
=
5073 osd
->store
->has_builtin_csum() && g_conf
->osd_skip_data_digest
;
5075 PGTransaction
* t
= ctx
->op_t
.get();
5077 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5079 ctx
->current_osd_subop_num
= 0;
5080 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5082 ceph_osd_op
& op
= osd_op
.op
;
5084 OpFinisher
* op_finisher
= nullptr;
5086 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5087 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5088 op_finisher
= op_finisher_it
->second
.get();
5092 // TODO: check endianness (__le32 vs uint32_t, etc.)
5093 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5094 // but the code in this function seems to treat them as native-endian. What should the
5096 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5098 dout(10) << "do_osd_op " << osd_op
<< dendl
;
5100 bufferlist::iterator bp
= osd_op
.indata
.begin();
5102 // user-visible modifcation?
5104 // non user-visible modifications
5105 case CEPH_OSD_OP_WATCH
:
5106 case CEPH_OSD_OP_CACHE_EVICT
:
5107 case CEPH_OSD_OP_CACHE_FLUSH
:
5108 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5109 case CEPH_OSD_OP_UNDIRTY
:
5110 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
5111 case CEPH_OSD_OP_CACHE_PIN
:
5112 case CEPH_OSD_OP_CACHE_UNPIN
:
5113 case CEPH_OSD_OP_SET_REDIRECT
:
5116 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
5117 ctx
->user_modify
= true;
5120 // munge -1 truncate to 0 truncate
5121 if (ceph_osd_op_uses_extent(op
.op
) &&
5122 op
.extent
.truncate_seq
== 1 &&
5123 op
.extent
.truncate_size
== (-1ULL)) {
5124 op
.extent
.truncate_size
= 0;
5125 op
.extent
.truncate_seq
= 0;
5128 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5129 if (op
.op
== CEPH_OSD_OP_ZERO
&&
5131 op
.extent
.offset
< cct
->_conf
->osd_max_object_size
&&
5132 op
.extent
.length
>= 1 &&
5133 op
.extent
.length
<= cct
->_conf
->osd_max_object_size
&&
5134 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
5135 if (op
.extent
.offset
>= oi
.size
) {
5139 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
5140 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
5141 op
.op
= CEPH_OSD_OP_TRUNCATE
;
5148 case CEPH_OSD_OP_CMPEXT
:
5150 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
5151 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5152 op
.extent
.length
, op
.extent
.truncate_size
,
5153 op
.extent
.truncate_seq
);
5155 if (op_finisher
== nullptr) {
5156 result
= do_extent_cmp(ctx
, osd_op
);
5158 result
= op_finisher
->execute();
5162 case CEPH_OSD_OP_SYNC_READ
:
5163 if (pool
.info
.require_rollback()) {
5164 result
= -EOPNOTSUPP
;
5168 case CEPH_OSD_OP_READ
:
5170 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
5171 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5172 op
.extent
.length
, op
.extent
.truncate_size
,
5173 op
.extent
.truncate_seq
);
5174 if (op_finisher
== nullptr) {
5175 if (!ctx
->data_off
) {
5176 ctx
->data_off
= op
.extent
.offset
;
5178 result
= do_read(ctx
, osd_op
);
5180 result
= op_finisher
->execute();
5184 case CEPH_OSD_OP_CHECKSUM
:
5187 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
5188 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
5189 op
.checksum
.offset
, op
.checksum
.length
,
5190 op
.checksum
.chunk_size
);
5192 if (op_finisher
== nullptr) {
5193 result
= do_checksum(ctx
, osd_op
, &bp
);
5195 result
= op_finisher
->execute();
5201 case CEPH_OSD_OP_MAPEXT
:
5202 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5203 if (pool
.info
.require_rollback()) {
5204 result
= -EOPNOTSUPP
;
5209 // read into a buffer
5211 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5213 op
.extent
.offset
, op
.extent
.length
, bl
);
5214 osd_op
.outdata
.claim(bl
);
5218 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(bl
.length(), 10);
5219 ctx
->delta_stats
.num_rd
++;
5220 dout(10) << " map_extents done on object " << soid
<< dendl
;
5225 case CEPH_OSD_OP_SPARSE_READ
:
5226 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
5227 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
5228 op
.extent
.length
, op
.extent
.truncate_size
,
5229 op
.extent
.truncate_seq
);
5230 if (op_finisher
== nullptr) {
5231 result
= do_sparse_read(ctx
, osd_op
);
5233 result
= op_finisher
->execute();
5237 case CEPH_OSD_OP_CALL
:
5239 string cname
, mname
;
5242 bp
.copy(op
.cls
.class_len
, cname
);
5243 bp
.copy(op
.cls
.method_len
, mname
);
5244 bp
.copy(op
.cls
.indata_len
, indata
);
5245 } catch (buffer::error
& e
) {
5246 dout(10) << "call unable to decode class + method + indata" << dendl
;
5247 dout(30) << "in dump: ";
5248 osd_op
.indata
.hexdump(*_dout
);
5251 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
5254 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
5256 ClassHandler::ClassData
*cls
;
5257 result
= osd
->class_handler
->open_class(cname
, &cls
);
5258 assert(result
== 0); // init_op_flags() already verified this works.
5260 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
.c_str());
5262 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
5263 result
= -EOPNOTSUPP
;
5267 int flags
= method
->get_flags();
5268 if (flags
& CLS_METHOD_WR
)
5269 ctx
->user_modify
= true;
5272 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
5273 int prev_rd
= ctx
->num_read
;
5274 int prev_wr
= ctx
->num_write
;
5275 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
5277 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
5278 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
5282 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
5283 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
5288 dout(10) << "method called response length=" << outdata
.length() << dendl
;
5289 op
.extent
.length
= outdata
.length();
5290 osd_op
.outdata
.claim_append(outdata
);
5291 dout(30) << "out dump: ";
5292 osd_op
.outdata
.hexdump(*_dout
);
5297 case CEPH_OSD_OP_STAT
:
5298 // note: stat does not require RD
5300 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5302 if (obs
.exists
&& !oi
.is_whiteout()) {
5303 ::encode(oi
.size
, osd_op
.outdata
);
5304 ::encode(oi
.mtime
, osd_op
.outdata
);
5305 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
5308 dout(10) << "stat oi object does not exist" << dendl
;
5311 ctx
->delta_stats
.num_rd
++;
5315 case CEPH_OSD_OP_ISDIRTY
:
5318 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5319 bool is_dirty
= obs
.oi
.is_dirty();
5320 ::encode(is_dirty
, osd_op
.outdata
);
5321 ctx
->delta_stats
.num_rd
++;
5326 case CEPH_OSD_OP_UNDIRTY
:
5329 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5330 if (oi
.is_dirty()) {
5331 ctx
->undirty
= true; // see make_writeable()
5333 ctx
->delta_stats
.num_wr
++;
5339 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5342 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5343 if (ctx
->lock_type
!= ObjectContext::RWState::RWNONE
) {
5344 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
5348 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5356 if (oi
.is_cache_pinned()) {
5357 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
5361 if (oi
.is_dirty()) {
5362 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, boost::none
);
5363 if (result
== -EINPROGRESS
)
5371 case CEPH_OSD_OP_CACHE_FLUSH
:
5374 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5375 if (ctx
->lock_type
== ObjectContext::RWState::RWNONE
) {
5376 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
5380 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5388 if (oi
.is_cache_pinned()) {
5389 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
5394 if (oi
.is_dirty()) {
5395 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, boost::none
);
5396 if (result
== -EINPROGRESS
)
5401 // Check special return value which has set missing_return
5402 if (result
== -ENOENT
) {
5403 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
5404 assert(!missing
.is_min());
5405 wait_for_unreadable_object(missing
, ctx
->op
);
5406 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5412 case CEPH_OSD_OP_CACHE_EVICT
:
5415 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5416 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5424 if (oi
.is_cache_pinned()) {
5425 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
5429 if (oi
.is_dirty()) {
5433 if (!oi
.watchers
.empty()) {
5437 if (soid
.snap
== CEPH_NOSNAP
) {
5438 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
5442 result
= _delete_oid(ctx
, true, false);
5444 // mark that this is a cache eviction to avoid triggering normal
5445 // make_writeable() clone or snapdir object creation in finish_ctx()
5446 ctx
->cache_evict
= true;
5448 osd
->logger
->inc(l_osd_tier_evict
);
5452 case CEPH_OSD_OP_GETXATTR
:
5456 bp
.copy(op
.xattr
.name_len
, aname
);
5457 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
5458 string name
= "_" + aname
;
5459 int r
= getattr_maybe_cache(
5464 op
.xattr
.value_len
= osd_op
.outdata
.length();
5466 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
5470 ctx
->delta_stats
.num_rd
++;
5474 case CEPH_OSD_OP_GETXATTRS
:
5477 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5478 map
<string
, bufferlist
> out
;
5479 result
= getattrs_maybe_cache(
5485 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(bl
.length(), 10);
5486 ctx
->delta_stats
.num_rd
++;
5487 osd_op
.outdata
.claim_append(bl
);
5491 case CEPH_OSD_OP_CMPXATTR
:
5495 bp
.copy(op
.xattr
.name_len
, aname
);
5496 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
5497 string name
= "_" + aname
;
5498 name
[op
.xattr
.name_len
+ 1] = 0;
5501 result
= getattr_maybe_cache(
5505 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
5508 ctx
->delta_stats
.num_rd
++;
5509 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(xattr
.length(), 10);
5511 switch (op
.xattr
.cmp_mode
) {
5512 case CEPH_OSD_CMPXATTR_MODE_STRING
:
5515 bp
.copy(op
.xattr
.value_len
, val
);
5516 val
[op
.xattr
.value_len
] = 0;
5517 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
5518 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
5519 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
5523 case CEPH_OSD_CMPXATTR_MODE_U64
:
5527 ::decode(u64val
, bp
);
5529 catch (buffer::error
& e
) {
5533 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
5534 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
5535 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
5540 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
5545 dout(10) << "comparison returned false" << dendl
;
5546 result
= -ECANCELED
;
5550 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
5554 dout(10) << "comparison returned true" << dendl
;
5558 case CEPH_OSD_OP_ASSERT_VER
:
5561 uint64_t ver
= op
.assert_ver
.ver
;
5562 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
5565 else if (ver
< oi
.user_version
)
5567 else if (ver
> oi
.user_version
)
5568 result
= -EOVERFLOW
;
5572 case CEPH_OSD_OP_LIST_WATCHERS
:
5575 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5576 obj_list_watch_response_t resp
;
5578 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
5579 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
5581 dout(20) << "key cookie=" << oi_iter
->first
.first
5582 << " entity=" << oi_iter
->first
.second
<< " "
5583 << oi_iter
->second
<< dendl
;
5584 assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
5585 assert(oi_iter
->first
.second
.is_client());
5587 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
5588 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
5589 resp
.entries
.push_back(wi
);
5592 resp
.encode(osd_op
.outdata
, ctx
->get_features());
5595 ctx
->delta_stats
.num_rd
++;
5599 case CEPH_OSD_OP_LIST_SNAPS
:
5602 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5603 obj_list_snap_response_t resp
;
5606 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
5610 int clonecount
= ssc
->snapset
.clones
.size();
5611 if (ssc
->snapset
.head_exists
)
5613 resp
.clones
.reserve(clonecount
);
5614 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
5615 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
5617 ci
.cloneid
= *clone_iter
;
5619 hobject_t clone_oid
= soid
;
5620 clone_oid
.snap
= *clone_iter
;
5622 if (!ssc
->snapset
.is_legacy()) {
5623 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
5624 if (p
== ssc
->snapset
.clone_snaps
.end()) {
5625 osd
->clog
->error() << "osd." << osd
->whoami
5626 << ": inconsistent clone_snaps found for oid "
5627 << soid
<< " clone " << *clone_iter
5628 << " snapset " << ssc
->snapset
;
5632 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
5633 ci
.snaps
.push_back(*q
);
5636 /* No need to take a lock here. We are only inspecting state cached on
5637 * in the ObjectContext, so we aren't performing an actual read unless
5638 * the clone obc is not already loaded (in which case, it cannot have
5639 * an in progress write). We also do not risk exposing uncommitted
5640 * state since we do have a read lock on the head object or snapdir,
5641 * which we would have to write lock in order to make user visible
5642 * modifications to the snapshot state (snap trim related mutations
5643 * are not user visible).
5645 if (is_missing_object(clone_oid
)) {
5646 dout(20) << "LIST_SNAPS " << clone_oid
<< " missing" << dendl
;
5647 wait_for_unreadable_object(clone_oid
, ctx
->op
);
5652 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
5654 if (maybe_handle_cache(
5655 ctx
->op
, true, clone_obc
, -ENOENT
, clone_oid
, true)) {
5656 // promoting the clone
5659 osd
->clog
->error() << "osd." << osd
->whoami
5660 << ": missing clone " << clone_oid
5663 // should not happen
5668 for (vector
<snapid_t
>::reverse_iterator p
=
5669 clone_obc
->obs
.oi
.legacy_snaps
.rbegin();
5670 p
!= clone_obc
->obs
.oi
.legacy_snaps
.rend();
5672 ci
.snaps
.push_back(*p
);
5676 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
5678 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
5679 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
5680 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
5681 osd
->clog
->error() << "osd." << osd
->whoami
5682 << ": inconsistent clone_overlap found for oid "
5683 << soid
<< " clone " << *clone_iter
;
5687 const interval_set
<uint64_t> &o
= coi
->second
;
5688 ci
.overlap
.reserve(o
.num_intervals());
5689 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
5690 r
!= o
.end(); ++r
) {
5691 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
5695 map
<snapid_t
, uint64_t>::const_iterator si
;
5696 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
5697 if (si
== ssc
->snapset
.clone_size
.end()) {
5698 osd
->clog
->error() << "osd." << osd
->whoami
5699 << ": inconsistent clone_size found for oid "
5700 << soid
<< " clone " << *clone_iter
;
5704 ci
.size
= si
->second
;
5706 resp
.clones
.push_back(ci
);
5711 if (ssc
->snapset
.head_exists
&&
5712 !ctx
->obc
->obs
.oi
.is_whiteout()) {
5715 ci
.cloneid
= CEPH_NOSNAP
;
5717 //Size for HEAD is oi.size
5720 resp
.clones
.push_back(ci
);
5722 resp
.seq
= ssc
->snapset
.seq
;
5724 resp
.encode(osd_op
.outdata
);
5727 ctx
->delta_stats
.num_rd
++;
5731 case CEPH_OSD_OP_NOTIFY
:
5738 uint32_t ver
; // obsolete
5740 ::decode(timeout
, bp
);
5742 } catch (const buffer::error
&e
) {
5745 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
5747 timeout
= cct
->_conf
->osd_default_notify_timeout
;
5750 n
.timeout
= timeout
;
5751 n
.notify_id
= osd
->get_next_id(get_osdmap()->get_epoch());
5752 n
.cookie
= op
.watch
.cookie
;
5754 ctx
->notifies
.push_back(n
);
5756 // return our unique notify id to the client
5757 ::encode(n
.notify_id
, osd_op
.outdata
);
5761 case CEPH_OSD_OP_NOTIFY_ACK
:
5765 uint64_t notify_id
= 0;
5766 uint64_t watch_cookie
= 0;
5767 ::decode(notify_id
, bp
);
5768 ::decode(watch_cookie
, bp
);
5769 bufferlist reply_bl
;
5771 ::decode(reply_bl
, bp
);
5773 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
5774 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
5775 ctx
->notify_acks
.push_back(ack
);
5776 } catch (const buffer::error
&e
) {
5777 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
5778 OpContext::NotifyAck
ack(
5779 // op.watch.cookie is actually the notify_id for historical reasons
5782 ctx
->notify_acks
.push_back(ack
);
5787 case CEPH_OSD_OP_SETALLOCHINT
:
5790 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
5791 maybe_create_new_object(ctx
);
5792 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
5793 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
5794 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
5795 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
5796 op
.alloc_hint
.expected_write_size
,
5797 op
.alloc_hint
.flags
);
5798 ctx
->delta_stats
.num_wr
++;
5806 // -- object data --
5808 case CEPH_OSD_OP_WRITE
:
5811 __u32 seq
= oi
.truncate_seq
;
5812 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
5813 if (op
.extent
.length
!= osd_op
.indata
.length()) {
5818 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
5819 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
5821 if (pool
.info
.requires_aligned_append() &&
5822 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
5823 result
= -EOPNOTSUPP
;
5828 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
5829 result
= -EOPNOTSUPP
;
5832 } else if (op
.extent
.offset
!= oi
.size
&&
5833 pool
.info
.requires_aligned_append()) {
5834 result
= -EOPNOTSUPP
;
5838 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
5839 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
5840 // old write, arrived after trimtrunc
5841 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
5842 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
5843 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
5845 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
5846 osd_op
.indata
.swap(t
);
5848 if (op
.extent
.truncate_seq
> seq
) {
5849 // write arrives before trimtrunc
5850 if (obs
.exists
&& !oi
.is_whiteout()) {
5851 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
5852 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
5853 t
->truncate(soid
, op
.extent
.truncate_size
);
5854 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5855 oi
.truncate_size
= op
.extent
.truncate_size
;
5856 if (op
.extent
.truncate_size
!= oi
.size
) {
5857 ctx
->delta_stats
.num_bytes
-= oi
.size
;
5858 ctx
->delta_stats
.num_bytes
+= op
.extent
.truncate_size
;
5859 oi
.size
= op
.extent
.truncate_size
;
5862 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
5863 << ", but object is new" << dendl
;
5864 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5865 oi
.truncate_size
= op
.extent
.truncate_size
;
5868 result
= check_offset_and_length(op
.extent
.offset
, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5872 maybe_create_new_object(ctx
);
5874 if (op
.extent
.length
== 0) {
5875 if (op
.extent
.offset
> oi
.size
) {
5877 soid
, op
.extent
.offset
);
5883 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
5886 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
5887 && !skip_data_digest
) {
5888 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
5889 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
5890 if (skip_data_digest
) {
5891 obs
.oi
.clear_data_digest();
5893 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
5896 obs
.oi
.clear_data_digest();
5898 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
5899 op
.extent
.offset
, op
.extent
.length
);
5904 case CEPH_OSD_OP_WRITEFULL
:
5906 { // write full object
5907 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
5909 if (op
.extent
.length
!= osd_op
.indata
.length()) {
5913 result
= check_offset_and_length(0, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5917 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
5918 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
5920 maybe_create_new_object(ctx
);
5921 if (pool
.info
.require_rollback()) {
5922 t
->truncate(soid
, 0);
5923 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
5924 t
->truncate(soid
, op
.extent
.length
);
5926 if (op
.extent
.length
) {
5927 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
5929 if (!skip_data_digest
) {
5930 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
5932 obs
.oi
.clear_data_digest();
5935 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
5936 0, op
.extent
.length
, true);
5940 case CEPH_OSD_OP_WRITESAME
:
5942 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
5943 result
= do_writesame(ctx
, osd_op
);
5946 case CEPH_OSD_OP_ROLLBACK
:
5948 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5949 result
= _rollback_to(ctx
, op
);
5952 case CEPH_OSD_OP_ZERO
:
5953 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5954 if (pool
.info
.requires_aligned_append()) {
5955 result
= -EOPNOTSUPP
;
5960 result
= check_offset_and_length(op
.extent
.offset
, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5963 assert(op
.extent
.length
);
5964 if (obs
.exists
&& !oi
.is_whiteout()) {
5965 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
5966 interval_set
<uint64_t> ch
;
5967 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
5968 ctx
->modified_ranges
.union_of(ch
);
5969 ctx
->delta_stats
.num_wr
++;
5970 oi
.clear_data_digest();
5976 case CEPH_OSD_OP_CREATE
:
5979 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5980 int flags
= le32_to_cpu(op
.flags
);
5981 if (obs
.exists
&& !oi
.is_whiteout() &&
5982 (flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
5983 result
= -EEXIST
; /* this is an exclusive create */
5985 if (osd_op
.indata
.length()) {
5986 bufferlist::iterator p
= osd_op
.indata
.begin();
5989 ::decode(category
, p
);
5991 catch (buffer::error
& e
) {
5995 // category is no longer implemented.
5998 maybe_create_new_object(ctx
);
6005 case CEPH_OSD_OP_TRIMTRUNC
:
6006 op
.extent
.offset
= op
.extent
.truncate_size
;
6009 case CEPH_OSD_OP_TRUNCATE
:
6010 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6011 if (pool
.info
.requires_aligned_append()) {
6012 result
= -EOPNOTSUPP
;
6018 if (!obs
.exists
|| oi
.is_whiteout()) {
6019 dout(10) << " object dne, truncate is a no-op" << dendl
;
6023 if (op
.extent
.offset
> cct
->_conf
->osd_max_object_size
) {
6028 if (op
.extent
.truncate_seq
) {
6029 assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6030 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6031 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6032 << ", no-op" << dendl
;
6035 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6036 << ", truncating" << dendl
;
6037 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6038 oi
.truncate_size
= op
.extent
.truncate_size
;
6041 maybe_create_new_object(ctx
);
6042 t
->truncate(soid
, op
.extent
.offset
);
6043 if (oi
.size
> op
.extent
.offset
) {
6044 interval_set
<uint64_t> trim
;
6045 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6046 ctx
->modified_ranges
.union_of(trim
);
6048 if (op
.extent
.offset
!= oi
.size
) {
6049 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6050 ctx
->delta_stats
.num_bytes
+= op
.extent
.offset
;
6051 oi
.size
= op
.extent
.offset
;
6053 ctx
->delta_stats
.num_wr
++;
6054 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6056 oi
.clear_data_digest();
6060 case CEPH_OSD_OP_DELETE
:
6062 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6064 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6068 case CEPH_OSD_OP_WATCH
:
6071 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6072 op
.watch
.cookie
, op
.watch
.op
);
6077 uint64_t cookie
= op
.watch
.cookie
;
6078 entity_name_t entity
= ctx
->reqid
.name
;
6079 ObjectContextRef obc
= ctx
->obc
;
6081 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6082 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6083 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6084 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6085 dout(10) << "watch: peer_addr="
6086 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6088 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6089 if (op
.watch
.timeout
!= 0) {
6090 timeout
= op
.watch
.timeout
;
6093 watch_info_t
w(cookie
, timeout
,
6094 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6095 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6096 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6097 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6098 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6100 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6101 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6102 t
->nop(soid
); // make sure update the object_info on disk!
6104 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6105 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6106 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
6107 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6111 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6112 ctx
->watch_connects
.push_back(make_pair(w
, true));
6113 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
6114 /* Note: WATCH with PING doesn't cause may_write() to return true,
6115 * so if there is nothing else in the transaction, this is going
6116 * to run do_osd_op_effects, but not write out a log entry */
6117 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
6121 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
6122 obc
->watchers
.find(make_pair(cookie
, entity
));
6123 if (p
== obc
->watchers
.end() ||
6124 !p
->second
->is_connected()) {
6125 // client needs to reconnect
6126 result
= -ETIMEDOUT
;
6129 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6130 p
->second
->got_ping(ceph_clock_now());
6132 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
6133 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
6134 oi
.watchers
.find(make_pair(cookie
, entity
));
6135 if (oi_iter
!= oi
.watchers
.end()) {
6136 dout(10) << " removed watch " << oi_iter
->second
<< " by "
6138 oi
.watchers
.erase(oi_iter
);
6139 t
->nop(soid
); // update oi on disk
6140 ctx
->watch_disconnects
.push_back(
6141 watch_disconnect_t(cookie
, entity
, false));
6143 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
6149 case CEPH_OSD_OP_CACHE_PIN
:
6150 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6151 if ((!pool
.info
.is_tier() ||
6152 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6154 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6159 if (!obs
.exists
|| oi
.is_whiteout()) {
6164 if (!oi
.is_cache_pinned()) {
6165 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
6167 ctx
->delta_stats
.num_objects_pinned
++;
6168 ctx
->delta_stats
.num_wr
++;
6174 case CEPH_OSD_OP_CACHE_UNPIN
:
6175 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6176 if ((!pool
.info
.is_tier() ||
6177 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
6179 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
6184 if (!obs
.exists
|| oi
.is_whiteout()) {
6189 if (oi
.is_cache_pinned()) {
6190 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
6192 ctx
->delta_stats
.num_objects_pinned
--;
6193 ctx
->delta_stats
.num_wr
++;
6199 case CEPH_OSD_OP_SET_REDIRECT
:
6202 if (pool
.info
.is_tier()) {
6210 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
6211 result
= -EOPNOTSUPP
;
6215 object_t target_name
;
6216 object_locator_t target_oloc
;
6217 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
6218 version_t target_version
= op
.copy_from
.src_version
;
6220 ::decode(target_name
, bp
);
6221 ::decode(target_oloc
, bp
);
6223 catch (buffer::error
& e
) {
6228 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
6229 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
6230 raw_pg
.ps(), raw_pg
.pool(),
6231 target_oloc
.nspace
);
6232 if (target
== soid
) {
6233 dout(20) << " set-redirect self is invalid" << dendl
;
6237 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
6238 oi
.manifest
.redirect_target
= target
;
6239 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
6240 t
->truncate(soid
, 0);
6241 if (oi
.is_omap() && pool
.info
.supports_omap()) {
6242 t
->omap_clear(soid
);
6243 obs
.oi
.clear_omap_digest();
6244 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6246 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6249 oi
.user_version
= target_version
;
6250 ctx
->user_at_version
= target_version
;
6252 map
<string
,bufferlist
> rmattrs
;
6253 result
= getattrs_maybe_cache(ctx
->obc
,
6258 map
<string
, bufferlist
>::iterator iter
;
6259 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
6260 const string
& name
= iter
->first
;
6261 t
->rmattr(soid
, name
);
6263 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
6268 // -- object attrs --
6270 case CEPH_OSD_OP_SETXATTR
:
6273 if (cct
->_conf
->osd_max_attr_size
> 0 &&
6274 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
6275 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6279 unsigned max_name_len
= MIN(osd
->store
->get_max_attr_name_length(),
6280 cct
->_conf
->osd_max_attr_name_len
);
6281 if (op
.xattr
.name_len
> max_name_len
) {
6282 result
= -ENAMETOOLONG
;
6285 maybe_create_new_object(ctx
);
6287 bp
.copy(op
.xattr
.name_len
, aname
);
6288 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6289 string name
= "_" + aname
;
6291 bp
.copy(op
.xattr
.value_len
, bl
);
6292 t
->setattr(soid
, name
, bl
);
6293 ctx
->delta_stats
.num_wr
++;
6297 case CEPH_OSD_OP_RMXATTR
:
6301 bp
.copy(op
.xattr
.name_len
, aname
);
6302 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6303 if (!obs
.exists
|| oi
.is_whiteout()) {
6307 string name
= "_" + aname
;
6308 t
->rmattr(soid
, name
);
6309 ctx
->delta_stats
.num_wr
++;
6314 // -- fancy writers --
6315 case CEPH_OSD_OP_APPEND
:
6317 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6318 // just do it inline; this works because we are happy to execute
6319 // fancy op on replicas as well.
6320 vector
<OSDOp
> nops(1);
6321 OSDOp
& newop
= nops
[0];
6322 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
6323 newop
.op
.extent
.offset
= oi
.size
;
6324 newop
.op
.extent
.length
= op
.extent
.length
;
6325 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
6326 newop
.indata
= osd_op
.indata
;
6327 result
= do_osd_ops(ctx
, nops
);
6328 osd_op
.outdata
.claim(newop
.outdata
);
6332 case CEPH_OSD_OP_STARTSYNC
:
6333 tracepoint(osd
, do_osd_op_pre_startsync
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6338 // -- trivial map --
6339 case CEPH_OSD_OP_TMAPGET
:
6340 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6341 if (pool
.info
.require_rollback()) {
6342 result
= -EOPNOTSUPP
;
6346 vector
<OSDOp
> nops(1);
6347 OSDOp
& newop
= nops
[0];
6348 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
6349 newop
.op
.extent
.offset
= 0;
6350 newop
.op
.extent
.length
= 0;
6351 do_osd_ops(ctx
, nops
);
6352 osd_op
.outdata
.claim(newop
.outdata
);
6356 case CEPH_OSD_OP_TMAPPUT
:
6357 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6358 if (pool
.info
.require_rollback()) {
6359 result
= -EOPNOTSUPP
;
6363 //_dout_lock.Lock();
6364 //osd_op.data.hexdump(*_dout);
6365 //_dout_lock.Unlock();
6367 // verify sort order
6368 bool unsorted
= false;
6371 ::decode(header
, bp
);
6378 dout(10) << "tmapput key " << key
<< dendl
;
6381 if (key
< last_key
) {
6382 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
6391 vector
<OSDOp
> nops(1);
6392 OSDOp
& newop
= nops
[0];
6393 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
6394 newop
.op
.extent
.offset
= 0;
6395 newop
.op
.extent
.length
= osd_op
.indata
.length();
6396 newop
.indata
= osd_op
.indata
;
6399 bp
= osd_op
.indata
.begin();
6401 map
<string
, bufferlist
> m
;
6402 ::decode(header
, bp
);
6406 ::encode(header
, newbl
);
6408 newop
.indata
= newbl
;
6410 result
= do_osd_ops(ctx
, nops
);
6411 assert(result
== 0);
6415 case CEPH_OSD_OP_TMAPUP
:
6416 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6417 if (pool
.info
.require_rollback()) {
6418 result
= -EOPNOTSUPP
;
6422 result
= do_tmapup(ctx
, bp
, osd_op
);
6425 case CEPH_OSD_OP_TMAP2OMAP
:
6427 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6428 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
6432 case CEPH_OSD_OP_OMAPGETKEYS
:
6436 uint64_t max_return
;
6438 ::decode(start_after
, bp
);
6439 ::decode(max_return
, bp
);
6441 catch (buffer::error
& e
) {
6443 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
6446 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
6447 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
6449 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
6453 bool truncated
= false;
6455 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
6456 coll
, ghobject_t(soid
)
6459 iter
->upper_bound(start_after
);
6460 for (num
= 0; iter
->valid(); ++num
, iter
->next(false)) {
6461 if (num
>= max_return
||
6462 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
6466 ::encode(iter
->key(), bl
);
6468 } // else return empty out_set
6469 ::encode(num
, osd_op
.outdata
);
6470 osd_op
.outdata
.claim_append(bl
);
6471 ::encode(truncated
, osd_op
.outdata
);
6472 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6473 ctx
->delta_stats
.num_rd
++;
6477 case CEPH_OSD_OP_OMAPGETVALS
:
6481 uint64_t max_return
;
6482 string filter_prefix
;
6484 ::decode(start_after
, bp
);
6485 ::decode(max_return
, bp
);
6486 ::decode(filter_prefix
, bp
);
6488 catch (buffer::error
& e
) {
6490 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
6493 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
6494 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
6496 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
6499 bool truncated
= false;
6502 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
6503 coll
, ghobject_t(soid
)
6509 iter
->upper_bound(start_after
);
6510 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
6513 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
6514 ++num
, iter
->next(false)) {
6515 dout(20) << "Found key " << iter
->key() << dendl
;
6516 if (num
>= max_return
||
6517 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
6521 ::encode(iter
->key(), bl
);
6522 ::encode(iter
->value(), bl
);
6524 } // else return empty out_set
6525 ::encode(num
, osd_op
.outdata
);
6526 osd_op
.outdata
.claim_append(bl
);
6527 ::encode(truncated
, osd_op
.outdata
);
6528 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6529 ctx
->delta_stats
.num_rd
++;
6533 case CEPH_OSD_OP_OMAPGETHEADER
:
6534 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6535 if (!oi
.is_omap()) {
6536 // return empty header
6541 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
6542 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6543 ctx
->delta_stats
.num_rd
++;
6547 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
6550 set
<string
> keys_to_get
;
6552 ::decode(keys_to_get
, bp
);
6554 catch (buffer::error
& e
) {
6556 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6559 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
6560 map
<string
, bufferlist
> out
;
6562 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
6563 } // else return empty omap entries
6564 ::encode(out
, osd_op
.outdata
);
6565 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6566 ctx
->delta_stats
.num_rd
++;
6570 case CEPH_OSD_OP_OMAP_CMP
:
6573 if (!obs
.exists
|| oi
.is_whiteout()) {
6575 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6578 map
<string
, pair
<bufferlist
, int> > assertions
;
6580 ::decode(assertions
, bp
);
6582 catch (buffer::error
& e
) {
6584 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6587 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
6589 map
<string
, bufferlist
> out
;
6593 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
6594 i
!= assertions
.end();
6596 to_get
.insert(i
->first
);
6597 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
6603 } // else leave out empty
6605 //Should set num_rd_kb based on encode length of map
6606 ctx
->delta_stats
.num_rd
++;
6610 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
6611 i
!= assertions
.end();
6613 auto out_entry
= out
.find(i
->first
);
6614 bufferlist
&bl
= (out_entry
!= out
.end()) ?
6615 out_entry
->second
: empty
;
6616 switch (i
->second
.second
) {
6617 case CEPH_OSD_CMPXATTR_OP_EQ
:
6618 if (!(bl
== i
->second
.first
)) {
6622 case CEPH_OSD_CMPXATTR_OP_LT
:
6623 if (!(bl
< i
->second
.first
)) {
6627 case CEPH_OSD_CMPXATTR_OP_GT
:
6628 if (!(bl
> i
->second
.first
)) {
6646 case CEPH_OSD_OP_OMAPSETVALS
:
6647 if (!pool
.info
.supports_omap()) {
6648 result
= -EOPNOTSUPP
;
6649 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6654 maybe_create_new_object(ctx
);
6655 bufferlist to_set_bl
;
6657 decode_str_str_map_to_bl(bp
, &to_set_bl
);
6659 catch (buffer::error
& e
) {
6661 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6664 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6665 if (cct
->_conf
->subsys
.should_gather(dout_subsys
, 20)) {
6666 dout(20) << "setting vals: " << dendl
;
6667 map
<string
,bufferlist
> to_set
;
6668 bufferlist::iterator pt
= to_set_bl
.begin();
6669 ::decode(to_set
, pt
);
6670 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
6673 dout(20) << "\t" << i
->first
<< dendl
;
6676 t
->omap_setkeys(soid
, to_set_bl
);
6677 ctx
->delta_stats
.num_wr
++;
6679 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6680 obs
.oi
.clear_omap_digest();
6683 case CEPH_OSD_OP_OMAPSETHEADER
:
6684 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6685 if (!pool
.info
.supports_omap()) {
6686 result
= -EOPNOTSUPP
;
6691 maybe_create_new_object(ctx
);
6692 t
->omap_setheader(soid
, osd_op
.indata
);
6693 ctx
->delta_stats
.num_wr
++;
6695 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6696 obs
.oi
.clear_omap_digest();
6699 case CEPH_OSD_OP_OMAPCLEAR
:
6700 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6701 if (!pool
.info
.supports_omap()) {
6702 result
= -EOPNOTSUPP
;
6707 if (!obs
.exists
|| oi
.is_whiteout()) {
6712 t
->omap_clear(soid
);
6713 ctx
->delta_stats
.num_wr
++;
6714 obs
.oi
.clear_omap_digest();
6715 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6720 case CEPH_OSD_OP_OMAPRMKEYS
:
6721 if (!pool
.info
.supports_omap()) {
6722 result
= -EOPNOTSUPP
;
6723 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6728 if (!obs
.exists
|| oi
.is_whiteout()) {
6730 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6733 bufferlist to_rm_bl
;
6735 decode_str_set_to_bl(bp
, &to_rm_bl
);
6737 catch (buffer::error
& e
) {
6739 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6742 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6743 t
->omap_rmkeys(soid
, to_rm_bl
);
6744 ctx
->delta_stats
.num_wr
++;
6746 obs
.oi
.clear_omap_digest();
6749 case CEPH_OSD_OP_COPY_GET
:
6751 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
6753 if (op_finisher
== nullptr) {
6754 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
6756 result
= op_finisher
->execute();
6760 case CEPH_OSD_OP_COPY_FROM
:
6764 object_locator_t src_oloc
;
6765 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
6766 version_t src_version
= op
.copy_from
.src_version
;
6768 ::decode(src_name
, bp
);
6769 ::decode(src_oloc
, bp
);
6771 catch (buffer::error
& e
) {
6774 do_osd_op_pre_copy_from
,
6775 soid
.oid
.name
.c_str(),
6787 do_osd_op_pre_copy_from
,
6788 soid
.oid
.name
.c_str(),
6790 src_name
.name
.c_str(),
6792 src_oloc
.key
.c_str(),
6793 src_oloc
.nspace
.c_str(),
6797 if (op_finisher
== nullptr) {
6800 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
6801 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
6802 raw_pg
.ps(), raw_pg
.pool(),
6805 dout(20) << " copy from self is invalid" << dendl
;
6809 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
6810 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
6811 new CopyFromFinisher(cb
));
6812 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
6815 op
.copy_from
.src_fadvise_flags
,
6817 result
= -EINPROGRESS
;
6820 result
= op_finisher
->execute();
6821 assert(result
== 0);
6823 // COPY_FROM cannot be executed multiple times -- it must restart
6824 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
6830 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
6831 dout(1) << "unrecognized osd op " << op
.op
6832 << " " << ceph_osd_op_name(op
.op
)
6834 result
= -EOPNOTSUPP
;
6838 osd_op
.rval
= result
;
6839 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
6840 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
))
6849 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
6851 if (ctx
->new_obs
.oi
.size
== 0) {
6852 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
6855 vector
<OSDOp
> nops(1);
6856 OSDOp
&newop
= nops
[0];
6857 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
6858 do_osd_ops(ctx
, nops
);
6860 bufferlist::iterator i
= newop
.outdata
.begin();
6861 ::decode(*header
, i
);
6862 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
6864 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
6868 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
6873 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
6876 // verify that all clones have been evicted
6877 dout(20) << __func__
<< " verifying clones are absent "
6879 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
6880 p
!= ss
.clones
.end();
6882 hobject_t clone_oid
= soid
;
6883 clone_oid
.snap
= *p
;
6884 if (is_missing_object(clone_oid
))
6886 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
6887 if (clone_obc
&& clone_obc
->obs
.exists
) {
6888 dout(10) << __func__
<< " cannot evict head before clone "
6889 << clone_oid
<< dendl
;
6892 if (copy_ops
.count(clone_oid
)) {
6893 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
6894 << clone_oid
<< dendl
;
6901 inline int PrimaryLogPG::_delete_oid(
6903 bool no_whiteout
, // no whiteouts, no matter what.
6904 bool try_no_whiteout
) // try not to whiteout
6906 SnapSet
& snapset
= ctx
->new_snapset
;
6907 ObjectState
& obs
= ctx
->new_obs
;
6908 object_info_t
& oi
= obs
.oi
;
6909 const hobject_t
& soid
= oi
.soid
;
6910 PGTransaction
* t
= ctx
->op_t
.get();
6912 // cache: cache: set whiteout on delete?
6913 bool whiteout
= false;
6914 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
6916 && !try_no_whiteout
) {
6920 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
6922 // in luminous or later, we can't delete the head if there are
6923 // clones. we trust the caller passing no_whiteout has already
6924 // verified they don't exist.
6925 if (!snapset
.clones
.empty() ||
6926 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
6928 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
6931 dout(20) << __func__
<< " has or will have clones; will whiteout"
6939 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
6940 << " no_whiteout=" << (int)no_whiteout
6941 << " try_no_whiteout=" << (int)try_no_whiteout
6943 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
6949 interval_set
<uint64_t> ch
;
6950 ch
.insert(0, oi
.size
);
6951 ctx
->modified_ranges
.union_of(ch
);
6954 ctx
->delta_stats
.num_wr
++;
6955 if (soid
.is_snap()) {
6956 assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
6957 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
6959 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6964 // disconnect all watchers
6965 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
6966 oi
.watchers
.begin();
6967 p
!= oi
.watchers
.end();
6969 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
6970 ctx
->watch_disconnects
.push_back(
6971 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
6973 oi
.watchers
.clear();
6976 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
6977 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
6978 ctx
->delta_stats
.num_whiteouts
++;
6980 osd
->logger
->inc(l_osd_tier_whiteout
);
6985 ctx
->delta_stats
.num_objects
--;
6987 ctx
->delta_stats
.num_object_clones
--;
6988 if (oi
.is_whiteout()) {
6989 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
6990 ctx
->delta_stats
.num_whiteouts
--;
6991 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
6993 if (oi
.is_cache_pinned()) {
6994 ctx
->delta_stats
.num_objects_pinned
--;
6996 if ((legacy
|| snapset
.is_legacy()) && soid
.is_head()) {
6997 snapset
.head_exists
= false;
7003 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, ceph_osd_op
& op
)
7005 SnapSet
& snapset
= ctx
->new_snapset
;
7006 ObjectState
& obs
= ctx
->new_obs
;
7007 object_info_t
& oi
= obs
.oi
;
7008 const hobject_t
& soid
= oi
.soid
;
7009 PGTransaction
* t
= ctx
->op_t
.get();
7010 snapid_t snapid
= (uint64_t)op
.snap
.snapid
;
7011 hobject_t missing_oid
;
7013 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
7015 ObjectContextRef rollback_to
;
7016 int ret
= find_object_context(
7017 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
7018 soid
.get_namespace()),
7019 &rollback_to
, false, false, &missing_oid
);
7020 if (ret
== -EAGAIN
) {
7021 /* clone must be missing */
7022 assert(is_degraded_or_backfilling_object(missing_oid
));
7023 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7024 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
7025 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
7029 ObjectContextRef promote_obc
;
7030 cache_result_t tier_mode_result
;
7031 if (obs
.exists
&& obs
.oi
.has_manifest()) {
7033 maybe_handle_manifest_detail(
7039 maybe_handle_cache_detail(
7049 switch (tier_mode_result
) {
7050 case cache_result_t::NOOP
:
7052 case cache_result_t::BLOCKED_PROMOTE
:
7053 assert(promote_obc
);
7054 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
7056 case cache_result_t::BLOCKED_FULL
:
7057 block_write_on_full_cache(soid
, ctx
->op
);
7059 case cache_result_t::REPLIED_WITH_EAGAIN
:
7060 assert(0 == "this can't happen, no rollback on replica");
7062 assert(0 == "must promote was set, other values are not valid");
7067 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
7068 // there's no snapshot here, or there's no object.
7069 // if there's no snapshot, we delete the object; otherwise, do nothing.
7070 dout(20) << "_rollback_to deleting head on " << soid
.oid
7071 << " because got ENOENT|whiteout on find_object_context" << dendl
;
7072 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
7073 // Cannot delete an object with watchers
7076 _delete_oid(ctx
, false, false);
7080 // ummm....huh? It *can't* return anything else at time of writing.
7081 assert(0 == "unexpected error code in _rollback_to");
7082 } else { //we got our context, let's use it to do the rollback!
7083 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
7084 if (is_degraded_or_backfilling_object(rollback_to_sobject
)) {
7085 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7086 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
7087 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
7089 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
7090 // rolling back to the head; we just need to clone it.
7093 /* 1) Delete current head
7094 * 2) Clone correct snapshot into head
7095 * 3) Calculate clone_overlaps by following overlaps
7096 * forward from rollback snapshot */
7097 dout(10) << "_rollback_to deleting " << soid
.oid
7098 << " and rolling back to old snap" << dendl
;
7103 t
->clone(soid
, rollback_to_sobject
);
7104 snapset
.head_exists
= true;
7105 t
->add_obc(rollback_to
);
7107 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
7108 snapset
.clone_overlap
.lower_bound(snapid
);
7109 interval_set
<uint64_t> overlaps
= iter
->second
;
7110 assert(iter
!= snapset
.clone_overlap
.end());
7112 iter
!= snapset
.clone_overlap
.end();
7114 overlaps
.intersection_of(iter
->second
);
7116 if (obs
.oi
.size
> 0) {
7117 interval_set
<uint64_t> modified
;
7118 modified
.insert(0, obs
.oi
.size
);
7119 overlaps
.intersection_of(modified
);
7120 modified
.subtract(overlaps
);
7121 ctx
->modified_ranges
.union_of(modified
);
7124 // Adjust the cached objectcontext
7125 maybe_create_new_object(ctx
, true);
7126 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
7127 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
7128 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
7129 if (rollback_to
->obs
.oi
.is_data_digest())
7130 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
7132 obs
.oi
.clear_data_digest();
7133 if (rollback_to
->obs
.oi
.is_omap_digest())
7134 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
7136 obs
.oi
.clear_omap_digest();
7138 if (rollback_to
->obs
.oi
.is_omap()) {
7139 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
7140 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7142 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
7143 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7146 snapset
.head_exists
= true;
7152 void PrimaryLogPG::_make_clone(
7155 ObjectContextRef obc
,
7156 const hobject_t
& head
, const hobject_t
& coid
,
7160 ::encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7162 t
->clone(coid
, head
);
7163 setattr_maybe_cache(obc
, ctx
, t
, OI_ATTR
, bv
);
7164 rmattr_maybe_cache(obc
, ctx
, t
, SS_ATTR
);
7167 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
7169 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7170 SnapContext
& snapc
= ctx
->snapc
;
7173 assert(soid
.snap
== CEPH_NOSNAP
);
7174 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
7175 << " snapc=" << snapc
<< dendl
;
7177 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
7178 if (ctx
->new_obs
.exists
) {
7179 // we will mark the object dirty
7180 if (ctx
->undirty
&& was_dirty
) {
7181 dout(20) << " clearing DIRTY flag" << dendl
;
7182 assert(ctx
->new_obs
.oi
.is_dirty());
7183 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
7184 --ctx
->delta_stats
.num_objects_dirty
;
7185 osd
->logger
->inc(l_osd_tier_clean
);
7186 } else if (!was_dirty
&& !ctx
->undirty
) {
7187 dout(20) << " setting DIRTY flag" << dendl
;
7188 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
7189 ++ctx
->delta_stats
.num_objects_dirty
;
7190 osd
->logger
->inc(l_osd_tier_dirty
);
7194 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
7195 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
7196 --ctx
->delta_stats
.num_objects_dirty
;
7200 if ((ctx
->new_obs
.exists
&&
7201 ctx
->new_obs
.oi
.is_omap()) &&
7202 (!ctx
->obc
->obs
.exists
||
7203 !ctx
->obc
->obs
.oi
.is_omap())) {
7204 ++ctx
->delta_stats
.num_objects_omap
;
7206 if ((!ctx
->new_obs
.exists
||
7207 !ctx
->new_obs
.oi
.is_omap()) &&
7208 (ctx
->obc
->obs
.exists
&&
7209 ctx
->obc
->obs
.oi
.is_omap())) {
7210 --ctx
->delta_stats
.num_objects_omap
;
7214 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
7215 snapc
.seq
= ctx
->new_snapset
.seq
;
7216 snapc
.snaps
= ctx
->new_snapset
.snaps
;
7217 filter_snapc(snapc
.snaps
);
7218 dout(10) << " using newer snapc " << snapc
<< dendl
;
7221 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
7222 snapc
.snaps
.size() && // there are snaps
7223 !ctx
->cache_evict
&&
7224 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
7226 hobject_t coid
= soid
;
7227 coid
.snap
= snapc
.seq
;
7230 for (l
=1; l
<snapc
.snaps
.size() && snapc
.snaps
[l
] > ctx
->new_snapset
.seq
; l
++) ;
7232 vector
<snapid_t
> snaps(l
);
7233 for (unsigned i
=0; i
<l
; i
++)
7234 snaps
[i
] = snapc
.snaps
[i
];
7237 object_info_t
static_snap_oi(coid
);
7238 object_info_t
*snap_oi
;
7240 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
7241 ctx
->clone_obc
->destructor_callback
= new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
7242 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
7243 ctx
->clone_obc
->obs
.exists
= true;
7244 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
7245 ctx
->clone_obc
->ssc
->ref
++;
7246 if (pool
.info
.require_rollback())
7247 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
7248 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
7249 bool got
= ctx
->lock_manager
.get_write_greedy(
7254 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
7256 snap_oi
= &static_snap_oi
;
7258 snap_oi
->version
= ctx
->at_version
;
7259 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
7260 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
7262 bool legacy
= ctx
->new_snapset
.is_legacy() ||
7263 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
7265 snap_oi
->legacy_snaps
= snaps
;
7268 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
7270 ctx
->delta_stats
.num_objects
++;
7271 if (snap_oi
->is_dirty()) {
7272 ctx
->delta_stats
.num_objects_dirty
++;
7273 osd
->logger
->inc(l_osd_tier_dirty
);
7275 if (snap_oi
->is_omap())
7276 ctx
->delta_stats
.num_objects_omap
++;
7277 if (snap_oi
->is_cache_pinned())
7278 ctx
->delta_stats
.num_objects_pinned
++;
7279 ctx
->delta_stats
.num_object_clones
++;
7280 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
7281 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
7283 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
7286 // clone_overlap should contain an entry for each clone
7287 // (an empty interval_set if there is no overlap)
7288 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
7289 if (ctx
->obs
->oi
.size
)
7290 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
7293 dout(10) << " cloning v " << ctx
->obs
->oi
.version
7294 << " to " << coid
<< " v " << ctx
->at_version
7295 << " snaps=" << snaps
7296 << " snapset=" << ctx
->new_snapset
<< dendl
;
7297 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
7298 ctx
->obs
->oi
.version
,
7299 ctx
->obs
->oi
.user_version
,
7300 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
7301 ::encode(snaps
, ctx
->log
.back().snaps
);
7303 ctx
->at_version
.version
++;
7306 // update most recent clone_overlap and usage stats
7307 if (ctx
->new_snapset
.clones
.size() > 0) {
7308 /* we need to check whether the most recent clone exists, if it's been evicted,
7309 * it's not included in the stats */
7310 hobject_t last_clone_oid
= soid
;
7311 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
7312 if (is_present_clone(last_clone_oid
)) {
7313 interval_set
<uint64_t> &newest_overlap
= ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
7314 ctx
->modified_ranges
.intersection_of(newest_overlap
);
7315 // modified_ranges is still in use by the clone
7316 add_interval_usage(ctx
->modified_ranges
, ctx
->delta_stats
);
7317 newest_overlap
.subtract(ctx
->modified_ranges
);
7321 // update snapset with latest snap context
7322 ctx
->new_snapset
.seq
= snapc
.seq
;
7323 ctx
->new_snapset
.snaps
= snapc
.snaps
;
7324 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7325 // pessimistic assumption that this is a net-new legacy SnapSet
7326 ctx
->delta_stats
.num_legacy_snapsets
++;
7327 ctx
->new_snapset
.head_exists
= ctx
->new_obs
.exists
;
7328 } else if (ctx
->new_snapset
.is_legacy()) {
7329 ctx
->new_snapset
.head_exists
= ctx
->new_obs
.exists
;
7331 dout(20) << "make_writeable " << soid
7332 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
7336 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
7337 interval_set
<uint64_t>& modified
, uint64_t offset
,
7338 uint64_t length
, bool write_full
)
7340 interval_set
<uint64_t> ch
;
7343 ch
.insert(0, oi
.size
);
7345 ch
.insert(offset
, length
);
7346 modified
.union_of(ch
);
7347 if (write_full
|| offset
+ length
> oi
.size
) {
7348 uint64_t new_size
= offset
+ length
;
7349 delta_stats
.num_bytes
-= oi
.size
;
7350 delta_stats
.num_bytes
+= new_size
;
7353 delta_stats
.num_wr
++;
7354 delta_stats
.num_wr_kb
+= SHIFT_ROUND_UP(length
, 10);
7357 void PrimaryLogPG::add_interval_usage(interval_set
<uint64_t>& s
, object_stat_sum_t
& delta_stats
)
7359 for (interval_set
<uint64_t>::const_iterator p
= s
.begin(); p
!= s
.end(); ++p
) {
7360 delta_stats
.num_bytes
+= p
.get_len();
7364 void PrimaryLogPG::complete_disconnect_watches(
7365 ObjectContextRef obc
,
7366 const list
<watch_disconnect_t
> &to_disconnect
)
7368 for (list
<watch_disconnect_t
>::const_iterator i
=
7369 to_disconnect
.begin();
7370 i
!= to_disconnect
.end();
7372 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
7373 auto watchers_entry
= obc
->watchers
.find(watcher
);
7374 if (watchers_entry
!= obc
->watchers
.end()) {
7375 WatchRef watch
= watchers_entry
->second
;
7376 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
7377 obc
->watchers
.erase(watcher
);
7378 watch
->remove(i
->send_disconnect
);
7380 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7381 << watcher
<< dendl
;
7386 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
7388 entity_name_t entity
= ctx
->reqid
.name
;
7389 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
7391 // disconnects first
7392 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
7396 boost::intrusive_ptr
<Session
> session((Session
*)conn
->get_priv());
7399 session
->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7401 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
7402 i
!= ctx
->watch_connects
.end();
7404 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
7405 dout(15) << "do_osd_op_effects applying watch connect on session "
7406 << session
.get() << " watcher " << watcher
<< dendl
;
7408 if (ctx
->obc
->watchers
.count(watcher
)) {
7409 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7411 watch
= ctx
->obc
->watchers
[watcher
];
7413 dout(15) << "do_osd_op_effects new watcher " << watcher
7415 watch
= Watch::makeWatchRef(
7416 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
7417 i
->first
.cookie
, entity
, conn
->get_peer_addr());
7418 ctx
->obc
->watchers
.insert(
7423 watch
->connect(conn
, i
->second
);
7426 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
7427 p
!= ctx
->notifies
.end();
7429 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
7430 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
7432 Notify::makeNotifyRef(
7434 ctx
->reqid
.name
.num(),
7439 ctx
->obc
->obs
.oi
.user_version
,
7441 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
7442 ctx
->obc
->watchers
.begin();
7443 i
!= ctx
->obc
->watchers
.end();
7445 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
7446 i
->second
->start_notify(notif
);
7451 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
7452 p
!= ctx
->notify_acks
.end();
7454 if (p
->watch_cookie
)
7455 dout(10) << "notify_ack " << make_pair(p
->watch_cookie
.get(), p
->notify_id
) << dendl
;
7457 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
7458 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
7459 ctx
->obc
->watchers
.begin();
7460 i
!= ctx
->obc
->watchers
.end();
7462 if (i
->first
.second
!= entity
) continue;
7463 if (p
->watch_cookie
&&
7464 p
->watch_cookie
.get() != i
->first
.first
) continue;
7465 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
7466 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
7471 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
7474 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
7475 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
7476 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
7477 dout(20) << __func__
<< " " << hoid
<< dendl
;
7481 hobject_t
PrimaryLogPG::get_temp_recovery_object(
7482 const hobject_t
& target
,
7486 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
7488 << "_" << info
.history
.same_interval_since
7489 << "_" << target
.snap
;
7490 // pgid + version + interval + snapid is unique, and short
7491 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
7492 dout(20) << __func__
<< " " << hoid
<< dendl
;
7496 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
7498 assert(!ctx
->ops
->empty());
7500 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7502 // valid snap context?
7503 if (!ctx
->snapc
.is_valid()) {
7504 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
7508 // prepare the actual mutation
7509 int result
= do_osd_ops(ctx
, *ctx
->ops
);
7511 if (ctx
->op
->may_write() &&
7512 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
7513 // need to save the error code in the pg log, to detect dup ops,
7514 // but do nothing else
7515 ctx
->update_log_only
= true;
7520 // read-op? write-op noop? done?
7521 if (ctx
->op_t
->empty() && !ctx
->modify
) {
7522 unstable_stats
.add(ctx
->delta_stats
);
7523 if (ctx
->op
->may_write() &&
7524 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
7525 ctx
->update_log_only
= true;
7531 if ((ctx
->delta_stats
.num_bytes
> 0 ||
7532 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
7533 (pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ||
7534 get_osdmap()->test_flag(CEPH_OSDMAP_FULL
))) {
7535 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
7536 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
7537 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
7538 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
7540 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
7541 // they tried, they failed.
7542 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
7543 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ? -EDQUOT
: -ENOSPC
;
7546 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
7551 // clone, if necessary
7552 if (soid
.snap
== CEPH_NOSNAP
)
7553 make_writeable(ctx
);
7556 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
7557 pg_log_entry_t::DELETE
);
7562 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, bool maintain_ssc
)
7564 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7565 dout(20) << __func__
<< " " << soid
<< " " << ctx
7566 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
7568 utime_t now
= ceph_clock_now();
7573 if (soid
.snap
== CEPH_NOSNAP
&& maintain_ssc
) {
7574 ::encode(ctx
->new_snapset
, bss
);
7575 assert(ctx
->new_obs
.exists
== ctx
->new_snapset
.head_exists
||
7576 !ctx
->new_snapset
.is_legacy());
7578 if (ctx
->new_obs
.exists
) {
7579 if (!ctx
->obs
->exists
) {
7580 if (ctx
->snapset_obc
&& ctx
->snapset_obc
->obs
.exists
) {
7581 hobject_t snapoid
= soid
.get_snapdir();
7582 dout(10) << " removing unneeded snapdir " << snapoid
<< dendl
;
7583 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::DELETE
, snapoid
,
7585 ctx
->snapset_obc
->obs
.oi
.version
,
7586 0, osd_reqid_t(), ctx
->mtime
, 0));
7587 ctx
->op_t
->remove(snapoid
);
7589 ctx
->at_version
.version
++;
7591 ctx
->snapset_obc
->obs
.exists
= false;
7594 } else if (!ctx
->new_snapset
.clones
.empty() &&
7595 !ctx
->cache_evict
&&
7596 !ctx
->new_snapset
.head_exists
&&
7597 (!ctx
->snapset_obc
|| !ctx
->snapset_obc
->obs
.exists
)) {
7598 // save snapset on _snap
7599 hobject_t
snapoid(soid
.oid
, soid
.get_key(), CEPH_SNAPDIR
, soid
.get_hash(),
7600 info
.pgid
.pool(), soid
.get_namespace());
7601 dout(10) << " final snapset " << ctx
->new_snapset
7602 << " in " << snapoid
<< dendl
;
7603 assert(get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
7604 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, snapoid
,
7607 0, osd_reqid_t(), ctx
->mtime
, 0));
7609 if (!ctx
->snapset_obc
)
7610 ctx
->snapset_obc
= get_object_context(snapoid
, true);
7612 if (ctx
->lock_type
== ObjectContext::RWState::RWWRITE
) {
7613 got
= ctx
->lock_manager
.get_write_greedy(
7618 assert(ctx
->lock_type
== ObjectContext::RWState::RWEXCL
);
7619 got
= ctx
->lock_manager
.get_lock_type(
7620 ObjectContext::RWState::RWEXCL
,
7626 dout(20) << " got greedy write on snapset_obc " << *ctx
->snapset_obc
<< dendl
;
7627 ctx
->snapset_obc
->obs
.exists
= true;
7628 ctx
->snapset_obc
->obs
.oi
.version
= ctx
->at_version
;
7629 ctx
->snapset_obc
->obs
.oi
.last_reqid
= ctx
->reqid
;
7630 ctx
->snapset_obc
->obs
.oi
.mtime
= ctx
->mtime
;
7631 ctx
->snapset_obc
->obs
.oi
.local_mtime
= now
;
7633 map
<string
, bufferlist
> attrs
;
7634 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
7635 ::encode(ctx
->snapset_obc
->obs
.oi
, bv
,
7636 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7637 ctx
->op_t
->create(snapoid
);
7638 attrs
[OI_ATTR
].claim(bv
);
7639 attrs
[SS_ATTR
].claim(bss
);
7640 setattrs_maybe_cache(ctx
->snapset_obc
, ctx
, ctx
->op_t
.get(), attrs
);
7641 ctx
->at_version
.version
++;
7645 // finish and log the op.
7646 if (ctx
->user_modify
) {
7647 // update the user_version for any modify ops, except for the watch op
7648 ctx
->user_at_version
= MAX(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
7649 /* In order for new clients and old clients to interoperate properly
7650 * when exchanging versions, we need to lower bound the user_version
7651 * (which our new clients pay proper attention to)
7652 * by the at_version (which is all the old clients can ever see). */
7653 if (ctx
->at_version
.version
> ctx
->user_at_version
)
7654 ctx
->user_at_version
= ctx
->at_version
.version
;
7655 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
7657 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
7659 if (ctx
->new_obs
.exists
) {
7660 // on the head object
7661 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
7662 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
7663 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
7664 if (ctx
->mtime
!= utime_t()) {
7665 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
7666 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
7667 ctx
->new_obs
.oi
.local_mtime
= now
;
7669 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
7672 map
<string
, bufferlist
> attrs
;
7673 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
7674 ::encode(ctx
->new_obs
.oi
, bv
,
7675 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7676 attrs
[OI_ATTR
].claim(bv
);
7678 if (soid
.snap
== CEPH_NOSNAP
) {
7679 dout(10) << " final snapset " << ctx
->new_snapset
7680 << " in " << soid
<< dendl
;
7681 attrs
[SS_ATTR
].claim(bss
);
7683 dout(10) << " no snapset (this is a clone)" << dendl
;
7685 ctx
->op_t
->setattrs(soid
, attrs
);
7687 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
7690 bool legacy_snapset
= ctx
->new_snapset
.is_legacy() ||
7691 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
7694 ctx
->log
.push_back(pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
7695 ctx
->obs
->oi
.version
,
7696 ctx
->user_at_version
, ctx
->reqid
,
7698 if (soid
.snap
< CEPH_NOSNAP
) {
7699 switch (log_op_type
) {
7700 case pg_log_entry_t::MODIFY
:
7701 case pg_log_entry_t::PROMOTE
:
7702 case pg_log_entry_t::CLEAN
:
7703 if (legacy_snapset
) {
7704 dout(20) << __func__
<< " encoding legacy_snaps "
7705 << ctx
->new_obs
.oi
.legacy_snaps
7707 ::encode(ctx
->new_obs
.oi
.legacy_snaps
, ctx
->log
.back().snaps
);
7709 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
7711 ::encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
7719 if (!ctx
->extra_reqids
.empty()) {
7720 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< dendl
;
7721 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
7724 // apply new object state.
7725 ctx
->obc
->obs
= ctx
->new_obs
;
7727 if (soid
.is_head() && !ctx
->obc
->obs
.exists
&&
7728 (!maintain_ssc
|| ctx
->cache_evict
)) {
7729 ctx
->obc
->ssc
->exists
= false;
7730 ctx
->obc
->ssc
->snapset
= SnapSet();
7732 ctx
->obc
->ssc
->exists
= true;
7733 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
7737 void PrimaryLogPG::apply_stats(
7738 const hobject_t
&soid
,
7739 const object_stat_sum_t
&delta_stats
) {
7741 info
.stats
.stats
.add(delta_stats
);
7743 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
7744 i
!= backfill_targets
.end();
7747 pg_info_t
& pinfo
= peer_info
[bt
];
7748 if (soid
<= pinfo
.last_backfill
)
7749 pinfo
.stats
.stats
.add(delta_stats
);
7750 else if (soid
<= last_backfill_started
)
7751 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
7754 if (is_primary() && scrubber
.active
) {
7755 if (soid
< scrubber
.start
) {
7756 dout(20) << __func__
<< " " << soid
<< " < [" << scrubber
.start
7757 << "," << scrubber
.end
<< ")" << dendl
;
7758 scrub_cstat
.add(delta_stats
);
7760 dout(20) << __func__
<< " " << soid
<< " >= [" << scrubber
.start
7761 << "," << scrubber
.end
<< ")" << dendl
;
7766 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
7768 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
7769 assert(ctx
->async_reads_complete());
7771 for (vector
<OSDOp
>::iterator p
= ctx
->ops
->begin();
7772 p
!= ctx
->ops
->end() && result
>= 0; ++p
) {
7773 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
7777 ctx
->bytes_read
+= p
->outdata
.length();
7779 ctx
->reply
->claim_op_out_data(*ctx
->ops
);
7780 ctx
->reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
7782 MOSDOpReply
*reply
= ctx
->reply
;
7783 ctx
->reply
= nullptr;
7786 if (!ctx
->ignore_log_op_stats
) {
7788 publish_stats_to_osd();
7791 // on read, return the current object version
7793 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
7795 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
7797 } else if (result
== -ENOENT
) {
7798 // on ENOENT, set a floor for what the next user version will be.
7799 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
7802 reply
->set_result(result
);
7803 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
7804 osd
->send_message_osd_client(reply
, m
->get_connection());
7808 // ========================================================================
7811 struct C_Copyfrom
: public Context
{
7814 epoch_t last_peering_reset
;
7816 PrimaryLogPG::CopyOpRef cop
;
7817 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
7818 const PrimaryLogPG::CopyOpRef
& c
)
7819 : pg(p
), oid(o
), last_peering_reset(lpr
),
7822 void finish(int r
) override
{
7823 if (r
== -ECANCELED
)
7826 if (last_peering_reset
== pg
->get_last_peering_reset()) {
7827 pg
->process_copy_chunk(oid
, tid
, r
);
7833 struct C_CopyFrom_AsyncReadCb
: public Context
{
7835 object_copy_data_t reply_obj
;
7838 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
7839 osd_op(osd_op
), features(features
), len(0) {}
7840 void finish(int r
) override
{
7847 assert(len
<= reply_obj
.data
.length());
7849 bl
.substr_of(reply_obj
.data
, 0, len
);
7850 reply_obj
.data
.swap(bl
);
7851 ::encode(reply_obj
, osd_op
->outdata
, features
);
7855 int PrimaryLogPG::do_copy_get(OpContext
*ctx
, bufferlist::iterator
& bp
,
7856 OSDOp
& osd_op
, ObjectContextRef
&obc
)
7858 object_info_t
& oi
= obc
->obs
.oi
;
7859 hobject_t
& soid
= oi
.soid
;
7861 object_copy_cursor_t cursor
;
7863 bool skip_data_digest
=
7864 (osd
->store
->has_builtin_csum() && g_conf
->osd_skip_data_digest
) ||
7865 g_conf
->osd_distrust_data_digest
;
7868 ::decode(cursor
, bp
);
7869 ::decode(out_max
, bp
);
7871 catch (buffer::error
& e
) {
7876 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
7877 uint64_t features
= op
->get_features();
7879 bool async_read_started
= false;
7880 object_copy_data_t _reply_obj
;
7881 C_CopyFrom_AsyncReadCb
*cb
= NULL
;
7882 if (pool
.info
.require_rollback()) {
7883 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
7885 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
7887 reply_obj
.size
= oi
.size
;
7888 reply_obj
.mtime
= oi
.mtime
;
7890 if (soid
.snap
< CEPH_NOSNAP
) {
7891 if (obc
->ssc
->snapset
.is_legacy()) {
7892 reply_obj
.snaps
= oi
.legacy_snaps
;
7894 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
7895 assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
7896 reply_obj
.snaps
= p
->second
;
7899 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
7901 if (!skip_data_digest
&& oi
.is_data_digest()) {
7902 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
7903 reply_obj
.data_digest
= oi
.data_digest
;
7905 if (oi
.is_omap_digest()) {
7906 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
7907 reply_obj
.omap_digest
= oi
.omap_digest
;
7909 reply_obj
.truncate_seq
= oi
.truncate_seq
;
7910 reply_obj
.truncate_size
= oi
.truncate_size
;
7913 map
<string
,bufferlist
>& out_attrs
= reply_obj
.attrs
;
7914 if (!cursor
.attr_complete
) {
7915 result
= getattrs_maybe_cache(
7924 cursor
.attr_complete
= true;
7925 dout(20) << " got attrs" << dendl
;
7928 int64_t left
= out_max
- osd_op
.outdata
.length();
7931 bufferlist
& bl
= reply_obj
.data
;
7932 if (left
> 0 && !cursor
.data_complete
) {
7933 if (cursor
.data_offset
< oi
.size
) {
7934 uint64_t max_read
= MIN(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
7936 async_read_started
= true;
7937 ctx
->pending_async_reads
.push_back(
7939 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
7940 make_pair(&bl
, cb
)));
7943 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7944 new ReadFinisher(osd_op
));
7945 result
= -EINPROGRESS
;
7947 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
7949 result
= pgbackend
->objects_read_sync(
7950 oi
.soid
, cursor
.data_offset
, max_read
, osd_op
.op
.flags
, &bl
);
7955 cursor
.data_offset
+= max_read
;
7957 if (cursor
.data_offset
== oi
.size
) {
7958 cursor
.data_complete
= true;
7959 dout(20) << " got data" << dendl
;
7961 assert(cursor
.data_offset
<= oi
.size
);
7965 uint32_t omap_keys
= 0;
7966 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
7967 cursor
.omap_complete
= true;
7969 if (left
> 0 && !cursor
.omap_complete
) {
7970 assert(cursor
.data_complete
);
7971 if (cursor
.omap_offset
.empty()) {
7972 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
7973 &reply_obj
.omap_header
);
7975 bufferlist omap_data
;
7976 ObjectMap::ObjectMapIterator iter
=
7977 osd
->store
->get_omap_iterator(coll
, ghobject_t(oi
.soid
));
7979 iter
->upper_bound(cursor
.omap_offset
);
7980 for (; iter
->valid(); iter
->next(false)) {
7982 ::encode(iter
->key(), omap_data
);
7983 ::encode(iter
->value(), omap_data
);
7984 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
7989 ::encode(omap_keys
, reply_obj
.omap_data
);
7990 reply_obj
.omap_data
.claim_append(omap_data
);
7992 if (iter
->valid()) {
7993 cursor
.omap_offset
= iter
->key();
7995 cursor
.omap_complete
= true;
7996 dout(20) << " got omap" << dendl
;
8001 if (cursor
.is_complete()) {
8002 // include reqids only in the final step. this is a bit fragile
8004 pg_log
.get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10, &reply_obj
.reqids
);
8005 dout(20) << " got reqids" << dendl
;
8008 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
8009 << " " << out_attrs
.size() << " attrs"
8010 << " " << bl
.length() << " bytes"
8011 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
8012 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
8013 << omap_keys
<< " keys"
8014 << " " << reply_obj
.reqids
.size() << " reqids"
8016 reply_obj
.cursor
= cursor
;
8017 if (!async_read_started
) {
8018 ::encode(reply_obj
, osd_op
.outdata
, features
);
8020 if (cb
&& !async_read_started
) {
8030 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
8033 // NOTE: we take non-const ref here for claim_op_out_data below; we must
8034 // be careful not to modify anything else that will upset a racing
8036 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
8037 uint64_t features
= m
->get_features();
8038 object_copy_data_t reply_obj
;
8040 pg_log
.get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
);
8041 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
8042 ::encode(reply_obj
, osd_op
.outdata
, features
);
8043 osd_op
.rval
= -ENOENT
;
8044 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, false);
8045 reply
->claim_op_out_data(m
->ops
);
8046 reply
->set_result(-ENOENT
);
8047 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
8048 osd
->send_message_osd_client(reply
, m
->get_connection());
// Begin copying 'src' into the object held by 'obc': cancel any in-progress
// copy to the same destination, register a new CopyOp, and kick off the
// first chunk fetch via _copy_some.
void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
			      hobject_t src, object_locator_t oloc,
			      version_t version, unsigned flags,
			      bool mirror_snapset,
			      unsigned src_obj_fadvise_flags,
			      unsigned dest_obj_fadvise_flags)
{
  const hobject_t& dest = obc->obs.oi.soid;
  dout(10) << __func__ << " " << dest
	   << " from " << src << " " << oloc << " v" << version
	   << " flags " << flags
	   << (mirror_snapset ? " mirror_snapset" : "")
	   << dendl;

  assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
			     src.snap == CEPH_SNAPDIR));

  // cancel a previous in-progress copy?
  if (copy_ops.count(dest)) {
    // FIXME: if the src etc match, we could avoid restarting from the
    // beginning.
    CopyOpRef cop = copy_ops[dest];
    vector<ceph_tid_t> tids;
    cancel_copy(cop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }

  CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
					 mirror_snapset, src_obj_fadvise_flags,
					 dest_obj_fadvise_flags));
  copy_ops[dest] = cop;
  obc->start_block();

  _copy_some(obc, cop);
}
8087 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
8089 dout(10) << __func__
<< " " << obc
<< " " << cop
<< dendl
;
8092 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
8093 flags
|= CEPH_OSD_FLAG_FLUSH
;
8094 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
8095 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
8096 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
8097 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
8098 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
8099 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
8100 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
8101 flags
|= CEPH_OSD_FLAG_RWORDERED
;
8103 C_GatherBuilder
gather(cct
);
8105 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
8107 assert(cop
->src
.snap
== CEPH_NOSNAP
);
8109 op
.list_snaps(&cop
->results
.snapset
, NULL
);
8110 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
8112 flags
, gather
.new_sub(), NULL
);
8113 cop
->objecter_tid2
= tid
;
8117 if (cop
->results
.user_version
) {
8118 op
.assert_version(cop
->results
.user_version
);
8120 // we should learn the version after the first chunk, if we didn't know
8122 assert(cop
->cursor
.is_initial());
8124 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
8125 &cop
->results
.object_size
, &cop
->results
.mtime
,
8126 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
8127 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
8128 &cop
->results
.flags
,
8129 &cop
->results
.source_data_digest
,
8130 &cop
->results
.source_omap_digest
,
8131 &cop
->results
.reqids
,
8132 &cop
->results
.truncate_seq
,
8133 &cop
->results
.truncate_size
,
8135 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
8137 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
8138 get_last_peering_reset(), cop
);
8139 gather
.set_finisher(new C_OnFinisher(fin
,
8140 &osd
->objecter_finisher
));
8142 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
8143 cop
->src
.snap
, NULL
,
8146 // discover the object version if we don't know it yet
8147 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
8149 cop
->objecter_tid
= tid
;
8153 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
8155 vector
<ceph_tid_t
> tids
;
8156 dout(10) << __func__
<< " " << oid
<< " tid " << tid
8157 << " " << cpp_strerror(r
) << dendl
;
8158 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
8159 if (p
== copy_ops
.end()) {
8160 dout(10) << __func__
<< " no copy_op found" << dendl
;
8163 CopyOpRef cop
= p
->second
;
8164 if (tid
!= cop
->objecter_tid
) {
8165 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
8166 << " tid " << cop
->objecter_tid
<< dendl
;
8170 if (cop
->omap_data
.length() || cop
->omap_header
.length())
8171 cop
->results
.has_omap
= true;
8173 if (r
>= 0 && !pool
.info
.supports_omap() &&
8174 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
8177 cop
->objecter_tid
= 0;
8178 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
8179 ObjectContextRef
& cobc
= cop
->obc
;
8184 assert(cop
->rval
>= 0);
8186 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
8187 // verify snap hasn't been deleted
8188 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
8189 while (p
!= cop
->results
.snaps
.end()) {
8190 if (pool
.info
.is_removed_snap(*p
)) {
8191 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
8193 for (vector
<snapid_t
>::iterator q
= p
+ 1;
8194 q
!= cop
->results
.snaps
.end();
8197 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
8202 if (cop
->results
.snaps
.empty()) {
8203 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
8209 assert(cop
->rval
>= 0);
8211 if (!cop
->temp_cursor
.data_complete
) {
8212 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
8214 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
8215 if (cop
->omap_header
.length()) {
8216 cop
->results
.omap_digest
=
8217 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
8219 if (cop
->omap_data
.length()) {
8221 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
8222 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
8226 if (!cop
->temp_cursor
.attr_complete
) {
8227 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
8228 p
!= cop
->attrs
.end();
8230 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
8235 if (!cop
->cursor
.is_complete()) {
8236 // write out what we have so far
8237 if (cop
->temp_cursor
.is_initial()) {
8238 assert(!cop
->results
.started_temp_obj
);
8239 cop
->results
.started_temp_obj
= true;
8240 cop
->results
.temp_oid
= generate_temp_object(oid
);
8241 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
8243 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
8244 OpContextUPtr ctx
= simple_opc_create(tempobc
);
8245 if (cop
->temp_cursor
.is_initial()) {
8246 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
8248 _write_copy_chunk(cop
, ctx
->op_t
.get());
8249 simple_opc_submit(std::move(ctx
));
8250 dout(10) << __func__
<< " fetching more" << dendl
;
8251 _copy_some(cobc
, cop
);
8256 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
8257 dout(20) << __func__
<< std::hex
8258 << " got digest: rx data 0x" << cop
->results
.data_digest
8259 << " omap 0x" << cop
->results
.omap_digest
8260 << ", source: data 0x" << cop
->results
.source_data_digest
8261 << " omap 0x" << cop
->results
.source_omap_digest
8263 << " flags " << cop
->results
.flags
8266 if (cop
->results
.is_data_digest() &&
8267 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
8268 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
8269 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
8271 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
8272 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
8273 << " data digest 0x" << cop
->results
.data_digest
8274 << " != source 0x" << cop
->results
.source_data_digest
8279 if (cop
->results
.is_omap_digest() &&
8280 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
8281 derr
<< __func__
<< std::hex
8282 << " omap digest 0x" << cop
->results
.omap_digest
8283 << " != source 0x" << cop
->results
.source_omap_digest
8284 << std::dec
<< dendl
;
8285 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
8286 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
8287 << " omap digest 0x" << cop
->results
.omap_digest
8288 << " != source 0x" << cop
->results
.source_omap_digest
8293 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
8294 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
8299 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
8300 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
8301 ObjectState
& obs
= cop
->obc
->obs
;
8302 if (cop
->temp_cursor
.is_initial()) {
8303 dout(20) << "fill_in_final_tx: writing "
8304 << "directly to final object" << dendl
;
8305 // write directly to final object
8306 cop
->results
.temp_oid
= obs
.oi
.soid
;
8307 _write_copy_chunk(cop
, t
);
8309 // finish writing to temp object, then move into place
8310 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
8311 _write_copy_chunk(cop
, t
);
8312 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
8314 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
8317 dout(20) << __func__
<< " success; committing" << dendl
;
8320 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
8321 CopyCallbackResults
results(r
, &cop
->results
);
8322 cop
->cb
->complete(results
);
8324 copy_ops
.erase(cobc
->obs
.oi
.soid
);
8327 if (r
< 0 && cop
->results
.started_temp_obj
) {
8328 dout(10) << __func__
<< " deleting partial temp object "
8329 << cop
->results
.temp_oid
<< dendl
;
8330 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
8331 OpContextUPtr ctx
= simple_opc_create(tempobc
);
8332 ctx
->op_t
->remove(cop
->results
.temp_oid
);
8333 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
8334 simple_opc_submit(std::move(ctx
));
8337 // cancel and requeue proxy ops on this object
8339 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
8340 it
!= proxyread_ops
.end();) {
8341 if (it
->second
->soid
== cobc
->obs
.oi
.soid
) {
8342 cancel_proxy_read((it
++)->second
, &tids
);
8347 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
8348 it
!= proxywrite_ops
.end();) {
8349 if (it
->second
->soid
== cobc
->obs
.oi
.soid
) {
8350 cancel_proxy_write((it
++)->second
, &tids
);
8355 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
8356 kick_proxy_ops_blocked(cobc
->obs
.oi
.soid
);
8359 kick_object_context_blocked(cobc
);
// Cancel every in-flight proxy read and proxy write targeting `oid`,
// collecting their objecter tids into one batch cancel, then wake any
// client ops queued behind those proxied operations.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / else-branches below are gaps in this copy, not in
// the upstream source.
void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
  vector<ceph_tid_t> tids;  // objecter tids gathered by the cancel helpers
  for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
       it != proxyread_ops.end();) {
    if (it->second->soid == oid) {
      // post-increment before the call: cancel_proxy_read erases this map
      // entry, which would otherwise invalidate `it`
      cancel_proxy_read((it++)->second, &tids);
      // NOTE(review): the non-matching advance branch of this loop is
      // elided in this excerpt.
  for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
       it != proxywrite_ops.end();) {
    if (it->second->soid == oid) {
      // same post-increment-then-erase pattern as the read loop above
      cancel_proxy_write((it++)->second, &tids);
      // NOTE(review): advance branch elided in this excerpt.
  // cancel all collected objecter ops in a single call
  osd->objecter->op_cancel(tids, -ECANCELED);
  // requeue ops that were blocked behind the proxied object
  kick_proxy_ops_blocked(oid);
// Stage one received chunk of a copy (data, attrs, omap header, omap keys)
// into transaction `t`, writing into the temp object named in cop->results.
// On exit, cop->temp_cursor is advanced to cop->cursor so the next chunk
// resumes where this one left off.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / arguments below are gaps in this copy, not in the
// upstream source.
void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
  dout(20) << __func__ << " " << cop
	   << " " << cop->attrs.size() << " attrs"
	   << " " << cop->data.length() << " bytes"
	   << " " << cop->omap_header.length() << " omap header bytes"
	   << " " << cop->omap_data.length() << " omap data bytes"
  // first chunk (no attrs staged yet): create the temp object before
  // writing anything into it
  if (!cop->temp_cursor.attr_complete) {
    t->create(cop->results.temp_oid);
  if (!cop->temp_cursor.data_complete) {
    // data staged so far plus this chunk must line up with the source cursor
    assert(cop->data.length() + cop->temp_cursor.data_offset ==
	   cop->cursor.data_offset);
    if (pool.info.requires_aligned_append() &&
	!cop->cursor.data_complete) {
      /*
       * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
       * to pick it up on the next pass.
       */
      // invariant: everything already staged ends on an alignment boundary
      assert(cop->temp_cursor.data_offset %
	     pool.info.required_alignment() == 0);
      if (cop->data.length() % pool.info.required_alignment() != 0) {
	// bytes past the last aligned boundary; deferred to the next chunk
	// NOTE(review): assignment target (to_trim) elided in this excerpt
	cop->data.length() % pool.info.required_alignment();
	bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
	// rewind the source cursor so the trimmed tail is re-read next pass
	cop->cursor.data_offset -= to_trim;
	assert(cop->data.length() + cop->temp_cursor.data_offset ==
	       cop->cursor.data_offset);
    if (cop->data.length()) {
      // NOTE(review): the write call itself is partially elided in this
      // excerpt; visible here are its target oid, offset and fadvise flags.
	cop->results.temp_oid,
	cop->temp_cursor.data_offset,
	cop->dest_obj_fadvise_flags);
  if (pool.info.supports_omap()) {
    if (!cop->temp_cursor.omap_complete) {
      if (cop->omap_header.length()) {
	// stage the omap header, then drop our buffered copy
	cop->results.temp_oid,
	cop->omap_header.clear();
      if (cop->omap_data.length()) {
	map<string,bufferlist> omap;
	bufferlist::iterator p = cop->omap_data.begin();
	// NOTE(review): the decode of `omap` from `p` is elided in this excerpt
	t->omap_setkeys(cop->results.temp_oid, omap);
	cop->omap_data.clear();
  } else {
    // pool has no omap support: the source must not have sent any omap
    assert(cop->omap_header.length() == 0);
    assert(cop->omap_data.length() == 0);
  // remember how far we've staged; next chunk resumes from here
  cop->temp_cursor = cop->cursor;
// Complete a client COPY_FROM on this (cache) object: replace any existing
// content, apply the copied results (digests, truncate info, omap flag),
// update object_info and delta stats. Called from the copy callback with
// the op context prepared.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / else-branches below are gaps in this copy, not in the
// upstream source.
void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
  OpContext *ctx = cb->ctx;
  dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
  ObjectState& obs = ctx->new_obs;
  // NOTE(review): the exists/else split is elided here — the remove runs
  // when the object already exists; the object-count bump when it doesn't.
    dout(20) << __func__ << ": exists, removing" << dendl;
    ctx->op_t->remove(obs.oi.soid);
    ctx->delta_stats.num_objects++;
  // if data landed in a temp object, arrange for it to be discarded on abort
  if (cb->is_temp_obj_used()) {
    ctx->discard_temp_oid = cb->results->temp_oid;
  // apply the staged writes (direct or temp+rename) into our transaction
  cb->results->fill_in_final_tx(ctx->op_t.get());
  // CopyFromCallback fills this in for us
  obs.oi.user_version = ctx->user_at_version;
  // carry over whole-object digests only when the source had them
  if (cb->results->is_data_digest()) {
    obs.oi.set_data_digest(cb->results->data_digest);
    obs.oi.clear_data_digest();
  if (cb->results->is_omap_digest()) {
    obs.oi.set_omap_digest(cb->results->omap_digest);
    obs.oi.clear_omap_digest();
  obs.oi.truncate_seq = cb->results->truncate_seq;
  obs.oi.truncate_size = cb->results->truncate_size;
  ctx->extra_reqids = cb->results->reqids;
  // cache: clear whiteout?
  if (obs.oi.is_whiteout()) {
    dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
    --ctx->delta_stats.num_whiteouts;
  // mirror the source's omap presence in our flag
  if (cb->results->has_omap) {
    dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
    obs.oi.set_flag(object_info_t::FLAG_OMAP);
    dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
    obs.oi.clear_flag(object_info_t::FLAG_OMAP);
  // the whole object range is considered modified
  interval_set<uint64_t> ch;
  if (obs.oi.size > 0)
    ch.insert(0, obs.oi.size);
  ctx->modified_ranges.union_of(ch);
  if (cb->get_data_size() != obs.oi.size) {
    // adjust byte accounting for the size change
    ctx->delta_stats.num_bytes -= obs.oi.size;
    obs.oi.size = cb->get_data_size();
    ctx->delta_stats.num_bytes += obs.oi.size;
  ctx->delta_stats.num_wr++;
  ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
  osd->logger->inc(l_osd_copyfrom);
// Completion of a cache-tier promote of `soid`: reconstruct clone snaps if
// needed, handle errors (cancel, trimmed clone, unexpected failure),
// create a whiteout for ENOENT on head, otherwise install the copied
// object and submit the local PROMOTE transaction.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / else-branches / returns below are gaps in this copy,
// not in the upstream source.
void PrimaryLogPG::finish_promote(int r, CopyResults *results,
				  ObjectContextRef obc)
  const hobject_t& soid = obc->obs.oi.soid;
  dout(10) << __func__ << " " << soid << " r=" << r
	   << " uv" << results->user_version << dendl;
  if (r == -ECANCELED) {
    // promotion was canceled; nothing to do (early-out elided in excerpt)
  if (r != -ENOENT && soid.is_snap()) {
    if (results->snaps.empty()) {
      // we must have read "snap" content from the head object in
      // the base pool. use snap_seq to construct what snaps should
      // be for this clone (what it was before we evicted the clean
      // clone from this pool, and what it will be when we flush and
      // the clone eventually happens in the base pool).
      SnapSet& snapset = obc->ssc->snapset;
      vector<snapid_t>::iterator p = snapset.snaps.begin();
      // skip snaps newer than this clone
      while (p != snapset.snaps.end() && *p > soid.snap)
      // collect the snaps covered by this clone, down to snap_seq
      while (p != snapset.snaps.end() && *p > results->snap_seq) {
	results->snaps.push_back(*p);
    dout(20) << __func__ << " snaps " << results->snaps << dendl;
    filter_snapc(results->snaps);
    dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
    if (results->snaps.empty()) {
      dout(20) << __func__
	       << " snaps are empty, clone is invalid,"
	       << " setting r to ENOENT" << dendl;
      // NOTE(review): r = -ENOENT assignment elided in this excerpt
  if (r < 0 && results->started_temp_obj) {
    // promotion failed after partial data landed in a temp object; delete it
    dout(10) << __func__ << " abort; will clean up partial work" << dendl;
    ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    ctx->op_t->remove(results->temp_oid);
    simple_opc_submit(std::move(ctx));
    results->started_temp_obj = false;
  if (r == -ENOENT && soid.is_snap()) {
    // the clone vanished from the base tier (trimmed); prune it from the
    // head's snapset so we stop referencing it
    dout(10) << __func__
	     << ": enoent while trying to promote clone, " << soid
	     << " must have been trimmed, removing from snapset"
    hobject_t head(soid.get_head());
    ObjectContextRef obc = get_object_context(head, false);
    OpContextUPtr tctx = simple_opc_create(obc);
    tctx->at_version = get_next_version();
    filter_snapc(tctx->new_snapset.snaps);
    vector<snapid_t> new_clones;
    map<snapid_t, vector<snapid_t>> new_clone_snaps;
    // rebuild clones/clone_snaps without this clone
    for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
	 i != tctx->new_snapset.clones.end();
      if (*i != soid.snap) {
	new_clones.push_back(*i);
	auto p = tctx->new_snapset.clone_snaps.find(*i);
	if (p != tctx->new_snapset.clone_snaps.end()) {
	  new_clone_snaps[*i] = p->second;
    tctx->new_snapset.clones.swap(new_clones);
    tctx->new_snapset.clone_overlap.erase(soid.snap);
    tctx->new_snapset.clone_size.erase(soid.snap);
    tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
    // take RWWRITE lock for duration of our local write. ignore starvation.
    if (!tctx->lock_manager.take_write_lock(
      assert(0 == "problem!");
    dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
    finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
    simple_opc_submit(std::move(tctx));
  bool whiteout = false;
  // ENOENT on a head object means we promote "absence": create a whiteout
  // (the condition setting `whiteout` is elided in this excerpt)
    assert(soid.snap == CEPH_NOSNAP); // snap case is above
    dout(10) << __func__ << " whiteout " << soid << dendl;
  if (r < 0 && !whiteout) {
    derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
    // pass error to everyone blocked on this object
    // FIXME: this is pretty sloppy, but at this point we got
    // something unexpected and don't have many other options.
    map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
      waiting_for_blocked_object.find(soid);
    if (blocked_iter != waiting_for_blocked_object.end()) {
      while (!blocked_iter->second.empty()) {
	osd->reply_op_error(blocked_iter->second.front(), r);
	blocked_iter->second.pop_front();
      waiting_for_blocked_object.erase(blocked_iter);
  osd->promote_finish(results->object_size);
  OpContextUPtr tctx = simple_opc_create(obc);
  tctx->at_version = get_next_version();
  ++tctx->delta_stats.num_objects;
  if (soid.snap < CEPH_NOSNAP)
    ++tctx->delta_stats.num_object_clones;
  tctx->new_obs.exists = true;
  tctx->extra_reqids = results->reqids;
  // pre-luminous (or legacy snapset) clones carry snaps on the object_info
  bool legacy_snapset = tctx->new_snapset.is_legacy() ||
    get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
    // create a whiteout
    tctx->op_t->create(soid);
    tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
    ++tctx->delta_stats.num_whiteouts;
    dout(20) << __func__ << " creating whiteout on " << soid << dendl;
    osd->logger->inc(l_osd_tier_whiteout);
    // non-whiteout path: install the copied object
    if (results->has_omap) {
      dout(10) << __func__ << " setting omap flag on " << soid << dendl;
      tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
      ++tctx->delta_stats.num_objects_omap;
    results->fill_in_final_tx(tctx->op_t.get());
    if (results->started_temp_obj) {
      tctx->discard_temp_oid = results->temp_oid;
    tctx->new_obs.oi.size = results->object_size;
    tctx->new_obs.oi.user_version = results->user_version;
    if (results->is_data_digest()) {
      tctx->new_obs.oi.set_data_digest(results->data_digest);
      tctx->new_obs.oi.clear_data_digest();
    if (results->is_omap_digest()) {
      tctx->new_obs.oi.set_omap_digest(results->omap_digest);
      tctx->new_obs.oi.clear_omap_digest();
    tctx->new_obs.oi.truncate_seq = results->truncate_seq;
    tctx->new_obs.oi.truncate_size = results->truncate_size;
    if (soid.snap != CEPH_NOSNAP) {
      if (legacy_snapset) {
	tctx->new_obs.oi.legacy_snaps = results->snaps;
	assert(!tctx->new_obs.oi.legacy_snaps.empty());
	// it's already in the snapset
	assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
      // sanity: the snapset already describes this clone consistently
      assert(obc->ssc->snapset.clone_size.count(soid.snap));
      assert(obc->ssc->snapset.clone_size[soid.snap] ==
	     results->object_size);
      assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
      tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
      tctx->delta_stats.num_bytes += results->object_size;
  if (results->mirror_snapset) {
    assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
    tctx->new_snapset.from_snap_set(
      get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
  tctx->new_snapset.head_exists = true;
  dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
  // take RWWRITE lock for duration of our local write. ignore starvation.
  if (!tctx->lock_manager.take_write_lock(
    assert(0 == "problem!");
  dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
  finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
  simple_opc_submit(std::move(tctx));
  osd->logger->inc(l_osd_tier_promote);
  // nudge the tiering agent if it went idle while we were promoting
    agent_state->is_idle())
    agent_choose_mode();
// Tear down one in-flight copy op: collect its objecter tid(s) into *tids
// for batch cancellation, unblock the object, and complete the callback
// with -ECANCELED (results.should_requeue tells the callback whether the
// client op should be requeued).
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces below are gaps in this copy, not in the upstream source.
void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
			       vector<ceph_tid_t> *tids)
  dout(10) << __func__ << " " << cop->obc->obs.oi.soid
	   << " from " << cop->src << " " << cop->oloc
	   << " v" << cop->results.user_version << dendl;
  // cancel objecter op, if we can
  if (cop->objecter_tid) {
    tids->push_back(cop->objecter_tid);
    cop->objecter_tid = 0;
    if (cop->objecter_tid2) {
      tids->push_back(cop->objecter_tid2);
      cop->objecter_tid2 = 0;
  copy_ops.erase(cop->obc->obs.oi.soid);
  // unblock and wake anything waiting on this object
  cop->obc->stop_block();
  kick_object_context_blocked(cop->obc);
  cop->results.should_requeue = requeue;
  CopyCallbackResults result(-ECANCELED, &cop->results);
  cop->cb->complete(result);
  // There may still be an objecter callback referencing this copy op.
  // That callback will not need the obc since it's been canceled, and
  // we need the obc reference to go away prior to flush.
  cop->obc = ObjectContextRef();
// Cancel every pending copy op on this PG (e.g. on interval change),
// gathering all objecter tids into *tids for the caller to cancel.
// NOTE(review): closing braces are elided in this excerpt of the file.
void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
  dout(10) << __func__ << dendl;
  map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
  while (p != copy_ops.end()) {
    // requeue this op? can I queue up all of them?
    // post-increment: cancel_copy erases this entry from copy_ops
    cancel_copy((p++)->second, requeue, tids);
8773 // ========================================================================
8776 // Flush a dirty object in the cache tier by writing it back to the
8777 // base tier. The sequence looks like:
8779 // * send a copy-from operation to the base tier to copy the current
8780 // version of the object
8781 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8782 // * on completion, we check if the object has been modified. if so,
8783 // just reply with -EAGAIN.
8784 // * try to take a write lock so we can clear the dirty flag. if this
8785 // fails, wait and retry
8786 // * start a repop that clears the bit.
8788 // If we have to wait, we will retry by coming back through the
8789 // start_flush method. We check if a flush is already in progress
8790 // and, if so, try to finish it by rechecking the version and trying
8791 // to clear the dirty bit.
8793 // In order for the cache-flush (a write op) to not block the copy-get
8794 // from reading the object, the client *must* set the SKIPRWLOCKS
8797 // NOTE: normally writes are strictly ordered for the client, but
8798 // flushes are special in that they can be reordered with respect to
8799 // other writes. In particular, we can't have a flush request block
8800 // an update to the cache pool object!
// Completion context for the objecter copy-from issued by start_flush().
// Carries the epoch of the last peering reset so a stale completion
// (from before a reset) is ignored rather than applied to the new interval.
// NOTE(review): the pg/oid/tid/start member declarations and some
// braces are elided in this excerpt of the file.
struct C_Flush : public Context {
  epoch_t last_peering_reset;
  C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), start(ceph_clock_now())
  void finish(int r) override {
    // canceled: the flush op was already torn down; do nothing
    if (r == -ECANCELED)
    // only deliver the completion if no peering reset happened since issue
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->finish_flush(oid, tid, r);
      pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
// Begin flushing a dirty cache-tier object back to the base tier.
// Returns -EINPROGRESS once the copy-from has been issued, -EAGAIN when the
// op should retry later (joined an existing flush), -EBUSY/-ENOENT for
// dirty/missing older clones, or the result of try_flush_mark_clean() when
// re-entering to finish a prior flush.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / else-branches / returns below are gaps in this copy,
// not in the upstream source.
int PrimaryLogPG::start_flush(
  OpRequestRef op, ObjectContextRef obc,
  bool blocking, hobject_t *pmissing,
  boost::optional<std::function<void()>> &&on_flush)
  const object_info_t& oi = obc->obs.oi;
  const hobject_t& soid = oi.soid;
  dout(10) << __func__ << " " << soid
	   << " v" << oi.version
	   << " uv" << oi.user_version
	   << " " << (blocking ? "blocking" : "non-blocking/best-effort")
  // get a filtered snapset, need to remove removed snaps
  SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
  // verify there is no older clone that is still dirty (it must be
  // flushed before this one)
  dout(20) << " snapset " << snapset << dendl;
  vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
  // find the newest clone strictly older than soid
  while (p != snapset.clones.rend() && *p >= soid.snap)
  if (p != snapset.clones.rend()) {
    hobject_t next = soid;
    assert(next.snap < soid.snap);
    if (pg_log.get_missing().is_missing(next)) {
      // can't verify cleanliness of a missing clone; caller must recover it
      dout(10) << __func__ << " missing clone is " << next << dendl;
    ObjectContextRef older_obc = get_object_context(next, false);
      dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
      if (older_obc->obs.oi.is_dirty()) {
	// older dirty clone must flush first
	dout(10) << __func__ << " next oldest clone is dirty: "
		 << older_obc->obs.oi << dendl;
      dout(20) << __func__ << " next oldest clone " << next
	       << " is not present; implicitly clean" << dendl;
    dout(20) << __func__ << " no older clones" << dendl;
  // is a flush already in progress on this object?
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
  if (p != flush_ops.end()) {
    FlushOpRef fop = p->second;
    if (fop->op == op) {
      // we couldn't take the write lock on a cache-try-flush before;
      // now we are trying again for the lock.
      return try_flush_mark_clean(fop);
    if (fop->flushed_version == obc->obs.oi.user_version &&
	(fop->blocking || !blocking)) {
      // nonblocking can join anything
      // blocking can only join a blocking flush
      dout(20) << __func__ << " piggybacking on existing flush " << dendl;
	fop->dup_ops.push_back(op);
      return -EAGAIN;   // clean up this ctx; op will retry later
    // cancel current flush since it will fail anyway, or because we
    // are blocking and the existing flush is nonblocking.
    dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
      osd->reply_op_error(fop->op, -EBUSY);
    while (!fop->dup_ops.empty()) {
      osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
      fop->dup_ops.pop_front();
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  /**
   * In general, we need to send a delete and a copyfrom.
   * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
   * where 4 is marked as clean. To flush 10, we have to:
   * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
   * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
   *
   * There is a complicating case. Suppose there had been a clone 7
   * for snaps [7, 6] which has been trimmed since they no longer exist.
   * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
   * the delete, the snap will be promoted to 5, and the head will become
   * a snapdir. When the copy-from goes through, we'll end up with
   * 8:[8,4,3,2]:[4(4,3,2)]+head.
   *
   * Another complication is the case where there is an interval change
   * after doing the delete and the flush but before marking the object
   * clean. We'll happily delete head and then recreate it at the same
   * sequence number, which works out ok.
   */
  SnapContext snapc, dsnapc;
  if (snapset.seq != 0) {
    if (soid.snap == CEPH_NOSNAP) {
      // flushing head: use the full snap context
      snapc.seq = snapset.seq;
      snapc.snaps = snapset.snaps;
      // flushing a clone: snap context as of just before its oldest snap
      snapid_t min_included_snap;
      if (snapset.is_legacy()) {
	min_included_snap = oi.legacy_snaps.back();
	auto p = snapset.clone_snaps.find(soid.snap);
	assert(p != snapset.clone_snaps.end());
	min_included_snap = p->second.back();
      snapc = snapset.get_ssc_as_of(min_included_snap - 1);
    // snap context for the preceding delete: as of the next older clone
    snapid_t prev_snapc = 0;
    for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
	 citer != snapset.clones.rend();
      if (*citer < soid.snap) {
	prev_snapc = *citer;
    dsnapc = snapset.get_ssc_as_of(prev_snapc);
  // the base tier is the pool this cache pool is a tier of
  object_locator_t base_oloc(soid);
  base_oloc.pool = pool.info.tier_of;
  if (dsnapc.seq < snapc.seq) {
    // issue the delete first; ordering w.r.t. the copy-from below is
    // guaranteed by the objecter, so no callback is needed
    osd->objecter->mutate(
      ceph::real_clock::from_ceph_timespec(oi.mtime),
      (CEPH_OSD_FLAG_IGNORE_OVERLAY |
       CEPH_OSD_FLAG_ENFORCE_SNAPC),
      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
  FlushOpRef fop(std::make_shared<FlushOp>());
  fop->flushed_version = oi.user_version;
  fop->blocking = blocking;
  fop->on_flush = std::move(on_flush);
  if (oi.is_whiteout()) {
    // a whiteout flushes as a delete in the base tier
    fop->removal = true;
    object_locator_t oloc(soid);
    o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
		CEPH_OSD_COPY_FROM_FLAG_FLUSH |
		CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
		CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
		CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
		LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
    // mean the base tier don't cache data after this
    if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
      o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
  C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, base_oloc, o, snapc,
    ceph::real_clock::from_ceph_timespec(oi.mtime),
    CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
    new C_OnFinisher(fin,
		     &osd->objecter_finisher));
  /* we're under the pg lock and fin->finish() is grabbing that */
  fop->objecter_tid = tid;
  flush_ops[soid] = fop;
  info.stats.stats.sum.num_flush++;
  info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
  return -EINPROGRESS;
// Objecter completion for a flush copy-from (via C_Flush). Matches the
// completion against the recorded flush op (ignoring stale tids), handles
// failure (reply -EBUSY, unblock, requeue dups, fire on_flush), and on
// success attempts to mark the object clean.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / returns below are gaps in this copy, not in the
// upstream source.
void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
  if (p == flush_ops.end()) {
    // flush was canceled/completed already
    dout(10) << __func__ << " no flush_op found" << dendl;
  FlushOpRef fop = p->second;
  if (tid != fop->objecter_tid) {
    // stale completion from a superseded objecter op
    dout(10) << __func__ << " tid " << tid << " != fop " << fop
	     << " tid " << fop->objecter_tid << dendl;
  ObjectContextRef obc = fop->obc;
  fop->objecter_tid = 0;
  // ENOENT is expected (and ok) when we were flushing a removal
  if (r < 0 && !(r == -ENOENT && fop->removal)) {
      osd->reply_op_error(fop->op, -EBUSY);
    if (fop->blocking) {
      kick_object_context_blocked(obc);
    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = boost::none;
    flush_ops.erase(oid);
  // success: try to clear the dirty flag locally
  r = try_flush_mark_clean(fop);
  if (r == -EBUSY && fop->op) {
    osd->reply_op_error(fop->op, r);
// After a successful flush to the base tier, try to clear the object's
// DIRTY flag locally. Fails (-EBUSY) if the object changed or vanished
// since the flush started; retries (-EAGAIN) when blocked by scrub or the
// write lock; may instead evict the object via the tiering agent.
// Returns -EINPROGRESS once the local CLEAN transaction is submitted.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / else-branches / returns below are gaps in this copy,
// not in the upstream source.
int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
  ObjectContextRef obc = fop->obc;
  const hobject_t& oid = obc->obs.oi.soid;
  if (fop->blocking) {
    // flush blocked the object; release waiters now
    kick_object_context_blocked(obc);
  // flushed version must still be current (and the object must still exist)
  if (fop->flushed_version != obc->obs.oi.user_version ||
    if (obc->obs.exists)
      dout(10) << __func__ << " flushed_version " << fop->flushed_version
	       << " != current " << obc->obs.oi.user_version
      dout(10) << __func__ << " object no longer exists" << dendl;
    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = boost::none;
    flush_ops.erase(oid);
      osd->logger->inc(l_osd_tier_flush_fail);
      osd->logger->inc(l_osd_tier_try_flush_fail);
  if (!fop->blocking &&
      write_blocked_by_scrub(oid)) {
      dout(10) << __func__ << " blocked by scrub" << dendl;
      requeue_op(fop->op);
      requeue_ops(fop->dup_ops);
      return -EAGAIN;    // will retry
      osd->logger->inc(l_osd_tier_try_flush_fail);
      vector<ceph_tid_t> tids;
      cancel_flush(fop, false, &tids);
      osd->objecter->op_cancel(tids, -ECANCELED);
  // successfully flushed, can we evict this object?
  if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
      agent_maybe_evict(obc, true)) {
    osd->logger->inc(l_osd_tier_clean);
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = boost::none;
    flush_ops.erase(oid);
  dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
  OpContextUPtr ctx = simple_opc_create(fop->obc);
  // successfully flushed; can we clear the dirty bit?
  // try to take the lock manually, since we don't
  if (ctx->lock_manager.get_lock_type(
    ObjectContext::RWState::RWWRITE,
    dout(20) << __func__ << " took write lock" << dendl;
  } else if (fop->op) {
    dout(10) << __func__ << " waiting on write lock " << fop->op << " "
	     << fop->dup_ops << dendl;
    close_op_ctx(ctx.release());
    // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
    // NOTE(review): as visible in this copy, ctx is used below after
    // ctx.release() above — looks like a use-after-release; confirm
    // against the upstream source before relying on this path.
    for (auto op : fop->dup_ops) {
      bool locked = ctx->lock_manager.get_lock_type(
	ObjectContext::RWState::RWWRITE,
    return -EAGAIN;    // will retry
    dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
    close_op_ctx(ctx.release());
    osd->logger->inc(l_osd_tier_try_flush_fail);
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  // transfer the completion callback into the repop
  if (fop->on_flush) {
    ctx->register_on_finish(*(fop->on_flush));
    fop->on_flush = boost::none;
  ctx->at_version = get_next_version();
  // clear the dirty flag and adjust accounting
  ctx->new_obs = obc->obs;
  ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
  --ctx->delta_stats.num_objects_dirty;
  finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
  osd->logger->inc(l_osd_tier_clean);
  if (!fop->dup_ops.empty() || fop->op) {
    // requeue the original op and its dups behind our CLEAN write
    dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
    list<OpRequestRef> ls;
      ls.push_back(fop->op);
    ls.splice(ls.end(), fop->dup_ops);
  simple_opc_submit(std::move(ctx));
  flush_ops.erase(oid);
    osd->logger->inc(l_osd_tier_flush);
    osd->logger->inc(l_osd_tier_try_flush);
  return -EINPROGRESS;
// Cancel one in-flight flush: collect its objecter tid and any per-chunk
// io tids into *tids, unblock the object if the flush was blocking,
// optionally requeue the originating op(s), fire on_flush, and drop the
// flush op from the map.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / conditions below are gaps in this copy, not in the
// upstream source.
void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
				vector<ceph_tid_t> *tids)
  dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
	   << fop->objecter_tid << dendl;
  if (fop->objecter_tid) {
    tids->push_back(fop->objecter_tid);
    fop->objecter_tid = 0;
  if (fop->io_tids.size()) {
    // per-chunk i/o tids from a chunked flush
    for (auto &p : fop->io_tids) {
      tids->push_back(p.second);
  if (fop->blocking && fop->obc->is_blocked()) {
    fop->obc->stop_block();
    kick_object_context_blocked(fop->obc);
  // (requeue guard elided in this excerpt)
    requeue_op(fop->op);
    requeue_ops(fop->dup_ops);
  if (fop->on_flush) {
    (*(fop->on_flush))();
    fop->on_flush = boost::none;
  flush_ops.erase(fop->obc->obs.oi.soid);
// Cancel every pending flush op on this PG (e.g. on interval change),
// gathering all objecter tids into *tids for the caller to cancel.
// NOTE(review): closing braces are elided in this excerpt of the file.
void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
  dout(10) << __func__ << dendl;
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
  while (p != flush_ops.end()) {
    // post-increment: cancel_flush erases this entry from flush_ops
    cancel_flush((p++)->second, requeue, tids);
// Whether clone `coid` should be treated as present for snapset purposes.
// NOTE(review): the early-return values for the first two checks are
// elided in this excerpt; from the visible code, pools that do not allow
// incomplete clones and missing objects take those early exits — confirm
// the returned values against the upstream source.
bool PrimaryLogPG::is_present_clone(hobject_t coid)
  if (!pool.info.allow_incomplete_clones())
  if (is_missing_object(coid))
  ObjectContextRef obc = get_object_context(coid, false);
  // present only if we have a context and the object exists on disk
  return obc && obc->obs.exists;
9247 // ========================================================================
// Completion context fired when all replicas have applied a repop;
// forwards to PrimaryLogPG::repop_all_applied. The intrusive_ptr keeps
// the RepGather alive until this callback runs.
// NOTE(review): the pg member declaration and access specifiers are
// elided in this excerpt of the file.
class C_OSD_RepopApplied : public Context {
  boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
  C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
    : pg(pg), repop(repop) {}
  void finish(int) override {
    pg->repop_all_applied(repop.get());
// All replicas have applied this repop. Only valid when apply is tracked
// separately from commit (see the assert); marks the repop applied and,
// unless aborted, re-evaluates it (eval_repop call elided in this excerpt).
void PrimaryLogPG::repop_all_applied(RepGather *repop)
  dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
  // applies_with_commit repops get their applied flag in repop_all_committed
  assert(!repop->applies_with_commit);
  repop->all_applied = true;
  if (!repop->rep_aborted) {
    // NOTE(review): body (eval_repop) elided in this excerpt
// Completion context fired when all replicas have committed a repop;
// forwards to PrimaryLogPG::repop_all_committed. The intrusive_ptr keeps
// the RepGather alive until this callback runs.
// NOTE(review): the pg member declaration and access specifiers are
// elided in this excerpt of the file.
class C_OSD_RepopCommit : public Context {
  boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
  C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
    : pg(pg), repop(repop) {}
  void finish(int) override {
    pg->repop_all_committed(repop.get());
// All replicas have committed this repop. When the backend applies with
// commit, this also implies applied. Unless aborted, advance the on-disk
// watermarks and re-evaluate (eval_repop call elided in this excerpt).
void PrimaryLogPG::repop_all_committed(RepGather *repop)
  dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
  repop->all_committed = true;
  if (repop->applies_with_commit) {
    // commit implies apply for this backend; applied must not be set yet
    assert(!repop->all_applied);
    repop->all_applied = true;
  if (!repop->rep_aborted) {
    if (repop->v != eversion_t()) {
      // advance the on-disk version watermarks
      last_update_ondisk = repop->v;
      last_complete_ondisk = repop->pg_local_last_complete;
    // NOTE(review): eval_repop call elided in this excerpt
// A local transaction has been applied up to `applied_version`. Advances
// last_update_applied (must be monotonic and within the log) and, if a
// scrub is waiting for this update, requeues the scrub or a pending
// replica scrub message.
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / else-branches below are gaps in this copy, not in
// the upstream source.
void PrimaryLogPG::op_applied(const eversion_t &applied_version)
  dout(10) << "op_applied version " << applied_version << dendl;
  // null version: nothing to account (early-out elided in excerpt)
  if (applied_version == eversion_t())
  assert(applied_version > last_update_applied);
  assert(applied_version <= info.last_update);
  last_update_applied = applied_version;
  if (scrubber.active) {
    if (last_update_applied >= scrubber.subset_last_update) {
      // the chunk the scrubber was waiting on is now fully applied
      if (ops_blocked_by_scrub()) {
	requeue_scrub(true);
	requeue_scrub(false);
    // scrub active but no chunk in flight: range must be empty
    assert(scrubber.start == scrubber.end);
  if (scrubber.active_rep_scrub) {
    if (last_update_applied >= static_cast<const MOSDRepScrub*>(
	  scrubber.active_rep_scrub->get_req())->scrub_to) {
      // replica scrub was waiting for this apply; requeue it
	PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
      scrubber.active_rep_scrub = OpRequestRef();
// Re-evaluate a repop after an apply/commit event: fire on_committed /
// on_applied callbacks, send duplicate-commit replies in order, and once
// both applied and committed, mark it done and retire finished repops
// from the front of the queue (repops complete strictly in order).
// NOTE(review): this excerpt of the file has interior lines elided;
// missing braces / callback invocations below are gaps in this copy,
// not in the upstream source.
void PrimaryLogPG::eval_repop(RepGather *repop)
  const MOSDOp *m = NULL;
    m = static_cast<const MOSDOp *>(repop->op->get_req());
    dout(10) << "eval_repop " << *repop
	     << (repop->rep_done ? " DONE" : "")
    dout(10) << "eval_repop " << *repop << " (no op)"
	     << (repop->rep_done ? " DONE" : "")
  // already fully processed (early-out elided in excerpt)
  if (repop->rep_done)
  if (repop->all_committed) {
    dout(10) << " commit: " << *repop << dendl;
    // fire and drain on_committed callbacks; erase-in-loop via post-increment
    for (auto p = repop->on_committed.begin();
	 p != repop->on_committed.end();
	 repop->on_committed.erase(p++)) {
    // send dup commits, in order
    if (waiting_for_ondisk.count(repop->v)) {
      // dup replies must go out in version order, so ours must be first
      assert(waiting_for_ondisk.begin()->first == repop->v);
      for (list<pair<OpRequestRef, version_t> >::iterator i =
	     waiting_for_ondisk[repop->v].begin();
	   i != waiting_for_ondisk[repop->v].end();
	osd->reply_op_error(i->first, repop->r, repop->v,
      waiting_for_ondisk.erase(repop->v);
  if (repop->all_applied) {
    if (repop->applies_with_commit) {
      // apply callbacks were folded into commit; none should be queued
      assert(repop->on_applied.empty());
    dout(10) << " applied: " << *repop << " " << dendl;
    for (auto p = repop->on_applied.begin();
	 p != repop->on_applied.end();
	 repop->on_applied.erase(p++)) {
  if (repop->all_applied && repop->all_committed) {
    repop->rep_done = true;
    publish_stats_to_osd();
    calc_min_last_complete_ondisk();
    dout(10) << " removing " << *repop << dendl;
    assert(!repop_queue.empty());
    dout(20) << " q front is " << *repop_queue.front() << dendl;
    if (repop_queue.front() != repop) {
      // out-of-order completion is only tolerated for applies_with_commit
      if (!repop->applies_with_commit) {
	dout(0) << " removing " << *repop << dendl;
	dout(0) << " q front is " << *repop_queue.front() << dendl;
	assert(repop_queue.front() == repop);
    // retire every finished repop from the queue front, firing on_success
    RepGather *to_remove = nullptr;
    while (!repop_queue.empty() &&
	   (to_remove = repop_queue.front())->rep_done) {
      repop_queue.pop_front();
      for (auto p = to_remove->on_success.begin();
	   p != to_remove->on_success.end();
	   to_remove->on_success.erase(p++)) {
      remove_repop(to_remove);
9422 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
9425 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
9426 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
9430 repop
->v
= ctx
->at_version
;
9431 if (ctx
->at_version
> eversion_t()) {
9432 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
9433 i
!= actingbackfill
.end();
9435 if (*i
== get_primary()) continue;
9436 pg_info_t
&pinfo
= peer_info
[*i
];
9437 // keep peer_info up to date
9438 if (pinfo
.last_complete
== pinfo
.last_update
)
9439 pinfo
.last_complete
= ctx
->at_version
;
9440 pinfo
.last_update
= ctx
->at_version
;
9444 ctx
->obc
->ondisk_write_lock();
9446 bool unlock_snapset_obc
= false;
9447 ctx
->op_t
->add_obc(ctx
->obc
);
9448 if (ctx
->clone_obc
) {
9449 ctx
->clone_obc
->ondisk_write_lock();
9450 ctx
->op_t
->add_obc(ctx
->clone_obc
);
9452 if (ctx
->snapset_obc
&& ctx
->snapset_obc
->obs
.oi
.soid
!=
9453 ctx
->obc
->obs
.oi
.soid
) {
9454 ctx
->snapset_obc
->ondisk_write_lock();
9455 unlock_snapset_obc
= true;
9456 ctx
->op_t
->add_obc(ctx
->snapset_obc
);
9459 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
9460 Context
*on_all_applied
= new C_OSD_RepopApplied(this, repop
);
9461 Context
*onapplied_sync
= new C_OSD_OndiskWriteUnlock(
9464 unlock_snapset_obc
? ctx
->snapset_obc
: ObjectContextRef());
9465 if (!(ctx
->log
.empty())) {
9466 assert(ctx
->at_version
>= projected_last_update
);
9467 projected_last_update
= ctx
->at_version
;
9469 for (auto &&entry
: ctx
->log
) {
9470 projected_log
.add(entry
);
9472 pgbackend
->submit_transaction(
9476 std::move(ctx
->op_t
),
9478 min_last_complete_ondisk
,
9480 ctx
->updated_hset_history
,
9489 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
9490 OpContext
*ctx
, ObjectContextRef obc
,
9494 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
9496 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
9498 RepGather
*repop
= new RepGather(
9499 ctx
, rep_tid
, info
.last_complete
, false);
9501 repop
->start
= ceph_clock_now();
9503 repop_queue
.push_back(&repop
->queue_item
);
9506 osd
->logger
->inc(l_osd_op_wip
);
9508 dout(10) << __func__
<< ": " << *repop
<< dendl
;
9512 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
9515 ObcLockManager
&&manager
,
9517 boost::optional
<std::function
<void(void)> > &&on_complete
)
9519 RepGather
*repop
= new RepGather(
9522 std::move(on_complete
),
9529 repop
->start
= ceph_clock_now();
9531 repop_queue
.push_back(&repop
->queue_item
);
9533 osd
->logger
->inc(l_osd_op_wip
);
9535 dout(10) << __func__
<< ": " << *repop
<< dendl
;
9536 return boost::intrusive_ptr
<RepGather
>(repop
);
9539 void PrimaryLogPG::remove_repop(RepGather
*repop
)
9541 dout(20) << __func__
<< " " << *repop
<< dendl
;
9543 for (auto p
= repop
->on_finish
.begin();
9544 p
!= repop
->on_finish
.end();
9545 repop
->on_finish
.erase(p
++)) {
9549 release_object_locks(
9550 repop
->lock_manager
);
9553 osd
->logger
->dec(l_osd_op_wip
);
9556 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
9558 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
9559 ceph_tid_t rep_tid
= osd
->get_tid();
9560 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
9561 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
9562 ctx
->op_t
.reset(new PGTransaction());
9563 ctx
->mtime
= ceph_clock_now();
9567 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
9569 RepGather
*repop
= new_repop(ctx
.get(), ctx
->obc
, ctx
->reqid
.tid
);
9570 dout(20) << __func__
<< " " << repop
<< dendl
;
9571 issue_repop(repop
, ctx
.get());
9578 void PrimaryLogPG::submit_log_entries(
9579 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
9580 ObcLockManager
&&manager
,
9581 boost::optional
<std::function
<void(void)> > &&_on_complete
,
9585 dout(10) << __func__
<< " " << entries
<< dendl
;
9586 assert(is_primary());
9589 if (!entries
.empty()) {
9590 assert(entries
.rbegin()->version
>= projected_last_update
);
9591 version
= projected_last_update
= entries
.rbegin()->version
;
9594 boost::intrusive_ptr
<RepGather
> repop
;
9595 boost::optional
<std::function
<void(void)> > on_complete
;
9596 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9602 std::move(_on_complete
));
9604 on_complete
= std::move(_on_complete
);
9607 pgbackend
->call_write_ordered(
9608 [this, entries
, repop
, on_complete
]() {
9609 ObjectStore::Transaction t
;
9610 eversion_t old_last_update
= info
.last_update
;
9611 merge_new_log_entries(entries
, t
, pg_trim_to
, min_last_complete_ondisk
);
9614 set
<pg_shard_t
> waiting_on
;
9615 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
9616 i
!= actingbackfill
.end();
9618 pg_shard_t
peer(*i
);
9619 if (peer
== pg_whoami
) continue;
9620 assert(peer_missing
.count(peer
));
9621 assert(peer_info
.count(peer
));
9622 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9624 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
9626 spg_t(info
.pgid
.pgid
, i
->shard
),
9628 get_osdmap()->get_epoch(),
9632 min_last_complete_ondisk
);
9633 osd
->send_message_osd_cluster(
9634 peer
.osd
, m
, get_osdmap()->get_epoch());
9635 waiting_on
.insert(peer
);
9637 MOSDPGLog
*m
= new MOSDPGLog(
9638 peer
.shard
, pg_whoami
.shard
,
9639 info
.last_update
.epoch
,
9641 m
->log
.log
= entries
;
9642 m
->log
.tail
= old_last_update
;
9643 m
->log
.head
= info
.last_update
;
9644 osd
->send_message_osd_cluster(
9645 peer
.osd
, m
, get_osdmap()->get_epoch());
9648 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9649 ceph_tid_t rep_tid
= repop
->rep_tid
;
9650 waiting_on
.insert(pg_whoami
);
9651 log_entry_update_waiting_on
.insert(
9654 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
9656 struct OnComplete
: public Context
{
9664 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
9665 void finish(int) override
{
9667 if (!pg
->pg_has_reset_since(epoch
)) {
9668 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
9669 assert(it
!= pg
->log_entry_update_waiting_on
.end());
9670 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
9671 assert(it2
!= it
->second
.waiting_on
.end());
9672 it
->second
.waiting_on
.erase(it2
);
9673 if (it
->second
.waiting_on
.empty()) {
9674 pg
->repop_all_committed(it
->second
.repop
.get());
9675 pg
->log_entry_update_waiting_on
.erase(it
);
9681 t
.register_on_commit(
9682 new OnComplete
{this, rep_tid
, get_osdmap()->get_epoch()});
9685 struct OnComplete
: public Context
{
9687 std::function
<void(void)> on_complete
;
9691 const std::function
<void(void)> &on_complete
,
9694 on_complete(std::move(on_complete
)),
9696 void finish(int) override
{
9698 if (!pg
->pg_has_reset_since(epoch
))
9703 t
.register_on_complete(
9705 this, *on_complete
, get_osdmap()->get_epoch()
9709 t
.register_on_applied(
9710 new C_OSD_OnApplied
{this, get_osdmap()->get_epoch(), info
.last_update
});
9711 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
9718 void PrimaryLogPG::cancel_log_updates()
9720 // get rid of all the LogUpdateCtx so their references to repops are
9722 log_entry_update_waiting_on
.clear();
9725 // -------------------------------------------------------
9727 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> &pg_watchers
)
9729 pair
<hobject_t
, ObjectContextRef
> i
;
9730 while (object_contexts
.get_next(i
.first
, &i
)) {
9731 ObjectContextRef
obc(i
.second
);
9732 get_obc_watchers(obc
, pg_watchers
);
9736 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
9738 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
9739 obc
->watchers
.begin();
9740 j
!= obc
->watchers
.end();
9742 obj_watch_item_t owi
;
9744 owi
.obj
= obc
->obs
.oi
.soid
;
9745 owi
.wi
.addr
= j
->second
->get_peer_addr();
9746 owi
.wi
.name
= j
->second
->get_entity();
9747 owi
.wi
.cookie
= j
->second
->get_cookie();
9748 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
9750 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
9751 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
9753 pg_watchers
.push_back(owi
);
9757 void PrimaryLogPG::check_blacklisted_watchers()
9759 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl
;
9760 pair
<hobject_t
, ObjectContextRef
> i
;
9761 while (object_contexts
.get_next(i
.first
, &i
))
9762 check_blacklisted_obc_watchers(i
.second
);
9765 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc
)
9767 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
9768 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
9769 obc
->watchers
.begin();
9770 k
!= obc
->watchers
.end();
9772 //Advance iterator now so handle_watch_timeout() can erase element
9773 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
9774 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
9775 entity_addr_t ea
= j
->second
->get_peer_addr();
9776 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
9777 if (get_osdmap()->is_blacklisted(ea
)) {
9778 dout(10) << "watch: Found blacklisted watcher for " << ea
<< dendl
;
9779 assert(j
->second
->get_pg() == this);
9780 j
->second
->unregister_cb();
9781 handle_watch_timeout(j
->second
);
9786 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
9788 assert(is_active());
9789 assert((recovering
.count(obc
->obs
.oi
.soid
) ||
9790 !is_missing_object(obc
->obs
.oi
.soid
)) ||
9791 (pg_log
.get_log().objects
.count(obc
->obs
.oi
.soid
) && // or this is a revert... see recover_primary()
9792 pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
)->second
->op
==
9793 pg_log_entry_t::LOST_REVERT
&&
9794 pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
)->second
->reverting_to
==
9795 obc
->obs
.oi
.version
));
9797 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
9798 assert(obc
->watchers
.empty());
9799 // populate unconnected_watchers
9800 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
9801 obc
->obs
.oi
.watchers
.begin();
9802 p
!= obc
->obs
.oi
.watchers
.end();
9804 utime_t expire
= info
.stats
.last_became_active
;
9805 expire
+= p
->second
.timeout_seconds
;
9806 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
9808 Watch::makeWatchRef(
9809 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
9810 p
->first
.second
, p
->second
.addr
));
9811 watch
->disconnect();
9812 obc
->watchers
.insert(
9814 make_pair(p
->first
.first
, p
->first
.second
),
9817 // Look for watchers from blacklisted clients and drop
9818 check_blacklisted_obc_watchers(obc
);
9821 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
9823 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
9824 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
9827 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
9830 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
9831 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
9832 watch
->get_delayed_cb()
9834 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9840 if (write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
9841 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9844 scrubber
.add_callback(
9845 watch
->get_delayed_cb() // This callback!
9850 OpContextUPtr ctx
= simple_opc_create(obc
);
9851 ctx
->at_version
= get_next_version();
9853 object_info_t
& oi
= ctx
->new_obs
.oi
;
9854 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
9855 watch
->get_entity()));
9857 list
<watch_disconnect_t
> watch_disconnects
= {
9858 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
9860 ctx
->register_on_success(
9861 [this, obc
, watch_disconnects
]() {
9862 complete_disconnect_watches(obc
, watch_disconnects
);
9866 PGTransaction
*t
= ctx
->op_t
.get();
9867 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
9871 osd_reqid_t(), ctx
->mtime
, 0));
9873 oi
.prior_version
= obc
->obs
.oi
.version
;
9874 oi
.version
= ctx
->at_version
;
9876 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
9877 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
9879 // apply new object state.
9880 ctx
->obc
->obs
= ctx
->new_obs
;
9882 // no ctx->delta_stats
9883 simple_opc_submit(std::move(ctx
));
9886 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
9887 SnapSetContext
*ssc
)
9889 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
9890 assert(obc
->destructor_callback
== NULL
);
9891 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
9893 obc
->obs
.exists
= false;
9896 register_snapset_context(ssc
);
9897 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
9899 populate_obc_watchers(obc
);
9903 ObjectContextRef
PrimaryLogPG::get_object_context(
9904 const hobject_t
& soid
,
9906 const map
<string
, bufferlist
> *attrs
)
9909 attrs
|| !pg_log
.get_missing().is_missing(soid
) ||
9910 // or this is a revert... see recover_primary()
9911 (pg_log
.get_log().objects
.count(soid
) &&
9912 pg_log
.get_log().objects
.find(soid
)->second
->op
==
9913 pg_log_entry_t::LOST_REVERT
));
9914 ObjectContextRef obc
= object_contexts
.lookup(soid
);
9915 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
9917 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
9918 dout(10) << __func__
<< ": found obc in cache: " << obc
9921 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
9925 assert(attrs
->count(OI_ATTR
));
9926 bv
= attrs
->find(OI_ATTR
)->second
;
9928 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
9931 dout(10) << __func__
<< ": no obc for soid "
9932 << soid
<< " and !can_create"
9934 return ObjectContextRef(); // -ENOENT!
9937 dout(10) << __func__
<< ": no obc for soid "
9938 << soid
<< " but can_create"
9941 object_info_t
oi(soid
);
9942 SnapSetContext
*ssc
= get_snapset_context(
9943 soid
, true, 0, false);
9945 obc
= create_object_context(oi
, ssc
);
9946 dout(10) << __func__
<< ": " << obc
<< " " << soid
9947 << " " << obc
->rwstate
9948 << " oi: " << obc
->obs
.oi
9949 << " ssc: " << obc
->ssc
9950 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
9957 bufferlist::iterator bliter
= bv
.begin();
9958 ::decode(oi
, bliter
);
9960 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
9961 return ObjectContextRef(); // -ENOENT!
9964 assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
9966 obc
= object_contexts
.lookup_or_create(oi
.soid
);
9967 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
9969 obc
->obs
.exists
= true;
9971 obc
->ssc
= get_snapset_context(
9973 soid
.has_snapset() ? attrs
: 0);
9976 populate_obc_watchers(obc
);
9978 if (pool
.info
.require_rollback()) {
9980 obc
->attr_cache
= *attrs
;
9982 int r
= pgbackend
->objects_get_attrs(
9989 dout(10) << __func__
<< ": creating obc from disk: " << obc
9993 // XXX: Caller doesn't expect this
9994 if (obc
->ssc
== NULL
) {
9995 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
9996 return ObjectContextRef(); // -ENOENT!
9999 dout(10) << __func__
<< ": " << obc
<< " " << soid
10000 << " " << obc
->rwstate
10001 << " oi: " << obc
->obs
.oi
10002 << " exists: " << (int)obc
->obs
.exists
10003 << " ssc: " << obc
->ssc
10004 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
10008 void PrimaryLogPG::context_registry_on_change()
10010 pair
<hobject_t
, ObjectContextRef
> i
;
10011 while (object_contexts
.get_next(i
.first
, &i
)) {
10012 ObjectContextRef
obc(i
.second
);
10014 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
10015 obc
->watchers
.begin();
10016 j
!= obc
->watchers
.end();
10017 obc
->watchers
.erase(j
++)) {
10018 j
->second
->discard();
10026 * If we return an error, and set *pmissing, then promoting that
10029 * If we return -EAGAIN, we will always set *pmissing to the missing
10030 * object to wait for.
10032 * If we return an error but do not set *pmissing, then we know the
10033 * object does not exist.
10035 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
10036 ObjectContextRef
*pobc
,
10038 bool map_snapid_to_clone
,
10039 hobject_t
*pmissing
)
10042 assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
10044 if (oid
.snap
== CEPH_NOSNAP
) {
10045 ObjectContextRef obc
= get_object_context(oid
, can_create
);
10051 dout(10) << "find_object_context " << oid
10052 << " @" << oid
.snap
10053 << " oi=" << obc
->obs
.oi
10060 hobject_t head
= oid
.get_head();
10062 // want the snapdir?
10063 if (oid
.snap
== CEPH_SNAPDIR
) {
10064 // return head or snapdir, whichever exists.
10065 ObjectContextRef headobc
= get_object_context(head
, can_create
);
10066 ObjectContextRef obc
= headobc
;
10067 if (!obc
|| !obc
->obs
.exists
)
10068 obc
= get_object_context(oid
, can_create
);
10069 if (!obc
|| !obc
->obs
.exists
) {
10070 // if we have neither, we would want to promote the head.
10074 *pobc
= headobc
; // may be null
10077 dout(10) << "find_object_context " << oid
10078 << " @" << oid
.snap
10079 << " oi=" << obc
->obs
.oi
10083 // always populate ssc for SNAPDIR...
10085 obc
->ssc
= get_snapset_context(
10091 if (!map_snapid_to_clone
&& pool
.info
.is_removed_snap(oid
.snap
)) {
10092 dout(10) << __func__
<< " snap " << oid
.snap
<< " is removed" << dendl
;
10096 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
10097 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
10098 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
10100 *pmissing
= head
; // start by getting the head
10102 put_snapset_context(ssc
);
10106 if (map_snapid_to_clone
) {
10107 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10108 << " snapset " << ssc
->snapset
10109 << " map_snapid_to_clone=true" << dendl
;
10110 if (oid
.snap
> ssc
->snapset
.seq
) {
10111 // already must be readable
10112 ObjectContextRef obc
= get_object_context(head
, false);
10113 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10114 << " snapset " << ssc
->snapset
10115 << " maps to head" << dendl
;
10117 put_snapset_context(ssc
);
10118 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
10120 vector
<snapid_t
>::const_iterator citer
= std::find(
10121 ssc
->snapset
.clones
.begin(),
10122 ssc
->snapset
.clones
.end(),
10124 if (citer
== ssc
->snapset
.clones
.end()) {
10125 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10126 << " snapset " << ssc
->snapset
10127 << " maps to nothing" << dendl
;
10128 put_snapset_context(ssc
);
10132 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10133 << " snapset " << ssc
->snapset
10134 << " maps to " << oid
<< dendl
;
10136 if (pg_log
.get_missing().is_missing(oid
)) {
10137 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10138 << " snapset " << ssc
->snapset
10139 << " " << oid
<< " is missing" << dendl
;
10142 put_snapset_context(ssc
);
10146 ObjectContextRef obc
= get_object_context(oid
, false);
10147 if (!obc
|| !obc
->obs
.exists
) {
10148 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10149 << " snapset " << ssc
->snapset
10150 << " " << oid
<< " is not present" << dendl
;
10153 put_snapset_context(ssc
);
10156 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10157 << " snapset " << ssc
->snapset
10158 << " " << oid
<< " HIT" << dendl
;
10160 put_snapset_context(ssc
);
10163 ceph_abort(); //unreachable
10166 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
10167 << " snapset " << ssc
->snapset
<< dendl
;
10170 if (oid
.snap
> ssc
->snapset
.seq
) {
10171 if (ssc
->snapset
.head_exists
) {
10172 ObjectContextRef obc
= get_object_context(head
, false);
10173 dout(10) << "find_object_context " << head
10174 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
10175 << " -- HIT " << obc
->obs
10180 assert(ssc
== obc
->ssc
);
10181 put_snapset_context(ssc
);
10186 dout(10) << "find_object_context " << head
10187 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
10188 << " but head dne -- DNE"
10190 put_snapset_context(ssc
);
10194 // which clone would it be?
10196 while (k
< ssc
->snapset
.clones
.size() &&
10197 ssc
->snapset
.clones
[k
] < oid
.snap
)
10199 if (k
== ssc
->snapset
.clones
.size()) {
10200 dout(10) << "find_object_context no clones with last >= oid.snap "
10201 << oid
.snap
<< " -- DNE" << dendl
;
10202 put_snapset_context(ssc
);
10205 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
10206 info
.pgid
.pool(), oid
.get_namespace());
10208 if (pg_log
.get_missing().is_missing(soid
)) {
10209 dout(20) << "find_object_context " << soid
<< " missing, try again later"
10213 put_snapset_context(ssc
);
10217 ObjectContextRef obc
= get_object_context(soid
, false);
10218 if (!obc
|| !obc
->obs
.exists
) {
10221 put_snapset_context(ssc
);
10222 if (is_degraded_or_backfilling_object(soid
)) {
10223 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
10226 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
10234 assert(obc
->ssc
== ssc
);
10235 put_snapset_context(ssc
);
10240 dout(20) << "find_object_context " << soid
10241 << " snapset " << obc
->ssc
->snapset
10242 << " legacy_snaps " << obc
->obs
.oi
.legacy_snaps
10244 snapid_t first
, last
;
10245 if (obc
->ssc
->snapset
.is_legacy()) {
10246 first
= obc
->obs
.oi
.legacy_snaps
.back();
10247 last
= obc
->obs
.oi
.legacy_snaps
.front();
10249 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
10250 assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
10251 if (p
->second
.empty()) {
10252 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
10253 assert(!cct
->_conf
->osd_debug_verify_snaps
);
10256 first
= p
->second
.back();
10257 last
= p
->second
.front();
10259 if (first
<= oid
.snap
) {
10260 dout(20) << "find_object_context " << soid
<< " [" << first
<< "," << last
10261 << "] contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
10265 dout(20) << "find_object_context " << soid
<< " [" << first
<< "," << last
10266 << "] does not contain " << oid
.snap
<< " -- DNE" << dendl
;
10271 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
10274 put_snapset_context(obc
->ssc
);
10277 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
10279 object_info_t
& oi
= obc
->obs
.oi
;
10281 dout(10) << "add_object_context_to_pg_stat " << oi
.soid
<< dendl
;
10282 object_stat_sum_t stat
;
10284 stat
.num_bytes
+= oi
.size
;
10286 if (oi
.soid
.snap
!= CEPH_SNAPDIR
)
10287 stat
.num_objects
++;
10289 stat
.num_objects_dirty
++;
10290 if (oi
.is_whiteout())
10291 stat
.num_whiteouts
++;
10293 stat
.num_objects_omap
++;
10294 if (oi
.is_cache_pinned())
10295 stat
.num_objects_pinned
++;
10297 if (oi
.soid
.snap
&& oi
.soid
.snap
!= CEPH_NOSNAP
&& oi
.soid
.snap
!= CEPH_SNAPDIR
) {
10298 stat
.num_object_clones
++;
10301 obc
->ssc
= get_snapset_context(oi
.soid
, false);
10304 // subtract off clone overlap
10305 if (obc
->ssc
->snapset
.clone_overlap
.count(oi
.soid
.snap
)) {
10306 interval_set
<uint64_t>& o
= obc
->ssc
->snapset
.clone_overlap
[oi
.soid
.snap
];
10307 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
10310 stat
.num_bytes
-= r
.get_len();
10316 pgstat
->stats
.sum
.add(stat
);
10319 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
10321 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10322 if (obc
->is_blocked()) {
10323 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
10327 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
10328 if (p
!= waiting_for_blocked_object
.end()) {
10329 list
<OpRequestRef
>& ls
= p
->second
;
10330 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
10332 waiting_for_blocked_object
.erase(p
);
10335 map
<hobject_t
, ObjectContextRef
>::iterator i
=
10336 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
10337 if (i
!= objects_blocked_on_snap_promotion
.end()) {
10338 assert(i
->second
== obc
);
10339 objects_blocked_on_snap_promotion
.erase(i
);
10342 if (obc
->requeue_scrub_on_unblock
) {
10343 obc
->requeue_scrub_on_unblock
= false;
10348 SnapSetContext
*PrimaryLogPG::get_snapset_context(
10349 const hobject_t
& oid
,
10351 const map
<string
, bufferlist
> *attrs
,
10354 Mutex::Locker
l(snapset_contexts_lock
);
10355 SnapSetContext
*ssc
;
10356 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
10357 oid
.get_snapdir());
10358 if (p
!= snapset_contexts
.end()) {
10359 if (can_create
|| p
->second
->exists
) {
10368 if (!(oid
.is_head() && !oid_existed
))
10369 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
10372 if (!(oid
.is_snapdir() && !oid_existed
))
10373 r
= pgbackend
->objects_get_attr(oid
.get_snapdir(), SS_ATTR
, &bv
);
10374 if (r
< 0 && !can_create
)
10378 assert(attrs
->count(SS_ATTR
));
10379 bv
= attrs
->find(SS_ATTR
)->second
;
10381 ssc
= new SnapSetContext(oid
.get_snapdir());
10382 _register_snapset_context(ssc
);
10384 bufferlist::iterator bvp
= bv
.begin();
10386 ssc
->snapset
.decode(bvp
);
10387 } catch (buffer::error
& e
) {
10388 dout(0) << __func__
<< " Can't decode snapset: " << e
<< dendl
;
10391 ssc
->exists
= true;
10393 ssc
->exists
= false;
10401 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
10403 Mutex::Locker
l(snapset_contexts_lock
);
10405 if (ssc
->ref
== 0) {
10406 if (ssc
->registered
)
10407 snapset_contexts
.erase(ssc
->oid
);
10412 /** pull - request object from a peer
10417 * NONE - didn't pull anything
10418 * YES - pulled what the caller wanted
10419 * OTHER - needed to pull something else first (_head or _snapdir)
10421 enum { PULL_NONE
, PULL_OTHER
, PULL_YES
};
10423 int PrimaryLogPG::recover_missing(
10424 const hobject_t
&soid
, eversion_t v
,
10426 PGBackend::RecoveryHandle
*h
)
10428 if (missing_loc
.is_unfound(soid
)) {
10429 dout(7) << "pull " << soid
10431 << " but it is unfound" << dendl
;
10435 if (missing_loc
.is_deleted(soid
)) {
10436 start_recovery_op(soid
);
10437 assert(!recovering
.count(soid
));
10438 recovering
.insert(make_pair(soid
, ObjectContextRef()));
10439 epoch_t cur_epoch
= get_osdmap()->get_epoch();
10440 remove_missing_object(soid
, v
, new FunctionContext(
10443 if (!pg_has_reset_since(cur_epoch
)) {
10444 bool object_missing
= false;
10445 for (const auto& shard
: actingbackfill
) {
10446 if (shard
== pg_whoami
)
10448 if (peer_missing
[shard
].is_missing(soid
)) {
10449 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
10450 object_missing
= true;
10454 if (!object_missing
) {
10455 object_stat_sum_t stat_diff
;
10456 stat_diff
.num_objects_recovered
= 1;
10457 on_global_recover(soid
, stat_diff
, true);
10459 auto recovery_handle
= pgbackend
->open_recovery_op();
10460 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
10461 pgbackend
->run_recovery_op(recovery_handle
, priority
);
10469 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
10470 ObjectContextRef obc
;
10471 ObjectContextRef head_obc
;
10472 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
10473 // do we have the head and/or snapdir?
10474 hobject_t head
= soid
.get_head();
10475 if (pg_log
.get_missing().is_missing(head
)) {
10476 if (recovering
.count(head
)) {
10477 dout(10) << " missing but already recovering head " << head
<< dendl
;
10480 int r
= recover_missing(
10481 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
10483 if (r
!= PULL_NONE
)
10488 head
= soid
.get_snapdir();
10489 if (pg_log
.get_missing().is_missing(head
)) {
10490 if (recovering
.count(head
)) {
10491 dout(10) << " missing but already recovering snapdir " << head
<< dendl
;
10494 int r
= recover_missing(
10495 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
10497 if (r
!= PULL_NONE
)
10503 // we must have one or the other
10504 head_obc
= get_object_context(
10509 head_obc
= get_object_context(
10510 soid
.get_snapdir(),
10515 start_recovery_op(soid
);
10516 assert(!recovering
.count(soid
));
10517 recovering
.insert(make_pair(soid
, obc
));
10518 int r
= pgbackend
->recover_object(
10524 // This is only a pull which shouldn't return an error
10529 void PrimaryLogPG::send_remove_op(
10530 const hobject_t
& oid
, eversion_t v
, pg_shard_t peer
)
10532 ceph_tid_t tid
= osd
->get_tid();
10533 osd_reqid_t
rid(osd
->get_cluster_msgr_name(), 0, tid
);
10535 dout(10) << "send_remove_op " << oid
<< " from osd." << peer
10536 << " tid " << tid
<< dendl
;
10538 MOSDSubOp
*subop
= new MOSDSubOp(
10539 rid
, pg_whoami
, spg_t(info
.pgid
.pgid
, peer
.shard
),
10540 oid
, CEPH_OSD_FLAG_ACK
,
10541 get_osdmap()->get_epoch(), tid
, v
);
10542 subop
->ops
= vector
<OSDOp
>(1);
10543 subop
->ops
[0].op
.op
= CEPH_OSD_OP_DELETE
;
10545 osd
->send_message_osd_cluster(peer
.osd
, subop
, get_osdmap()->get_epoch());
10548 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
10549 eversion_t v
, Context
*on_complete
)
10551 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
10552 assert(on_complete
!= nullptr);
10554 ObjectStore::Transaction t
;
10555 remove_snap_mapped_object(t
, soid
);
10557 ObjectRecoveryInfo recovery_info
;
10558 recovery_info
.soid
= soid
;
10559 recovery_info
.version
= v
;
10561 epoch_t cur_epoch
= get_osdmap()->get_epoch();
10562 t
.register_on_complete(new FunctionContext(
10565 if (!pg_has_reset_since(cur_epoch
)) {
10566 ObjectStore::Transaction t2
;
10567 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
10568 t2
.register_on_complete(on_complete
);
10569 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t2
), nullptr);
10574 on_complete
->complete(-EAGAIN
);
10577 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
10581 void PrimaryLogPG::finish_degraded_object(const hobject_t
& oid
)
10583 dout(10) << "finish_degraded_object " << oid
<< dendl
;
10584 if (callbacks_for_degraded_object
.count(oid
)) {
10585 list
<Context
*> contexts
;
10586 contexts
.swap(callbacks_for_degraded_object
[oid
]);
10587 callbacks_for_degraded_object
.erase(oid
);
10588 for (list
<Context
*>::iterator i
= contexts
.begin();
10589 i
!= contexts
.end();
10594 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
10596 if (i
!= objects_blocked_on_degraded_snap
.end() &&
10597 i
->second
== oid
.snap
)
10598 objects_blocked_on_degraded_snap
.erase(i
);
10601 void PrimaryLogPG::_committed_pushed_object(
10602 epoch_t epoch
, eversion_t last_complete
)
10605 if (!pg_has_reset_since(epoch
)) {
10606 dout(10) << "_committed_pushed_object last_complete " << last_complete
<< " now ondisk" << dendl
;
10607 last_complete_ondisk
= last_complete
;
10609 if (last_complete_ondisk
== info
.last_update
) {
10610 if (!is_primary()) {
10611 // Either we are a replica or backfill target.
10612 // we are fully up to date. tell the primary!
10613 osd
->send_message_osd_cluster(
10616 get_osdmap()->get_epoch(),
10617 spg_t(info
.pgid
.pgid
, get_primary().shard
),
10618 last_complete_ondisk
),
10619 get_osdmap()->get_epoch());
10621 calc_min_last_complete_ondisk();
10626 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl
;
10632 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
10635 dout(20) << __func__
<< dendl
;
10637 dout(20) << "obc = " << *obc
<< dendl
;
10639 assert(active_pushes
>= 1);
10642 // requeue an active chunky scrub waiting on recovery ops
10643 if (!deleting
&& active_pushes
== 0
10644 && scrubber
.is_chunky_scrub_active()) {
10645 if (ops_blocked_by_scrub()) {
10646 requeue_scrub(true);
10648 requeue_scrub(false);
10654 void PrimaryLogPG::_applied_recovered_object_replica()
10657 dout(20) << __func__
<< dendl
;
10658 assert(active_pushes
>= 1);
10661 // requeue an active chunky scrub waiting on recovery ops
10662 if (!deleting
&& active_pushes
== 0 &&
10663 scrubber
.active_rep_scrub
&& static_cast<const MOSDRepScrub
*>(
10664 scrubber
.active_rep_scrub
->get_req())->chunky
) {
10667 PGQueueable(scrubber
.active_rep_scrub
, get_osdmap()->get_epoch()));
10668 scrubber
.active_rep_scrub
= OpRequestRef();
10673 void PrimaryLogPG::recover_got(hobject_t oid
, eversion_t v
)
10675 dout(10) << "got missing " << oid
<< " v " << v
<< dendl
;
10676 pg_log
.recover_got(oid
, v
, info
);
10677 if (pg_log
.get_log().complete_to
!= pg_log
.get_log().log
.end()) {
10678 dout(10) << "last_complete now " << info
.last_complete
10679 << " log.complete_to " << pg_log
.get_log().complete_to
->version
10682 dout(10) << "last_complete now " << info
.last_complete
10683 << " log.complete_to at end" << dendl
;
10684 //below is not true in the repair case.
10685 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10686 assert(info
.last_complete
== info
.last_update
);
10690 void PrimaryLogPG::primary_failed(const hobject_t
&soid
)
10692 list
<pg_shard_t
> fl
= { pg_whoami
};
10693 failed_push(fl
, soid
);
10696 void PrimaryLogPG::failed_push(const list
<pg_shard_t
> &from
, const hobject_t
&soid
)
10698 dout(20) << __func__
<< ": " << soid
<< dendl
;
10699 assert(recovering
.count(soid
));
10700 auto obc
= recovering
[soid
];
10702 list
<OpRequestRef
> blocked_ops
;
10703 obc
->drop_recovery_read(&blocked_ops
);
10704 requeue_ops(blocked_ops
);
10706 recovering
.erase(soid
);
10707 for (auto&& i
: from
)
10708 missing_loc
.remove_location(soid
, i
);
10709 dout(0) << __func__
<< " " << soid
<< " from shard " << from
10710 << ", reps on " << missing_loc
.get_locations(soid
)
10711 << " unfound? " << missing_loc
.is_unfound(soid
) << dendl
;
10712 finish_recovery_op(soid
); // close out this attempt,
10715 void PrimaryLogPG::sub_op_remove(OpRequestRef op
)
10717 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
10718 assert(m
->get_type() == MSG_OSD_SUBOP
);
10719 dout(7) << "sub_op_remove " << m
->poid
<< dendl
;
10721 op
->mark_started();
10723 ObjectStore::Transaction t
;
10724 remove_snap_mapped_object(t
, m
->poid
);
10725 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
10729 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
10732 pg_missing_item pmi
;
10733 bool is_missing
= pg_log
.get_missing().is_missing(oid
, &pmi
);
10734 assert(is_missing
);
10736 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
10738 assert(!actingbackfill
.empty());
10739 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
10740 i
!= actingbackfill
.end();
10742 if (*i
== get_primary()) continue;
10743 pg_shard_t peer
= *i
;
10744 if (!peer_missing
[peer
].is_missing(oid
)) {
10747 eversion_t h
= peer_missing
[peer
].get_items().at(oid
).have
;
10748 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
10753 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
10757 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
10759 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
10761 assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
10762 ObjectStore::Transaction t
;
10763 boost::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
10764 if (m
->pg_trim_to
!= eversion_t())
10765 op_trim_to
= m
->pg_trim_to
;
10766 if (m
->pg_roll_forward_to
!= eversion_t())
10767 op_roll_forward_to
= m
->pg_roll_forward_to
;
10769 dout(20) << __func__
<< " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
10771 append_log_entries_update_missing(m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
10772 eversion_t new_lcod
= info
.last_complete
;
10774 Context
*complete
= new FunctionContext(
10776 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
10779 if (!pg_has_reset_since(msg
->get_epoch())) {
10780 update_last_complete_ondisk(new_lcod
);
10781 MOSDPGUpdateLogMissingReply
*reply
=
10782 new MOSDPGUpdateLogMissingReply(
10783 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
10789 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
10790 msg
->get_connection()->send_message(reply
);
10795 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
10796 t
.register_on_commit(complete
);
10798 /* Hack to work around the fact that ReplicatedBackend sends
10799 * ack+commit if commit happens first
10801 * This behavior is no longer necessary, but we preserve it so old
10802 * primaries can keep their repops in order */
10803 if (pool
.info
.ec_pool()) {
10804 t
.register_on_complete(complete
);
10806 t
.register_on_commit(complete
);
10809 t
.register_on_applied(
10810 new C_OSD_OnApplied
{this, get_osdmap()->get_epoch(), info
.last_update
});
10811 int tr
= osd
->store
->queue_transaction(
10818 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
10820 const MOSDPGUpdateLogMissingReply
*m
=
10821 static_cast<const MOSDPGUpdateLogMissingReply
*>(
10823 dout(20) << __func__
<< " got reply from "
10824 << m
->get_from() << dendl
;
10826 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
10827 if (it
!= log_entry_update_waiting_on
.end()) {
10828 if (it
->second
.waiting_on
.count(m
->get_from())) {
10829 it
->second
.waiting_on
.erase(m
->get_from());
10830 if (m
->last_complete_ondisk
!= eversion_t()) {
10831 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
10835 << info
.pgid
<< " got reply "
10836 << *m
<< " from shard we are not waiting for "
10840 if (it
->second
.waiting_on
.empty()) {
10841 repop_all_committed(it
->second
.repop
.get());
10842 log_entry_update_waiting_on
.erase(it
);
10846 << info
.pgid
<< " got reply "
10847 << *m
<< " on unknown tid " << m
->get_tid();
10851 /* Mark all unfound objects as lost.
10853 void PrimaryLogPG::mark_all_unfound_lost(
10858 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
10859 list
<hobject_t
> oids
;
10861 dout(30) << __func__
<< ": log before:\n";
10862 pg_log
.get_log().print(*_dout
);
10865 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
10867 utime_t mtime
= ceph_clock_now();
10868 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
10869 missing_loc
.get_needs_recovery().begin();
10870 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
10871 missing_loc
.get_needs_recovery().end();
10873 ObcLockManager manager
;
10874 eversion_t v
= get_next_version();
10875 v
.epoch
= get_osdmap()->get_epoch();
10876 uint64_t num_unfound
= missing_loc
.num_unfound();
10877 while (m
!= mend
) {
10878 const hobject_t
&oid(m
->first
);
10879 if (!missing_loc
.is_unfound(oid
)) {
10880 // We only care about unfound objects
10885 ObjectContextRef obc
;
10889 case pg_log_entry_t::LOST_MARK
:
10890 assert(0 == "actually, not implemented yet!");
10893 case pg_log_entry_t::LOST_REVERT
:
10894 prev
= pick_newest_available(oid
);
10895 if (prev
> eversion_t()) {
10898 pg_log_entry_t::LOST_REVERT
, oid
, v
,
10899 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
10900 e
.reverting_to
= prev
;
10901 e
.mark_unrollbackable();
10902 log_entries
.push_back(e
);
10903 dout(10) << e
<< dendl
;
10905 // we are now missing the new version; recovery code will sort it out.
10911 case pg_log_entry_t::LOST_DELETE
:
10913 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
10914 0, osd_reqid_t(), mtime
, 0);
10915 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
10916 if (pool
.info
.require_rollback()) {
10917 e
.mod_desc
.try_rmobject(v
.version
);
10919 e
.mark_unrollbackable();
10921 } // otherwise, just do what we used to do
10922 dout(10) << e
<< dendl
;
10923 log_entries
.push_back(e
);
10924 oids
.push_back(oid
);
10926 // If context found mark object as deleted in case
10927 // of racing with new creation. This can happen if
10928 // object lost and EIO at primary.
10929 obc
= object_contexts
.lookup(oid
);
10931 obc
->obs
.exists
= false;
10943 info
.stats
.stats_invalid
= true;
10945 submit_log_entries(
10947 std::move(manager
),
10948 boost::optional
<std::function
<void(void)> >(
10949 [this, oids
, con
, num_unfound
, tid
]() {
10950 if (perform_deletes_during_peering()) {
10951 for (auto oid
: oids
) {
10952 // clear old locations - merge_new_log_entries will have
10953 // handled rebuilding missing_loc for each of these
10954 // objects if we have the RECOVERY_DELETES flag
10955 missing_loc
.recovered(oid
);
10959 if (is_recovery_unfound()) {
10960 queue_peering_event(
10962 std::make_shared
<CephPeeringEvt
>(
10963 get_osdmap()->get_epoch(),
10964 get_osdmap()->get_epoch(),
10966 } else if (is_backfill_unfound()) {
10967 queue_peering_event(
10969 std::make_shared
<CephPeeringEvt
>(
10970 get_osdmap()->get_epoch(),
10971 get_osdmap()->get_epoch(),
10972 RequestBackfill())));
10978 ss
<< "pg has " << num_unfound
10979 << " objects unfound and apparently lost marking";
10980 string rs
= ss
.str();
10981 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
10982 osd
->clog
->info() << rs
;
10984 MCommandReply
*reply
= new MCommandReply(0, rs
);
10985 reply
->set_tid(tid
);
10986 con
->send_message(reply
);
10992 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
10994 assert(repop_queue
.empty());
10998 * pg status change notification
11001 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
11003 list
<OpRequestRef
> rq
;
11005 // apply all repops
11006 while (!repop_queue
.empty()) {
11007 RepGather
*repop
= repop_queue
.front();
11008 repop_queue
.pop_front();
11009 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
11010 repop
->rep_aborted
= true;
11011 repop
->on_applied
.clear();
11012 repop
->on_committed
.clear();
11013 repop
->on_success
.clear();
11017 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
11018 rq
.push_back(repop
->op
);
11019 repop
->op
= OpRequestRef();
11022 // also requeue any dups, interleaved into position
11023 map
<eversion_t
, list
<pair
<OpRequestRef
, version_t
> > >::iterator p
=
11024 waiting_for_ondisk
.find(repop
->v
);
11025 if (p
!= waiting_for_ondisk
.end()) {
11026 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
11027 for (list
<pair
<OpRequestRef
, version_t
> >::iterator i
=
11029 i
!= p
->second
.end();
11031 rq
.push_back(i
->first
);
11033 waiting_for_ondisk
.erase(p
);
11037 remove_repop(repop
);
11040 assert(repop_queue
.empty());
11044 if (!waiting_for_ondisk
.empty()) {
11045 for (map
<eversion_t
, list
<pair
<OpRequestRef
, version_t
> > >::iterator i
=
11046 waiting_for_ondisk
.begin();
11047 i
!= waiting_for_ondisk
.end();
11049 for (list
<pair
<OpRequestRef
, version_t
> >::iterator j
=
11051 j
!= i
->second
.end();
11053 derr
<< __func__
<< ": op " << *(j
->first
->get_req()) << " waiting on "
11054 << i
->first
<< dendl
;
11057 assert(waiting_for_ondisk
.empty());
11061 waiting_for_ondisk
.clear();
11064 void PrimaryLogPG::on_flushed()
11066 assert(flushes_in_progress
> 0);
11067 flushes_in_progress
--;
11068 if (flushes_in_progress
== 0) {
11069 requeue_ops(waiting_for_flush
);
11071 if (!is_peered() || !is_primary()) {
11072 pair
<hobject_t
, ObjectContextRef
> i
;
11073 while (object_contexts
.get_next(i
.first
, &i
)) {
11074 derr
<< "on_flushed: object " << i
.first
<< " obc still alive" << dendl
;
11076 assert(object_contexts
.empty());
11078 pgbackend
->on_flushed();
11081 void PrimaryLogPG::on_removal(ObjectStore::Transaction
*t
)
11083 dout(10) << "on_removal" << dendl
;
11085 // adjust info to backfill
11086 info
.set_last_backfill(hobject_t());
11087 pg_log
.reset_backfill();
11092 PGLogEntryHandler rollbacker
{this, t
};
11093 pg_log
.roll_forward(&rollbacker
);
11095 write_if_dirty(*t
);
11101 void PrimaryLogPG::clear_async_reads()
11103 dout(10) << __func__
<< dendl
;
11104 for(auto& i
: in_progress_async_reads
) {
11105 dout(10) << "clear ctx: "
11106 << "OpRequestRef " << i
.first
11107 << " OpContext " << i
.second
11109 close_op_ctx(i
.second
);
11113 void PrimaryLogPG::on_shutdown()
11115 dout(10) << "on_shutdown" << dendl
;
11117 // remove from queues
11118 osd
->pg_stat_queue_dequeue(this);
11119 osd
->peering_wq
.dequeue(this);
11121 // handles queue races
11124 if (recovery_queued
) {
11125 recovery_queued
= false;
11126 osd
->clear_queued_recovery(this);
11129 clear_scrub_reserved();
11130 scrub_clear_state();
11132 unreg_next_scrub();
11134 vector
<ceph_tid_t
> tids
;
11135 cancel_copy_ops(false, &tids
);
11136 cancel_flush_ops(false, &tids
);
11137 cancel_proxy_ops(false, &tids
);
11138 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11140 apply_and_flush_repops(false);
11141 cancel_log_updates();
11142 // we must remove PGRefs, so do this this prior to release_backoffs() callers
11144 // clean up snap trim references
11145 snap_trimmer_machine
.process_event(Reset());
11147 pgbackend
->on_change();
11149 context_registry_on_change();
11150 object_contexts
.clear();
11152 clear_async_reads();
11154 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
11155 osd
->local_reserver
.cancel_reservation(info
.pgid
);
11157 clear_primary_state();
11161 void PrimaryLogPG::on_activate()
11164 if (needs_recovery()) {
11165 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
11166 queue_peering_event(
11168 std::make_shared
<CephPeeringEvt
>(
11169 get_osdmap()->get_epoch(),
11170 get_osdmap()->get_epoch(),
11172 } else if (needs_backfill()) {
11173 dout(10) << "activate queueing backfill" << dendl
;
11174 queue_peering_event(
11176 std::make_shared
<CephPeeringEvt
>(
11177 get_osdmap()->get_epoch(),
11178 get_osdmap()->get_epoch(),
11179 RequestBackfill())));
11181 dout(10) << "activate all replicas clean, no recovery" << dendl
;
11182 eio_errors_to_process
= false;
11183 queue_peering_event(
11185 std::make_shared
<CephPeeringEvt
>(
11186 get_osdmap()->get_epoch(),
11187 get_osdmap()->get_epoch(),
11188 AllReplicasRecovered())));
11191 publish_stats_to_osd();
11193 if (!backfill_targets
.empty()) {
11194 last_backfill_started
= earliest_backfill();
11195 new_backfill
= true;
11196 assert(!last_backfill_started
.is_max());
11197 dout(5) << "on activate: bft=" << backfill_targets
11198 << " from " << last_backfill_started
<< dendl
;
11199 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11200 i
!= backfill_targets
.end();
11202 dout(5) << "target shard " << *i
11203 << " from " << peer_info
[*i
].last_backfill
11212 void PrimaryLogPG::_on_new_interval()
11214 dout(20) << __func__
<< " checking missing set deletes flag. missing = " << pg_log
.get_missing() << dendl
;
11215 if (!pg_log
.get_missing().may_include_deletes
&&
11216 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
)) {
11217 pg_log
.rebuild_missing_set_with_deletes(osd
->store
, coll
, info
);
11219 assert(pg_log
.get_missing().may_include_deletes
== get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES
));
11222 void PrimaryLogPG::on_change(ObjectStore::Transaction
*t
)
11224 dout(10) << "on_change" << dendl
;
11226 if (hit_set
&& hit_set
->insert_count() == 0) {
11227 dout(20) << " discarding empty hit_set" << dendl
;
11231 if (recovery_queued
) {
11232 recovery_queued
= false;
11233 osd
->clear_queued_recovery(this);
11236 // requeue everything in the reverse order they should be
11238 requeue_ops(waiting_for_peered
);
11239 requeue_ops(waiting_for_flush
);
11240 requeue_ops(waiting_for_active
);
11242 clear_scrub_reserved();
11244 vector
<ceph_tid_t
> tids
;
11245 cancel_copy_ops(is_primary(), &tids
);
11246 cancel_flush_ops(is_primary(), &tids
);
11247 cancel_proxy_ops(is_primary(), &tids
);
11248 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11250 // requeue object waiters
11251 for (auto& p
: waiting_for_unreadable_object
) {
11252 release_backoffs(p
.first
);
11254 if (is_primary()) {
11255 requeue_object_waiters(waiting_for_unreadable_object
);
11257 waiting_for_unreadable_object
.clear();
11259 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
11260 p
!= waiting_for_degraded_object
.end();
11261 waiting_for_degraded_object
.erase(p
++)) {
11262 release_backoffs(p
->first
);
11264 requeue_ops(p
->second
);
11267 finish_degraded_object(p
->first
);
11270 // requeues waiting_for_scrub
11271 scrub_clear_state();
11273 for (auto p
= waiting_for_blocked_object
.begin();
11274 p
!= waiting_for_blocked_object
.end();
11275 waiting_for_blocked_object
.erase(p
++)) {
11277 requeue_ops(p
->second
);
11281 for (auto i
= callbacks_for_degraded_object
.begin();
11282 i
!= callbacks_for_degraded_object
.end();
11284 finish_degraded_object((i
++)->first
);
11286 assert(callbacks_for_degraded_object
.empty());
11288 if (is_primary()) {
11289 requeue_ops(waiting_for_cache_not_full
);
11291 waiting_for_cache_not_full
.clear();
11293 objects_blocked_on_cache_full
.clear();
11295 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
11296 in_progress_async_reads
.begin();
11297 i
!= in_progress_async_reads
.end();
11298 in_progress_async_reads
.erase(i
++)) {
11299 close_op_ctx(i
->second
);
11301 requeue_op(i
->first
);
11304 // this will requeue ops we were working on but didn't finish, and
11306 apply_and_flush_repops(is_primary());
11307 cancel_log_updates();
11309 // do this *after* apply_and_flush_repops so that we catch any newly
11310 // registered watches.
11311 context_registry_on_change();
11313 pgbackend
->on_change_cleanup(t
);
11314 scrubber
.cleanup_store(t
);
11315 pgbackend
->on_change();
11317 // clear snap_trimmer state
11318 snap_trimmer_machine
.process_event(Reset());
11320 debug_op_order
.clear();
11321 unstable_stats
.clear();
11323 // we don't want to cache object_contexts through the interval change
11324 // NOTE: we actually assert that all currently live references are dead
11325 // by the time the flush for the next interval completes.
11326 object_contexts
.clear();
11328 // should have been cleared above by finishing all of the degraded objects
11329 assert(objects_blocked_on_degraded_snap
.empty());
11332 void PrimaryLogPG::on_role_change()
11334 dout(10) << "on_role_change" << dendl
;
11335 if (get_role() != 0 && hit_set
) {
11336 dout(10) << " clearing hit set" << dendl
;
11341 void PrimaryLogPG::on_pool_change()
11343 dout(10) << __func__
<< dendl
;
11344 // requeue cache full waiters just in case the cache_mode is
11345 // changing away from writeback mode. note that if we are not
11346 // active the normal requeuing machinery is sufficient (and properly
11349 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
11350 !waiting_for_cache_not_full
.empty()) {
11351 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
11353 requeue_ops(waiting_for_cache_not_full
);
11354 objects_blocked_on_cache_full
.clear();
11360 // clear state. called on recovery completion AND cancellation.
11361 void PrimaryLogPG::_clear_recovery_state()
11363 missing_loc
.clear();
11364 #ifdef DEBUG_RECOVERY_OIDS
11365 recovering_oids
.clear();
11367 last_backfill_started
= hobject_t();
11368 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
11369 while (i
!= backfills_in_flight
.end()) {
11370 assert(recovering
.count(*i
));
11371 backfills_in_flight
.erase(i
++);
11374 list
<OpRequestRef
> blocked_ops
;
11375 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
11376 i
!= recovering
.end();
11377 recovering
.erase(i
++)) {
11379 i
->second
->drop_recovery_read(&blocked_ops
);
11380 requeue_ops(blocked_ops
);
11383 assert(backfills_in_flight
.empty());
11384 pending_backfill_updates
.clear();
11385 assert(recovering
.empty());
11386 pgbackend
->clear_recovery_state();
11389 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
11391 dout(20) << __func__
<< ": " << soid
<< dendl
;
11392 assert(recovering
.count(soid
));
11393 ObjectContextRef obc
= recovering
[soid
];
11395 list
<OpRequestRef
> blocked_ops
;
11396 obc
->drop_recovery_read(&blocked_ops
);
11397 requeue_ops(blocked_ops
);
11399 recovering
.erase(soid
);
11400 finish_recovery_op(soid
);
11401 release_backoffs(soid
);
11402 if (waiting_for_degraded_object
.count(soid
)) {
11403 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
11404 requeue_ops(waiting_for_degraded_object
[soid
]);
11405 waiting_for_degraded_object
.erase(soid
);
11407 if (waiting_for_unreadable_object
.count(soid
)) {
11408 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
11409 requeue_ops(waiting_for_unreadable_object
[soid
]);
11410 waiting_for_unreadable_object
.erase(soid
);
11412 if (is_missing_object(soid
))
11413 pg_log
.set_last_requested(0); // get recover_primary to start over
11414 finish_degraded_object(soid
);
11417 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
11420 * check that any peers we are planning to (or currently) pulling
11421 * objects from are dealt with.
11423 missing_loc
.check_recovery_sources(osdmap
);
11424 pgbackend
->check_recovery_sources(osdmap
);
11426 for (set
<pg_shard_t
>::iterator i
= peer_log_requested
.begin();
11427 i
!= peer_log_requested
.end();
11429 if (!osdmap
->is_up(i
->osd
)) {
11430 dout(10) << "peer_log_requested removing " << *i
<< dendl
;
11431 peer_log_requested
.erase(i
++);
11437 for (set
<pg_shard_t
>::iterator i
= peer_missing_requested
.begin();
11438 i
!= peer_missing_requested
.end();
11440 if (!osdmap
->is_up(i
->osd
)) {
11441 dout(10) << "peer_missing_requested removing " << *i
<< dendl
;
11442 peer_missing_requested
.erase(i
++);
11449 void PG::MissingLoc::check_recovery_sources(const OSDMapRef
& osdmap
)
11451 set
<pg_shard_t
> now_down
;
11452 for (set
<pg_shard_t
>::iterator p
= missing_loc_sources
.begin();
11453 p
!= missing_loc_sources
.end();
11455 if (osdmap
->is_up(p
->osd
)) {
11459 ldout(pg
->cct
, 10) << "check_recovery_sources source osd." << *p
<< " now down" << dendl
;
11460 now_down
.insert(*p
);
11461 missing_loc_sources
.erase(p
++);
11464 if (now_down
.empty()) {
11465 ldout(pg
->cct
, 10) << "check_recovery_sources no source osds (" << missing_loc_sources
<< ") went down" << dendl
;
11467 ldout(pg
->cct
, 10) << "check_recovery_sources sources osds " << now_down
<< " now down, remaining sources are "
11468 << missing_loc_sources
<< dendl
;
11470 // filter missing_loc
11471 map
<hobject_t
, set
<pg_shard_t
>>::iterator p
= missing_loc
.begin();
11472 while (p
!= missing_loc
.end()) {
11473 set
<pg_shard_t
>::iterator q
= p
->second
.begin();
11474 while (q
!= p
->second
.end())
11475 if (now_down
.count(*q
)) {
11476 p
->second
.erase(q
++);
11480 if (p
->second
.empty())
11481 missing_loc
.erase(p
++);
11489 bool PrimaryLogPG::start_recovery_ops(
11491 ThreadPool::TPHandle
&handle
,
11492 uint64_t *ops_started
)
11494 uint64_t& started
= *ops_started
;
11496 bool work_in_progress
= false;
11497 assert(is_primary());
11499 if (!state_test(PG_STATE_RECOVERING
) &&
11500 !state_test(PG_STATE_BACKFILLING
)) {
11501 /* TODO: I think this case is broken and will make do_recovery()
11502 * unhappy since we're returning false */
11503 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
11507 const auto &missing
= pg_log
.get_missing();
11509 unsigned int num_missing
= missing
.num_missing();
11510 uint64_t num_unfound
= get_num_unfound();
11512 if (num_missing
== 0) {
11513 info
.last_complete
= info
.last_update
;
11516 if (num_missing
== num_unfound
) {
11517 // All of the missing objects we have are unfound.
11518 // Recover the replicas.
11519 started
= recover_replicas(max
, handle
);
11522 // We still have missing objects that we should grab from replicas.
11523 started
+= recover_primary(max
, handle
);
11525 if (!started
&& num_unfound
!= get_num_unfound()) {
11526 // second chance to recovery replicas
11527 started
= recover_replicas(max
, handle
);
11531 work_in_progress
= true;
11533 bool deferred_backfill
= false;
11534 if (recovering
.empty() &&
11535 state_test(PG_STATE_BACKFILLING
) &&
11536 !backfill_targets
.empty() && started
< max
&&
11537 missing
.num_missing() == 0 &&
11538 waiting_on_backfill
.empty()) {
11539 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
11540 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
11541 deferred_backfill
= true;
11542 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
11544 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
11545 deferred_backfill
= true;
11546 } else if (!backfill_reserved
) {
11547 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
11548 if (!backfill_reserving
) {
11549 dout(10) << "queueing RequestBackfill" << dendl
;
11550 backfill_reserving
= true;
11551 queue_peering_event(
11553 std::make_shared
<CephPeeringEvt
>(
11554 get_osdmap()->get_epoch(),
11555 get_osdmap()->get_epoch(),
11556 RequestBackfill())));
11558 deferred_backfill
= true;
11560 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
11564 dout(10) << " started " << started
<< dendl
;
11565 osd
->logger
->inc(l_osd_rop
, started
);
11567 if (!recovering
.empty() ||
11568 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
11569 return work_in_progress
;
11571 assert(recovering
.empty());
11572 assert(recovery_ops_active
== 0);
11574 dout(10) << __func__
<< " needs_recovery: "
11575 << missing_loc
.get_needs_recovery()
11577 dout(10) << __func__
<< " missing_loc: "
11578 << missing_loc
.get_missing_locs()
11580 int unfound
= get_num_unfound();
11582 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
11583 return work_in_progress
;
11586 if (missing
.num_missing() > 0) {
11587 // this shouldn't happen!
11588 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
11589 << missing
.num_missing() << ": " << missing
.get_items();
11590 return work_in_progress
;
11593 if (needs_recovery()) {
11594 // this shouldn't happen!
11595 // We already checked num_missing() so we must have missing replicas
11596 osd
->clog
->error() << info
.pgid
11597 << " Unexpected Error: recovery ending with missing replicas";
11598 return work_in_progress
;
11601 if (state_test(PG_STATE_RECOVERING
)) {
11602 state_clear(PG_STATE_RECOVERING
);
11603 state_clear(PG_STATE_FORCED_RECOVERY
);
11604 if (needs_backfill()) {
11605 dout(10) << "recovery done, queuing backfill" << dendl
;
11606 queue_peering_event(
11608 std::make_shared
<CephPeeringEvt
>(
11609 get_osdmap()->get_epoch(),
11610 get_osdmap()->get_epoch(),
11611 RequestBackfill())));
11613 dout(10) << "recovery done, no backfill" << dendl
;
11614 eio_errors_to_process
= false;
11615 state_clear(PG_STATE_FORCED_BACKFILL
);
11616 queue_peering_event(
11618 std::make_shared
<CephPeeringEvt
>(
11619 get_osdmap()->get_epoch(),
11620 get_osdmap()->get_epoch(),
11621 AllReplicasRecovered())));
11623 } else { // backfilling
11624 state_clear(PG_STATE_BACKFILLING
);
11625 state_clear(PG_STATE_FORCED_BACKFILL
);
11626 state_clear(PG_STATE_FORCED_RECOVERY
);
11627 dout(10) << "recovery done, backfill done" << dendl
;
11628 eio_errors_to_process
= false;
11629 queue_peering_event(
11631 std::make_shared
<CephPeeringEvt
>(
11632 get_osdmap()->get_epoch(),
11633 get_osdmap()->get_epoch(),
11641 * do one recovery op.
11642 * return true if done, false if nothing left to do.
11644 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
11646 assert(is_primary());
11648 const auto &missing
= pg_log
.get_missing();
11650 dout(10) << "recover_primary recovering " << recovering
.size()
11651 << " in pg" << dendl
;
11652 dout(10) << "recover_primary " << missing
<< dendl
;
11653 dout(25) << "recover_primary " << missing
.get_items() << dendl
;
11656 pg_log_entry_t
*latest
= 0;
11657 unsigned started
= 0;
11660 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11661 map
<version_t
, hobject_t
>::const_iterator p
=
11662 missing
.get_rmissing().lower_bound(pg_log
.get_log().last_requested
);
11663 while (p
!= missing
.get_rmissing().end()) {
11664 handle
.reset_tp_timeout();
11666 version_t v
= p
->first
;
11668 if (pg_log
.get_log().objects
.count(p
->second
)) {
11669 latest
= pg_log
.get_log().objects
.find(p
->second
)->second
;
11670 assert(latest
->is_update() || latest
->is_delete());
11671 soid
= latest
->soid
;
11676 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
11679 hobject_t head
= soid
.get_head();
11681 eversion_t need
= item
.need
;
11683 dout(10) << "recover_primary "
11684 << soid
<< " " << item
.need
11685 << (missing
.is_missing(soid
) ? " (missing)":"")
11686 << (missing
.is_missing(head
) ? " (missing head)":"")
11687 << (recovering
.count(soid
) ? " (recovering)":"")
11688 << (recovering
.count(head
) ? " (recovering head)":"")
11692 switch (latest
->op
) {
11693 case pg_log_entry_t::CLONE
:
11695 * Handling for this special case removed for now, until we
11696 * can correctly construct an accurate SnapSet from the old
11701 case pg_log_entry_t::LOST_REVERT
:
11703 if (item
.have
== latest
->reverting_to
) {
11704 ObjectContextRef obc
= get_object_context(soid
, true);
11706 if (obc
->obs
.oi
.version
== latest
->version
) {
11707 // I'm already reverting
11708 dout(10) << " already reverting " << soid
<< dendl
;
11710 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
11711 obc
->ondisk_write_lock();
11712 obc
->obs
.oi
.version
= latest
->version
;
11714 ObjectStore::Transaction t
;
11716 obc
->obs
.oi
.encode(
11718 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11719 assert(!pool
.info
.require_rollback());
11720 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
11722 recover_got(soid
, latest
->version
);
11723 missing_loc
.add_location(soid
, pg_whoami
);
11727 osd
->store
->queue_transaction(osr
.get(), std::move(t
),
11728 new C_OSD_AppliedRecoveredObject(this, obc
),
11729 new C_OSD_CommittedPushedObject(
11731 get_osdmap()->get_epoch(),
11732 info
.last_complete
),
11733 new C_OSD_OndiskWriteUnlock(obc
));
11738 * Pull the old version of the object. Update missing_loc here to have the location
11739 * of the version we want.
11741 * This doesn't use the usual missing_loc paths, but that's okay:
11742 * - if we have it locally, we hit the case above, and go from there.
11743 * - if we don't, we always pass through this case during recovery and set up the location
11745 * - this way we don't need to mangle the missing code to be general about needing an old
11748 eversion_t alternate_need
= latest
->reverting_to
;
11749 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
11751 for (map
<pg_shard_t
, pg_missing_t
>::iterator p
= peer_missing
.begin();
11752 p
!= peer_missing
.end();
11754 if (p
->second
.is_missing(soid
, need
) &&
11755 p
->second
.get_items().at(soid
).have
== alternate_need
) {
11756 missing_loc
.add_location(soid
, p
->first
);
11758 dout(10) << " will pull " << alternate_need
<< " or " << need
11759 << " from one of " << missing_loc
.get_locations(soid
)
11767 if (!recovering
.count(soid
)) {
11768 if (recovering
.count(head
)) {
11771 int r
= recover_missing(
11772 soid
, need
, get_recovery_op_priority(), h
);
11785 if (started
>= max
)
11790 // only advance last_requested if we haven't skipped anything
11792 pg_log
.set_last_requested(v
);
11795 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11799 bool PrimaryLogPG::primary_error(
11800 const hobject_t
& soid
, eversion_t v
)
11802 pg_log
.missing_add(soid
, v
, eversion_t());
11803 pg_log
.set_last_requested(0);
11804 missing_loc
.remove_location(soid
, pg_whoami
);
11806 assert(!actingbackfill
.empty());
11807 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
11808 i
!= actingbackfill
.end();
11810 if (*i
== get_primary()) continue;
11811 pg_shard_t peer
= *i
;
11812 if (!peer_missing
[peer
].is_missing(soid
, v
)) {
11813 missing_loc
.add_location(soid
, peer
);
11814 dout(10) << info
.pgid
<< " unexpectedly missing " << soid
<< " v" << v
11815 << ", there should be a copy on shard " << peer
<< dendl
;
11820 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
<< ", unfound";
11822 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
11823 << ", will try copies on " << missing_loc
.get_locations(soid
);
11827 int PrimaryLogPG::prep_object_replica_deletes(
11828 const hobject_t
& soid
, eversion_t v
,
11829 PGBackend::RecoveryHandle
*h
)
11831 assert(is_primary());
11832 dout(10) << __func__
<< ": on " << soid
<< dendl
;
11834 start_recovery_op(soid
);
11835 assert(!recovering
.count(soid
));
11836 recovering
.insert(make_pair(soid
, ObjectContextRef()));
11838 pgbackend
->recover_delete_object(soid
, v
, h
);
11842 int PrimaryLogPG::prep_object_replica_pushes(
11843 const hobject_t
& soid
, eversion_t v
,
11844 PGBackend::RecoveryHandle
*h
)
11846 assert(is_primary());
11847 dout(10) << __func__
<< ": on " << soid
<< dendl
;
11849 // NOTE: we know we will get a valid oloc off of disk here.
11850 ObjectContextRef obc
= get_object_context(soid
, false);
11852 primary_error(soid
, v
);
11856 if (!obc
->get_recovery_read()) {
11857 dout(20) << "recovery delayed on " << soid
11858 << "; could not get rw_manager lock" << dendl
;
11861 dout(20) << "recovery got recovery read lock on " << soid
11865 start_recovery_op(soid
);
11866 assert(!recovering
.count(soid
));
11867 recovering
.insert(make_pair(soid
, obc
));
11869 /* We need this in case there is an in progress write on the object. In fact,
11870 * the only possible write is an update to the xattr due to a lost_revert --
11871 * a client write would be blocked since the object is degraded.
11872 * In almost all cases, therefore, this lock should be uncontended.
11874 obc
->ondisk_read_lock();
11875 int r
= pgbackend
->recover_object(
11878 ObjectContextRef(),
11879 obc
, // has snapset context
11881 obc
->ondisk_read_unlock();
11883 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
11884 primary_failed(soid
);
11885 primary_error(soid
, v
);
11891 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
)
11893 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
11894 uint64_t started
= 0;
11896 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11898 // this is FAR from an optimal recovery order. pretty lame, really.
11899 assert(!actingbackfill
.empty());
11900 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
11901 i
!= actingbackfill
.end();
11903 if (*i
== get_primary()) continue;
11904 pg_shard_t peer
= *i
;
11905 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
11906 assert(pm
!= peer_missing
.end());
11907 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
11908 assert(pi
!= peer_info
.end());
11909 size_t m_sz
= pm
->second
.num_missing();
11911 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
11912 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
11915 const pg_missing_t
&m(pm
->second
);
11916 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
11917 p
!= m
.get_rmissing().end() && started
< max
;
11919 handle
.reset_tp_timeout();
11920 const hobject_t
soid(p
->second
);
11922 if (missing_loc
.is_unfound(soid
)) {
11923 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
11927 if (soid
> pi
->second
.last_backfill
) {
11928 if (!recovering
.count(soid
)) {
11929 derr
<< __func__
<< ": object " << soid
<< " last_backfill " << pi
->second
.last_backfill
<< dendl
;
11930 derr
<< __func__
<< ": object added to missing set for backfill, but "
11931 << "is not in recovering, error!" << dendl
;
11937 if (recovering
.count(soid
)) {
11938 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
11942 if (missing_loc
.is_deleted(soid
)) {
11943 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
11944 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
11945 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
);
11949 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_head())) {
11950 dout(10) << __func__
<< ": " << soid
.get_head()
11951 << " still missing on primary" << dendl
;
11955 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_snapdir())) {
11956 dout(10) << __func__
<< ": " << soid
.get_snapdir()
11957 << " still missing on primary" << dendl
;
11961 if (pg_log
.get_missing().is_missing(soid
)) {
11962 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
11966 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
11967 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
11968 started
+= prep_object_replica_pushes(soid
, r
->second
.need
,
11973 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11977 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
11979 hobject_t e
= hobject_t::get_max();
11980 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
11981 i
!= backfill_targets
.end();
11983 pg_shard_t peer
= *i
;
11984 map
<pg_shard_t
, BackfillInterval
>::const_iterator iter
=
11985 peer_backfill_info
.find(peer
);
11986 assert(iter
!= peer_backfill_info
.end());
11987 if (iter
->second
.begin
< e
)
11988 e
= iter
->second
.begin
;
11993 bool PrimaryLogPG::all_peer_done() const
11995 // Primary hasn't got any more objects
11996 assert(backfill_info
.empty());
11998 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
11999 i
!= backfill_targets
.end();
12001 pg_shard_t bt
= *i
;
12002 map
<pg_shard_t
, BackfillInterval
>::const_iterator piter
=
12003 peer_backfill_info
.find(bt
);
12004 assert(piter
!= peer_backfill_info
.end());
12005 const BackfillInterval
& pbi
= piter
->second
;
12006 // See if peer has more to process
12007 if (!pbi
.extends_to_end() || !pbi
.empty())
12018 * backfilled: fully pushed to replica or present in replica's missing set (both
12019 * our copy and theirs).
12021 * All objects on a backfill_target in
12022 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12023 * objects have been actually deleted and all logically-valid objects are replicated.
12024 * There may be PG objects in this interval yet to be backfilled.
12026 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12027 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12029 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
12030 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12031 * interval remain on the backfill target.
12033 * For a backfill target, all objects <= peer_info[target].last_backfill
12034 * have been backfilled to target
12036 * There *MAY* be missing/outdated objects between last_backfill_started and
12037 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12038 * io created objects since the last scan. For this reason, we call
12039 * update_range() again before continuing backfill.
12041 uint64_t PrimaryLogPG::recover_backfill(
12043 ThreadPool::TPHandle
&handle
, bool *work_started
)
12045 dout(10) << "recover_backfill (" << max
<< ")"
12046 << " bft=" << backfill_targets
12047 << " last_backfill_started " << last_backfill_started
12048 << (new_backfill
? " new_backfill":"")
12050 assert(!backfill_targets
.empty());
12052 // Initialize from prior backfill state
12053 if (new_backfill
) {
12054 // on_activate() was called prior to getting here
12055 assert(last_backfill_started
== earliest_backfill());
12056 new_backfill
= false;
12058 // initialize BackfillIntervals
12059 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12060 i
!= backfill_targets
.end();
12062 peer_backfill_info
[*i
].reset(peer_info
[*i
].last_backfill
);
12064 backfill_info
.reset(last_backfill_started
);
12066 backfills_in_flight
.clear();
12067 pending_backfill_updates
.clear();
12070 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12071 i
!= backfill_targets
.end();
12073 dout(10) << "peer osd." << *i
12074 << " info " << peer_info
[*i
]
12075 << " interval " << peer_backfill_info
[*i
].begin
12076 << "-" << peer_backfill_info
[*i
].end
12077 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
12081 // update our local interval to cope with recent changes
12082 backfill_info
.begin
= last_backfill_started
;
12083 update_range(&backfill_info
, handle
);
12086 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
12087 set
<hobject_t
> add_to_stat
;
12089 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12090 i
!= backfill_targets
.end();
12092 peer_backfill_info
[*i
].trim_to(
12093 std::max(peer_info
[*i
].last_backfill
, last_backfill_started
));
12095 backfill_info
.trim_to(last_backfill_started
);
12097 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
12098 while (ops
< max
) {
12099 if (backfill_info
.begin
<= earliest_peer_backfill() &&
12100 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
12101 hobject_t next
= backfill_info
.end
;
12102 backfill_info
.reset(next
);
12103 backfill_info
.end
= hobject_t::get_max();
12104 update_range(&backfill_info
, handle
);
12105 backfill_info
.trim();
12108 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
12110 bool sent_scan
= false;
12111 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12112 i
!= backfill_targets
.end();
12114 pg_shard_t bt
= *i
;
12115 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
12117 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
12118 if (pbi
.begin
<= backfill_info
.begin
&&
12119 !pbi
.extends_to_end() && pbi
.empty()) {
12120 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
12121 epoch_t e
= get_osdmap()->get_epoch();
12122 MOSDPGScan
*m
= new MOSDPGScan(
12123 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, last_peering_reset
,
12124 spg_t(info
.pgid
.pgid
, bt
.shard
),
12125 pbi
.end
, hobject_t());
12126 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap()->get_epoch());
12127 assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
12128 waiting_on_backfill
.insert(bt
);
12133 // Count simultaneous scans as a single op and let those complete
12136 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12140 if (backfill_info
.empty() && all_peer_done()) {
12141 dout(10) << " reached end for both local and all peers" << dendl
;
12145 // Get object within set of peers to operate on and
12146 // the set of targets for which that object applies.
12147 hobject_t check
= earliest_peer_backfill();
12149 if (check
< backfill_info
.begin
) {
12151 set
<pg_shard_t
> check_targets
;
12152 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12153 i
!= backfill_targets
.end();
12155 pg_shard_t bt
= *i
;
12156 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
12157 if (pbi
.begin
== check
)
12158 check_targets
.insert(bt
);
12160 assert(!check_targets
.empty());
12162 dout(20) << " BACKFILL removing " << check
12163 << " from peers " << check_targets
<< dendl
;
12164 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
12165 i
!= check_targets
.end();
12167 pg_shard_t bt
= *i
;
12168 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
12169 assert(pbi
.begin
== check
);
12171 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
12175 /* This requires a bit of explanation. We compare head against
12176 * last_backfill to determine whether to send an operation
12177 * to the replica. A single write operation can touch up to three
12178 * objects: head, the snapdir, and a new clone which sorts closer to
12179 * head than any existing clone. If last_backfill points at a clone,
12180 * the transaction won't be sent and all 3 must lie on the right side
12181 * of the line (i.e., we'll backfill them later). If last_backfill
12182 * points at snapdir, it sorts greater than head, so we send the
12183 * transaction which is correct because all three must lie to the left
12186 * If it points at head, we have a bit of an issue. If head actually
12187 * exists, no problem, because any transaction which touches snapdir
12188 * must end up creating it (and deleting head), so sending the
12189 * operation won't pose a problem -- we'll end up having to scan it,
12190 * but it'll end up being the right version so we won't bother to
12191 * rebackfill it. However, if head doesn't exist, any write on head
12192 * will remove snapdir. For a replicated pool, this isn't a problem,
12193 * ENOENT on remove isn't an issue and it's in backfill future anyway.
12194 * It only poses a problem for EC pools, because we never just delete
12195 * an object, we rename it into a rollback object. That operation
12196 * will end up crashing the osd with ENOENT. Tolerating the failure
12197 * wouldn't work either, even if snapdir exists, we'd be creating a
12198 * rollback object past the last_backfill line which wouldn't get
12199 * cleaned up (no rollback objects past the last_backfill line is an
12200 * existing important invariant). Thus, let's avoid the whole issue
12201 * by just not updating last_backfill_started here if head doesn't
12202 * exist and snapdir does. We aren't using up a recovery count here,
12203 * so we're going to recover snapdir immediately anyway. We'll only
12204 * fail "backward" if we fail to get the rw lock and that just means
12205 * we'll re-process this section of the hash space again.
12207 * I'm choosing this hack here because the really "correct" answer is
12208 * going to be to unify snapdir and head into a single object (a
12209 * snapdir is really just a confusing way to talk about head existing
12210 * as a whiteout), but doing that is going to be a somewhat larger
12213 * @see http://tracker.ceph.com/issues/17668
12215 if (!(check
.is_head() &&
12216 backfill_info
.begin
.is_snapdir() &&
12217 check
== backfill_info
.begin
.get_head()))
12218 last_backfill_started
= check
;
12220 // Don't increment ops here because deletions
12221 // are cheap and not replied to unlike real recovery_ops,
12222 // and we can't increment ops without requeueing ourself
12225 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
12227 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
12228 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12229 i
!= backfill_targets
.end();
12231 pg_shard_t bt
= *i
;
12232 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
12233 // Find all check peers that have the wrong version
12234 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
12235 if (pbi
.objects
.begin()->second
!= obj_v
) {
12236 need_ver_targs
.push_back(bt
);
12238 keep_ver_targs
.push_back(bt
);
12241 pg_info_t
& pinfo
= peer_info
[bt
];
12243 // Only include peers that we've caught up to their backfill line
12244 // otherwise, they only appear to be missing this object
12245 // because their pbi.begin > backfill_info.begin.
12246 if (backfill_info
.begin
> pinfo
.last_backfill
)
12247 missing_targs
.push_back(bt
);
12249 skip_targs
.push_back(bt
);
12253 if (!keep_ver_targs
.empty()) {
12254 // These peers have version obj_v
12255 dout(20) << " BACKFILL keeping " << check
12256 << " with ver " << obj_v
12257 << " on peers " << keep_ver_targs
<< dendl
;
12258 //assert(!waiting_for_degraded_object.count(check));
12260 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
12261 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
12263 if (obc
->get_recovery_read()) {
12264 if (!need_ver_targs
.empty()) {
12265 dout(20) << " BACKFILL replacing " << check
12266 << " with ver " << obj_v
12267 << " to peers " << need_ver_targs
<< dendl
;
12269 if (!missing_targs
.empty()) {
12270 dout(20) << " BACKFILL pushing " << backfill_info
.begin
12271 << " with ver " << obj_v
12272 << " to peers " << missing_targs
<< dendl
;
12274 vector
<pg_shard_t
> all_push
= need_ver_targs
;
12275 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
12277 handle
.reset_tp_timeout();
12278 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
12280 *work_started
= true;
12281 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
12286 *work_started
= true;
12287 dout(20) << "backfill blocking on " << backfill_info
.begin
12288 << "; could not get rw_manager lock" << dendl
;
12292 dout(20) << "need_ver_targs=" << need_ver_targs
12293 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
12294 dout(20) << "backfill_targets=" << backfill_targets
12295 << " missing_targs=" << missing_targs
12296 << " skip_targs=" << skip_targs
<< dendl
;
12298 last_backfill_started
= backfill_info
.begin
;
12299 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
12300 backfill_info
.pop_front();
12301 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
12302 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
12303 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
12304 i
!= check_targets
.end();
12306 pg_shard_t bt
= *i
;
12307 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
12313 hobject_t backfill_pos
=
12314 std::min(backfill_info
.begin
, earliest_peer_backfill());
12316 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
12317 i
!= add_to_stat
.end();
12319 ObjectContextRef obc
= get_object_context(*i
, false);
12322 add_object_context_to_pg_stat(obc
, &stat
);
12323 pending_backfill_updates
[*i
] = stat
;
12325 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS
)) {
12326 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
12327 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
12328 handle
.reset_tp_timeout();
12329 const hobject_t
& oid
= to_remove
[i
].get
<0>();
12330 eversion_t v
= to_remove
[i
].get
<1>();
12331 pg_shard_t peer
= to_remove
[i
].get
<2>();
12332 MOSDPGBackfillRemove
*m
;
12333 auto it
= reqs
.find(peer
);
12334 if (it
!= reqs
.end()) {
12337 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
12338 spg_t(info
.pgid
.pgid
, peer
.shard
),
12339 get_osdmap()->get_epoch());
12341 m
->ls
.push_back(make_pair(oid
, v
));
12343 if (oid
<= last_backfill_started
)
12344 pending_backfill_updates
[oid
]; // add empty stat!
12346 for (auto p
: reqs
) {
12347 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
12348 get_osdmap()->get_epoch());
12351 // for jewel targets
12352 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
12353 handle
.reset_tp_timeout();
12355 // ordered before any subsequent updates
12356 send_remove_op(to_remove
[i
].get
<0>(), to_remove
[i
].get
<1>(),
12357 to_remove
[i
].get
<2>());
12359 if (to_remove
[i
].get
<0>() <= last_backfill_started
)
12360 pending_backfill_updates
[to_remove
[i
].get
<0>()]; // add empty stat!
12364 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
12366 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
12367 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
12368 i
!= backfills_in_flight
.end();
12370 dout(20) << *i
<< " is still in flight" << dendl
;
12373 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
12374 backfill_pos
: *(backfills_in_flight
.begin());
12375 hobject_t new_last_backfill
= earliest_backfill();
12376 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
12377 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
12378 pending_backfill_updates
.begin();
12379 i
!= pending_backfill_updates
.end() &&
12380 i
->first
< next_backfill_to_complete
;
12381 pending_backfill_updates
.erase(i
++)) {
12382 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
12383 assert(i
->first
> new_last_backfill
);
12384 for (set
<pg_shard_t
>::iterator j
= backfill_targets
.begin();
12385 j
!= backfill_targets
.end();
12387 pg_shard_t bt
= *j
;
12388 pg_info_t
& pinfo
= peer_info
[bt
];
12389 //Add stats to all peers that were missing object
12390 if (i
->first
> pinfo
.last_backfill
)
12391 pinfo
.stats
.add(i
->second
);
12393 new_last_backfill
= i
->first
;
12395 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
12397 assert(!pending_backfill_updates
.empty() ||
12398 new_last_backfill
== last_backfill_started
);
12399 if (pending_backfill_updates
.empty() &&
12400 backfill_pos
.is_max()) {
12401 assert(backfills_in_flight
.empty());
12402 new_last_backfill
= backfill_pos
;
12403 last_backfill_started
= backfill_pos
;
12405 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
12407 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12408 // all the backfill targets. Otherwise, we will move last_backfill up on
12409 // those targets need it and send OP_BACKFILL_PROGRESS to them.
12410 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
12411 i
!= backfill_targets
.end();
12413 pg_shard_t bt
= *i
;
12414 pg_info_t
& pinfo
= peer_info
[bt
];
12416 if (new_last_backfill
> pinfo
.last_backfill
) {
12417 pinfo
.set_last_backfill(new_last_backfill
);
12418 epoch_t e
= get_osdmap()->get_epoch();
12419 MOSDPGBackfill
*m
= NULL
;
12420 if (pinfo
.last_backfill
.is_max()) {
12421 m
= new MOSDPGBackfill(
12422 MOSDPGBackfill::OP_BACKFILL_FINISH
,
12424 last_peering_reset
,
12425 spg_t(info
.pgid
.pgid
, bt
.shard
));
12426 // Use default priority here, must match sub_op priority
12427 /* pinfo.stats might be wrong if we did log-based recovery on the
12428 * backfilled portion in addition to continuing backfill.
12430 pinfo
.stats
= info
.stats
;
12431 start_recovery_op(hobject_t::get_max());
12433 m
= new MOSDPGBackfill(
12434 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
12436 last_peering_reset
,
12437 spg_t(info
.pgid
.pgid
, bt
.shard
));
12438 // Use default priority here, must match sub_op priority
12440 m
->last_backfill
= pinfo
.last_backfill
;
12441 m
->stats
= pinfo
.stats
;
12442 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap()->get_epoch());
12443 dout(10) << " peer " << bt
12444 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
12445 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
12450 *work_started
= true;
12454 int PrimaryLogPG::prep_backfill_object_push(
12455 hobject_t oid
, eversion_t v
,
12456 ObjectContextRef obc
,
12457 vector
<pg_shard_t
> peers
,
12458 PGBackend::RecoveryHandle
*h
)
12460 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
12461 assert(!peers
.empty());
12463 backfills_in_flight
.insert(oid
);
12464 for (unsigned int i
= 0 ; i
< peers
.size(); ++i
) {
12465 map
<pg_shard_t
, pg_missing_t
>::iterator bpm
= peer_missing
.find(peers
[i
]);
12466 assert(bpm
!= peer_missing
.end());
12467 bpm
->second
.add(oid
, eversion_t(), eversion_t(), false);
12470 assert(!recovering
.count(oid
));
12472 start_recovery_op(oid
);
12473 recovering
.insert(make_pair(oid
, obc
));
12475 // We need to take the read_lock here in order to flush in-progress writes
12476 obc
->ondisk_read_lock();
12477 int r
= pgbackend
->recover_object(
12480 ObjectContextRef(),
12483 obc
->ondisk_read_unlock();
12485 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
12486 primary_failed(oid
);
12487 primary_error(oid
, v
);
12488 backfills_in_flight
.erase(oid
);
12489 missing_loc
.add_missing(oid
, v
, eversion_t());
12494 void PrimaryLogPG::update_range(
12495 BackfillInterval
*bi
,
12496 ThreadPool::TPHandle
&handle
)
12498 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
12499 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
12501 if (bi
->version
< info
.log_tail
) {
12502 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
12505 if (last_update_applied
>= info
.log_tail
) {
12506 bi
->version
= last_update_applied
;
12508 bi
->version
= info
.last_update
;
12510 scan_range(local_min
, local_max
, bi
, handle
);
12513 if (bi
->version
>= projected_last_update
) {
12514 dout(10) << __func__
<< ": bi is current " << dendl
;
12515 assert(bi
->version
== projected_last_update
);
12516 } else if (bi
->version
>= info
.log_tail
) {
12517 if (pg_log
.get_log().empty() && projected_log
.empty()) {
12518 /* Because we don't move log_tail on split, the log might be
12519 * empty even if log_tail != last_update. However, the only
12520 * way to get here with an empty log is if log_tail is actually
12521 * eversion_t(), because otherwise the entry which changed
12522 * last_update since the last scan would have to be present.
12524 assert(bi
->version
== eversion_t());
12528 dout(10) << __func__
<< ": bi is old, (" << bi
->version
12529 << ") can be updated with log to projected_last_update "
12530 << projected_last_update
<< dendl
;
12532 auto func
= [&](const pg_log_entry_t
&e
) {
12533 dout(10) << __func__
<< ": updating from version " << e
.version
12535 const hobject_t
&soid
= e
.soid
;
12536 if (soid
>= bi
->begin
&&
12538 if (e
.is_update()) {
12539 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
12540 << e
.version
<< dendl
;
12541 bi
->objects
.erase(e
.soid
);
12542 bi
->objects
.insert(
12546 } else if (e
.is_delete()) {
12547 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
12548 bi
->objects
.erase(e
.soid
);
12552 dout(10) << "scanning pg log first" << dendl
;
12553 pg_log
.get_log().scan_log_after(bi
->version
, func
);
12554 dout(10) << "scanning projected log" << dendl
;
12555 projected_log
.scan_log_after(bi
->version
, func
);
12556 bi
->version
= projected_last_update
;
12558 assert(0 == "scan_range should have raised bi->version past log_tail");
12562 void PrimaryLogPG::scan_range(
12563 int min
, int max
, BackfillInterval
*bi
,
12564 ThreadPool::TPHandle
&handle
)
12566 assert(is_locked());
12567 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
12568 bi
->clear_objects();
12570 vector
<hobject_t
> ls
;
12572 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
12574 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
12575 dout(20) << ls
<< dendl
;
12577 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
12578 handle
.reset_tp_timeout();
12579 ObjectContextRef obc
;
12581 obc
= object_contexts
.lookup(*p
);
12583 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
12584 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
12587 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
12589 /* If the object does not exist here, it must have been removed
12590 * between the collection_list_partial and here. This can happen
12591 * for the first item in the range, which is usually last_backfill.
12597 object_info_t
oi(bl
);
12598 bi
->objects
[*p
] = oi
.version
;
12599 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
12607 * verifies that stray objects have been deleted
12609 void PrimaryLogPG::check_local()
12611 dout(10) << __func__
<< dendl
;
12613 assert(info
.last_update
>= pg_log
.get_tail()); // otherwise we need some help!
12615 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
12618 // just scan the log.
12619 set
<hobject_t
> did
;
12620 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
12621 p
!= pg_log
.get_log().log
.rend();
12623 if (did
.count(p
->soid
))
12625 did
.insert(p
->soid
);
12627 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
12628 dout(10) << " checking " << p
->soid
12629 << " at " << p
->version
<< dendl
;
12631 int r
= osd
->store
->stat(
12633 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
12635 if (r
!= -ENOENT
) {
12636 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
12637 << "deleted" << dendl
;
12638 assert(0 == "erroneously present object");
12641 // ignore old(+missing) objects
12648 // ===========================
12651 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
12654 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
12655 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
12656 info
.pgid
.ps(), info
.pgid
.pool(),
12657 cct
->_conf
->osd_hit_set_namespace
);
12658 dout(20) << __func__
<< " " << hoid
<< dendl
;
12662 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
12667 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
12669 start
.gmtime(ss
) << "_";
12672 start
.localtime(ss
) << "_";
12675 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
12676 info
.pgid
.ps(), info
.pgid
.pool(),
12677 cct
->_conf
->osd_hit_set_namespace
);
12678 dout(20) << __func__
<< " " << hoid
<< dendl
;
12682 void PrimaryLogPG::hit_set_clear()
12684 dout(20) << __func__
<< dendl
;
12686 hit_set_start_stamp
= utime_t();
12689 void PrimaryLogPG::hit_set_setup()
12691 if (!is_active() ||
12697 if (is_active() && is_primary() &&
12698 (!pool
.info
.hit_set_count
||
12699 !pool
.info
.hit_set_period
||
12700 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
12703 // only primary is allowed to remove all the hit set objects
12704 hit_set_remove_all();
12708 // FIXME: discard any previous data for now
12711 // include any writes we know about from the pg log. this doesn't
12712 // capture reads, but it is better than nothing!
12713 hit_set_apply_log();
12716 void PrimaryLogPG::hit_set_remove_all()
12718 // If any archives are degraded we skip this
12719 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12720 p
!= info
.hit_set
.history
.end();
12722 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12724 // Once we hit a degraded object just skip
12725 if (is_degraded_or_backfilling_object(aoid
))
12727 if (write_blocked_by_scrub(aoid
))
12731 if (!info
.hit_set
.history
.empty()) {
12732 list
<pg_hit_set_info_t
>::reverse_iterator p
= info
.hit_set
.history
.rbegin();
12733 assert(p
!= info
.hit_set
.history
.rend());
12734 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12735 assert(!is_degraded_or_backfilling_object(oid
));
12736 ObjectContextRef obc
= get_object_context(oid
, false);
12739 OpContextUPtr ctx
= simple_opc_create(obc
);
12740 ctx
->at_version
= get_next_version();
12741 ctx
->updated_hset_history
= info
.hit_set
;
12742 utime_t now
= ceph_clock_now();
12744 hit_set_trim(ctx
, 0);
12745 simple_opc_submit(std::move(ctx
));
12748 info
.hit_set
= pg_hit_set_history_t();
12750 agent_state
->discard_hit_sets();
12754 void PrimaryLogPG::hit_set_create()
12756 utime_t now
= ceph_clock_now();
12757 // make a copy of the params to modify
12758 HitSet::Params
params(pool
.info
.hit_set_params
);
12760 dout(20) << __func__
<< " " << params
<< dendl
;
12761 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
12762 BloomHitSet::Params
*p
=
12763 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
12765 // convert false positive rate so it holds up across the full period
12766 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
12767 if (p
->get_fpp() <= 0.0)
12768 p
->set_fpp(.01); // fpp cannot be zero!
12770 // if we don't have specified size, estimate target size based on the
12772 if (p
->target_size
== 0 && hit_set
) {
12773 utime_t dur
= now
- hit_set_start_stamp
;
12774 unsigned unique
= hit_set
->approx_unique_insert_count();
12775 dout(20) << __func__
<< " previous set had approx " << unique
12776 << " unique items over " << dur
<< " seconds" << dendl
;
12777 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
12780 if (p
->target_size
<
12781 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
12782 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
12785 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
12786 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
12788 p
->seed
= now
.sec();
12790 dout(10) << __func__
<< " target_size " << p
->target_size
12791 << " fpp " << p
->get_fpp() << dendl
;
12793 hit_set
.reset(new HitSet(params
));
12794 hit_set_start_stamp
= now
;
12798 * apply log entries to set
12800 * this would only happen after peering, to at least capture writes
12801 * during an interval that was potentially lost.
12803 bool PrimaryLogPG::hit_set_apply_log()
12808 eversion_t to
= info
.last_update
;
12809 eversion_t from
= info
.hit_set
.current_last_update
;
12811 dout(20) << __func__
<< " no update" << dendl
;
12815 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
12816 list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
12817 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> to
)
12819 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> from
) {
12820 hit_set
->insert(p
->soid
);
12827 void PrimaryLogPG::hit_set_persist()
12829 dout(10) << __func__
<< dendl
;
12831 unsigned max
= pool
.info
.hit_set_count
;
12833 utime_t now
= ceph_clock_now();
12836 // If any archives are degraded we skip this persist request
12837 // account for the additional entry being added below
12838 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12839 p
!= info
.hit_set
.history
.end();
12841 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12843 // Once we hit a degraded object just skip further trim
12844 if (is_degraded_or_backfilling_object(aoid
))
12846 if (write_blocked_by_scrub(aoid
))
12850 // If backfill is in progress and we could possibly overlap with the
12851 // hit_set_* objects, back off. Since these all have
12852 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12853 // look just at that. This is necessary because our transactions
12854 // may include a modify of the new hit_set *and* a delete of the
12855 // old one, and this may span the backfill boundary.
12856 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
12857 p
!= backfill_targets
.end();
12859 assert(peer_info
.count(*p
));
12860 const pg_info_t
& pi
= peer_info
[*p
];
12861 if (pi
.last_backfill
== hobject_t() ||
12862 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
12863 dout(10) << __func__
<< " backfill target osd." << *p
12864 << " last_backfill has not progressed past pgid ps"
12871 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
12872 new_hset
.begin
= hit_set_start_stamp
;
12873 new_hset
.end
= now
;
12874 oid
= get_hit_set_archive_object(
12877 new_hset
.using_gmt
);
12879 // If the current object is degraded we skip this persist request
12880 if (write_blocked_by_scrub(oid
))
12884 ::encode(*hit_set
, bl
);
12885 dout(20) << __func__
<< " archive " << oid
<< dendl
;
12888 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
12889 uint32_t size
= agent_state
->hit_set_map
.size();
12890 if (size
>= pool
.info
.hit_set_count
) {
12891 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
12893 hit_set_in_memory_trim(size
);
12896 ObjectContextRef obc
= get_object_context(oid
, true);
12897 OpContextUPtr ctx
= simple_opc_create(obc
);
12899 ctx
->at_version
= get_next_version();
12900 ctx
->updated_hset_history
= info
.hit_set
;
12901 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
12903 updated_hit_set_hist
.current_last_update
= info
.last_update
;
12904 new_hset
.version
= ctx
->at_version
;
12906 updated_hit_set_hist
.history
.push_back(new_hset
);
12909 // fabricate an object_info_t and SnapSet
12910 obc
->obs
.oi
.version
= ctx
->at_version
;
12911 obc
->obs
.oi
.mtime
= now
;
12912 obc
->obs
.oi
.size
= bl
.length();
12913 obc
->obs
.exists
= true;
12914 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
12916 ctx
->new_obs
= obc
->obs
;
12918 obc
->ssc
->snapset
.head_exists
= true;
12919 ctx
->new_snapset
= obc
->ssc
->snapset
;
12921 ctx
->delta_stats
.num_objects
++;
12922 ctx
->delta_stats
.num_objects_hit_set_archive
++;
12923 ctx
->delta_stats
.num_bytes
+= bl
.length();
12924 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
12927 ::encode(ctx
->new_snapset
, bss
);
12928 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
12929 ::encode(ctx
->new_obs
.oi
, boi
,
12930 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
12932 ctx
->op_t
->create(oid
);
12934 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
12936 map
<string
, bufferlist
> attrs
;
12937 attrs
[OI_ATTR
].claim(boi
);
12938 attrs
[SS_ATTR
].claim(bss
);
12939 setattrs_maybe_cache(ctx
->obc
, ctx
.get(), ctx
->op_t
.get(), attrs
);
12940 ctx
->log
.push_back(
12942 pg_log_entry_t::MODIFY
,
12952 hit_set_trim(ctx
, max
);
12954 simple_opc_submit(std::move(ctx
));
12957 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
12959 assert(ctx
->updated_hset_history
);
12960 pg_hit_set_history_t
&updated_hit_set_hist
=
12961 *(ctx
->updated_hset_history
);
12962 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
12963 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
12964 assert(p
!= updated_hit_set_hist
.history
.end());
12965 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12967 assert(!is_degraded_or_backfilling_object(oid
));
12969 dout(20) << __func__
<< " removing " << oid
<< dendl
;
12970 ++ctx
->at_version
.version
;
12971 ctx
->log
.push_back(
12972 pg_log_entry_t(pg_log_entry_t::DELETE
,
12981 ctx
->op_t
->remove(oid
);
12982 updated_hit_set_hist
.history
.pop_front();
12984 ObjectContextRef obc
= get_object_context(oid
, false);
12986 --ctx
->delta_stats
.num_objects
;
12987 --ctx
->delta_stats
.num_objects_hit_set_archive
;
12988 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
12989 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
12993 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
12995 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
12996 agent_state
->remove_oldest_hit_set();
13001 // =======================================
13004 void PrimaryLogPG::agent_setup()
13006 assert(is_locked());
13007 if (!is_active() ||
13009 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
13010 pool
.info
.tier_of
< 0 ||
13011 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
13015 if (!agent_state
) {
13016 agent_state
.reset(new TierAgentState
);
13018 // choose random starting position
13019 agent_state
->position
= hobject_t();
13020 agent_state
->position
.pool
= info
.pgid
.pool();
13021 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
13024 agent_state
->start
= agent_state
->position
;
13026 dout(10) << __func__
<< " allocated new state, position "
13027 << agent_state
->position
<< dendl
;
13029 dout(10) << __func__
<< " keeping existing state" << dendl
;
13032 if (info
.stats
.stats_invalid
) {
13033 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
13036 agent_choose_mode();
13039 void PrimaryLogPG::agent_clear()
13042 agent_state
.reset(NULL
);
13045 // Return false if no objects operated on since start of object hash space
13046 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
13049 if (!agent_state
) {
13050 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
13057 if (agent_state
->is_idle()) {
13058 dout(10) << __func__
<< " idle, stopping" << dendl
;
13063 osd
->logger
->inc(l_osd_agent_wake
);
13065 dout(10) << __func__
13066 << " max " << start_max
13067 << ", flush " << agent_state
->get_flush_mode_name()
13068 << ", evict " << agent_state
->get_evict_mode_name()
13069 << ", pos " << agent_state
->position
13071 assert(is_primary());
13072 assert(is_active());
13074 agent_load_hit_sets();
13076 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
13080 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
13082 // list some objects. this conveniently lists clones (oldest to
13083 // newest) before heads... the same order we want to flush in.
13085 // NOTE: do not flush the Sequencer. we will assume that the
13086 // listing we get back is imprecise.
13087 vector
<hobject_t
> ls
;
13089 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
13092 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
13094 for (vector
<hobject_t
>::iterator p
= ls
.begin();
13097 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
13098 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
13099 osd
->logger
->inc(l_osd_agent_skip
);
13102 if (is_degraded_or_backfilling_object(*p
)) {
13103 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
13104 osd
->logger
->inc(l_osd_agent_skip
);
13107 if (is_missing_object(p
->get_head())) {
13108 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
13109 osd
->logger
->inc(l_osd_agent_skip
);
13112 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
13114 // we didn't flush; we may miss something here.
13115 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
13116 osd
->logger
->inc(l_osd_agent_skip
);
13119 if (!obc
->obs
.exists
) {
13120 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
13121 osd
->logger
->inc(l_osd_agent_skip
);
13124 if (range_intersects_scrub(obc
->obs
.oi
.soid
,
13125 obc
->obs
.oi
.soid
.get_head())) {
13126 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
13127 osd
->logger
->inc(l_osd_agent_skip
);
13130 if (obc
->is_blocked()) {
13131 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
13132 osd
->logger
->inc(l_osd_agent_skip
);
13135 if (obc
->is_request_pending()) {
13136 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
13137 osd
->logger
->inc(l_osd_agent_skip
);
13141 // be careful flushing omap to an EC pool.
13142 if (!base_pool
->supports_omap() &&
13143 obc
->obs
.oi
.is_omap()) {
13144 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
13145 osd
->logger
->inc(l_osd_agent_skip
);
13149 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
13150 agent_maybe_evict(obc
, false))
13152 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
13153 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
13155 --agent_flush_quota
;
13157 if (started
>= start_max
) {
13158 // If finishing early, set "next" to the next object
13159 if (++p
!= ls
.end())
13165 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
13166 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
13167 agent_state
->hist_age
= 0;
13168 agent_state
->temp_hist
.decay();
13171 // Total objects operated on so far
13172 int total_started
= agent_state
->started
+ started
;
13173 bool need_delay
= false;
13175 dout(20) << __func__
<< " start pos " << agent_state
->position
13176 << " next start pos " << next
13177 << " started " << total_started
<< dendl
;
13179 // See if we've made a full pass over the object hash space
13180 // This might check at most ls_max objects a second time to notice that
13181 // we've checked every objects at least once.
13182 if (agent_state
->position
< agent_state
->start
&&
13183 next
>= agent_state
->start
) {
13184 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
13185 if (total_started
== 0)
13189 agent_state
->start
= next
;
13191 agent_state
->started
= total_started
;
13193 // See if we are starting from beginning
13195 agent_state
->position
= hobject_t();
13197 agent_state
->position
= next
;
13199 // Discard old in memory HitSets
13200 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
13203 assert(agent_state
->delaying
== false);
13208 agent_choose_mode();
13213 void PrimaryLogPG::agent_load_hit_sets()
13215 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
13219 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
13220 dout(10) << __func__
<< dendl
;
13221 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
13222 p
!= info
.hit_set
.history
.end(); ++p
) {
13223 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
13224 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
13225 << p
->end
<< dendl
;
13226 if (!pool
.info
.is_replicated()) {
13227 // FIXME: EC not supported here yet
13228 derr
<< __func__
<< " on non-replicated pool" << dendl
;
13232 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
13233 if (is_unreadable_object(oid
)) {
13234 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
13238 ObjectContextRef obc
= get_object_context(oid
, false);
13240 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
13246 obc
->ondisk_read_lock();
13247 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
13249 obc
->ondisk_read_unlock();
13251 HitSetRef
hs(new HitSet
);
13252 bufferlist::iterator pbl
= bl
.begin();
13253 ::decode(*hs
, pbl
);
13254 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
13260 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
13262 if (!obc
->obs
.oi
.is_dirty()) {
13263 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
13264 osd
->logger
->inc(l_osd_agent_skip
);
13267 if (obc
->obs
.oi
.is_cache_pinned()) {
13268 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
13269 osd
->logger
->inc(l_osd_agent_skip
);
13273 utime_t now
= ceph_clock_now();
13274 utime_t ob_local_mtime
;
13275 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
13276 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
13278 ob_local_mtime
= obc
->obs
.oi
.mtime
;
13280 bool evict_mode_full
=
13281 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
13282 if (!evict_mode_full
&&
13283 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
13284 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
13285 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
13286 osd
->logger
->inc(l_osd_agent_skip
);
13290 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
13291 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
13292 osd
->logger
->inc(l_osd_agent_skip
);
13296 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
13298 // FIXME: flush anything dirty, regardless of what distribution of
13301 hobject_t oid
= obc
->obs
.oi
.soid
;
13302 osd
->agent_start_op(oid
);
13303 // no need to capture a pg ref, can't outlive fop or ctx
13304 std::function
<void()> on_flush
= [this, oid
]() {
13305 osd
->agent_finish_op(oid
);
13308 int result
= start_flush(
13309 OpRequestRef(), obc
, false, NULL
,
13311 if (result
!= -EINPROGRESS
) {
13313 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
13314 << " with " << result
<< dendl
;
13315 osd
->logger
->inc(l_osd_agent_skip
);
13319 osd
->logger
->inc(l_osd_agent_flush
);
13323 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
13325 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
13326 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
13327 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
13330 if (!obc
->obs
.oi
.watchers
.empty()) {
13331 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
13334 if (obc
->is_blocked()) {
13335 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
13338 if (obc
->obs
.oi
.is_cache_pinned()) {
13339 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
13343 if (soid
.snap
== CEPH_NOSNAP
) {
13344 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
13346 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
13351 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
13352 // is this object old than cache_min_evict_age?
13353 utime_t now
= ceph_clock_now();
13354 utime_t ob_local_mtime
;
13355 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
13356 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
13358 ob_local_mtime
= obc
->obs
.oi
.mtime
;
13360 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
13361 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
13362 osd
->logger
->inc(l_osd_agent_skip
);
13365 // is this object old and/or cold enough?
13367 uint64_t temp_upper
= 0, temp_lower
= 0;
13369 agent_estimate_temp(soid
, &temp
);
13370 agent_state
->temp_hist
.add(temp
);
13371 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
13373 dout(20) << __func__
13374 << " temp " << temp
13375 << " pos " << temp_lower
<< "-" << temp_upper
13376 << ", evict_effort " << agent_state
->evict_effort
13378 dout(30) << "agent_state:\n";
13379 Formatter
*f
= Formatter::create("");
13380 f
->open_object_section("agent_state");
13381 agent_state
->dump(f
);
13382 f
->close_section();
13387 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
13391 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
13392 OpContextUPtr ctx
= simple_opc_create(obc
);
13394 if (!ctx
->lock_manager
.get_lock_type(
13395 ObjectContext::RWState::RWWRITE
,
13399 close_op_ctx(ctx
.release());
13400 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
13404 osd
->agent_start_evict_op();
13405 ctx
->register_on_finish(
13407 osd
->agent_finish_evict_op();
13410 ctx
->at_version
= get_next_version();
13411 assert(ctx
->new_obs
.exists
);
13412 int r
= _delete_oid(ctx
.get(), true, false);
13413 if (obc
->obs
.oi
.is_omap())
13414 ctx
->delta_stats
.num_objects_omap
--;
13415 ctx
->delta_stats
.num_evict
++;
13416 ctx
->delta_stats
.num_evict_kb
+= SHIFT_ROUND_UP(obc
->obs
.oi
.size
, 10);
13417 if (obc
->obs
.oi
.is_dirty())
13418 --ctx
->delta_stats
.num_objects_dirty
;
13420 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
, false);
13421 simple_opc_submit(std::move(ctx
));
13422 osd
->logger
->inc(l_osd_tier_evict
);
13423 osd
->logger
->inc(l_osd_agent_evict
);
13427 void PrimaryLogPG::agent_stop()
13429 dout(20) << __func__
<< dendl
;
13430 if (agent_state
&& !agent_state
->is_idle()) {
13431 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
13432 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
13433 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
13437 void PrimaryLogPG::agent_delay()
13439 dout(20) << __func__
<< dendl
;
13440 if (agent_state
&& !agent_state
->is_idle()) {
13441 assert(agent_state
->delaying
== false);
13442 agent_state
->delaying
= true;
13443 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
13447 void PrimaryLogPG::agent_choose_mode_restart()
13449 dout(20) << __func__
<< dendl
;
13451 if (agent_state
&& agent_state
->delaying
) {
13452 agent_state
->delaying
= false;
13453 agent_choose_mode(true);
13458 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
13460 bool requeued
= false;
13461 // Let delay play out
13462 if (agent_state
->delaying
) {
13463 dout(20) << __func__
<< this << " delaying, ignored" << dendl
;
13467 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
13468 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
13469 unsigned evict_effort
= 0;
13471 if (info
.stats
.stats_invalid
) {
13472 // idle; stats can't be trusted until we scrub.
13473 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
13478 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
13479 assert(divisor
> 0);
13481 // adjust (effective) user objects down based on the number
13482 // of HitSet objects, which should not count toward our total since
13483 // they cannot be flushed.
13484 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
13486 // also exclude omap objects if ec backing pool
13487 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
13489 if (!base_pool
->supports_omap())
13490 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
13492 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
13493 if (num_user_objects
> unflushable
)
13494 num_user_objects
-= unflushable
;
13496 num_user_objects
= 0;
13498 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
13499 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
13500 num_user_bytes
-= unflushable_bytes
;
13501 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
13502 num_user_bytes
+= num_overhead_bytes
;
13504 // also reduce the num_dirty by num_objects_omap
13505 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
13506 if (!base_pool
->supports_omap()) {
13507 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
13508 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
13513 dout(10) << __func__
13515 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
13517 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
13518 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
13519 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
13520 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
13521 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
13522 << " num_dirty: " << num_dirty
13523 << " num_user_objects: " << num_user_objects
13524 << " num_user_bytes: " << num_user_bytes
13525 << " num_overhead_bytes: " << num_overhead_bytes
13526 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
13527 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
13530 // get dirty, full ratios
13531 uint64_t dirty_micro
= 0;
13532 uint64_t full_micro
= 0;
13533 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
13534 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
13536 num_dirty
* avg_size
* 1000000 /
13537 MAX(pool
.info
.target_max_bytes
/ divisor
, 1);
13539 num_user_objects
* avg_size
* 1000000 /
13540 MAX(pool
.info
.target_max_bytes
/ divisor
, 1);
13542 if (pool
.info
.target_max_objects
> 0) {
13543 uint64_t dirty_objects_micro
=
13544 num_dirty
* 1000000 /
13545 MAX(pool
.info
.target_max_objects
/ divisor
, 1);
13546 if (dirty_objects_micro
> dirty_micro
)
13547 dirty_micro
= dirty_objects_micro
;
13548 uint64_t full_objects_micro
=
13549 num_user_objects
* 1000000 /
13550 MAX(pool
.info
.target_max_objects
/ divisor
, 1);
13551 if (full_objects_micro
> full_micro
)
13552 full_micro
= full_objects_micro
;
13554 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
13555 << " full " << ((float)full_micro
/ 1000000.0)
13559 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
13560 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
13561 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
13562 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
13563 flush_target
+= flush_slop
;
13564 flush_high_target
+= flush_slop
;
13566 flush_target
-= MIN(flush_target
, flush_slop
);
13567 flush_high_target
-= MIN(flush_high_target
, flush_slop
);
13570 if (dirty_micro
> flush_high_target
) {
13571 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
13572 } else if (dirty_micro
> flush_target
) {
13573 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
13577 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
13578 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
13579 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
13580 evict_target
+= evict_slop
;
13582 evict_target
-= MIN(evict_target
, evict_slop
);
13584 if (full_micro
> 1000000) {
13585 // evict anything clean
13586 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
13587 evict_effort
= 1000000;
13588 } else if (full_micro
> evict_target
) {
13589 // set effort in [0..1] range based on where we are between
13590 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
13591 uint64_t over
= full_micro
- evict_target
;
13592 uint64_t span
= 1000000 - evict_target
;
13593 evict_effort
= MAX(over
* 1000000 / span
,
13594 (unsigned)(1000000.0 * cct
->_conf
->osd_agent_min_evict_effort
));
13596 // quantize effort to avoid too much reordering in the agent_queue.
13597 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
13599 uint64_t was
= evict_effort
;
13600 evict_effort
-= evict_effort
% inc
;
13601 if (evict_effort
< inc
)
13602 evict_effort
= inc
;
13603 assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
13604 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
13609 bool old_idle
= agent_state
->is_idle();
13610 if (flush_mode
!= agent_state
->flush_mode
) {
13611 dout(5) << __func__
<< " flush_mode "
13612 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
13614 << TierAgentState::get_flush_mode_name(flush_mode
)
13616 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
13617 osd
->agent_inc_high_count();
13618 info
.stats
.stats
.sum
.num_flush_mode_high
= 1;
13619 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
13620 info
.stats
.stats
.sum
.num_flush_mode_low
= 1;
13622 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
13623 osd
->agent_dec_high_count();
13624 info
.stats
.stats
.sum
.num_flush_mode_high
= 0;
13625 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
13626 info
.stats
.stats
.sum
.num_flush_mode_low
= 0;
13628 agent_state
->flush_mode
= flush_mode
;
13630 if (evict_mode
!= agent_state
->evict_mode
) {
13631 dout(5) << __func__
<< " evict_mode "
13632 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
13634 << TierAgentState::get_evict_mode_name(evict_mode
)
13636 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
13640 requeue_ops(waiting_for_flush
);
13641 requeue_ops(waiting_for_active
);
13642 requeue_ops(waiting_for_scrub
);
13643 requeue_ops(waiting_for_cache_not_full
);
13644 objects_blocked_on_cache_full
.clear();
13647 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
13648 info
.stats
.stats
.sum
.num_evict_mode_some
= 1;
13649 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
13650 info
.stats
.stats
.sum
.num_evict_mode_full
= 1;
13652 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
13653 info
.stats
.stats
.sum
.num_evict_mode_some
= 0;
13654 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
13655 info
.stats
.stats
.sum
.num_evict_mode_full
= 0;
13657 agent_state
->evict_mode
= evict_mode
;
13659 uint64_t old_effort
= agent_state
->evict_effort
;
13660 if (evict_effort
!= agent_state
->evict_effort
) {
13661 dout(5) << __func__
<< " evict_effort "
13662 << ((float)agent_state
->evict_effort
/ 1000000.0)
13664 << ((float)evict_effort
/ 1000000.0)
13666 agent_state
->evict_effort
= evict_effort
;
13669 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13670 // (including flush). This is probably fine (they should be
13671 // correlated) but it is not precisely correct.
13672 if (agent_state
->is_idle()) {
13673 if (!restart
&& !old_idle
) {
13674 osd
->agent_disable_pg(this, old_effort
);
13677 if (restart
|| old_idle
) {
13678 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
13679 } else if (old_effort
!= agent_state
->evict_effort
) {
13680 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
13686 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
13691 if (hit_set
->contains(oid
))
13694 int last_n
= pool
.info
.hit_set_search_last_n
;
13695 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
13696 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
13697 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
13698 if (p
->second
->contains(oid
)) {
13699 *temp
+= pool
.info
.get_grade(i
);
13705 // Dup op detection
13707 bool PrimaryLogPG::already_complete(eversion_t v
)
13709 dout(20) << __func__
<< ": " << v
<< dendl
;
13710 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
13713 dout(20) << __func__
<< ": " << **i
<< dendl
;
13714 // skip copy from temp object ops
13715 if ((*i
)->v
== eversion_t()) {
13716 dout(20) << __func__
<< ": " << **i
13717 << " version is empty" << dendl
;
13721 dout(20) << __func__
<< ": " << **i
13722 << " (*i)->v past v" << dendl
;
13725 if (!(*i
)->all_committed
) {
13726 dout(20) << __func__
<< ": " << **i
13727 << " not committed, returning false"
13732 dout(20) << __func__
<< ": returning true" << dendl
;
13736 bool PrimaryLogPG::already_ack(eversion_t v
)
13738 dout(20) << __func__
<< ": " << v
<< dendl
;
13739 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
13742 // skip copy from temp object ops
13743 if ((*i
)->v
== eversion_t()) {
13744 dout(20) << __func__
<< ": " << **i
13745 << " version is empty" << dendl
;
13749 dout(20) << __func__
<< ": " << **i
13750 << " (*i)->v past v" << dendl
;
13753 if (!(*i
)->all_applied
) {
13754 dout(20) << __func__
<< ": " << **i
13755 << " not applied, returning false"
13760 dout(20) << __func__
<< ": returning true" << dendl
;
13765 // ==========================================================================================
13769 bool PrimaryLogPG::_range_available_for_scrub(
13770 const hobject_t
&begin
, const hobject_t
&end
)
13772 pair
<hobject_t
, ObjectContextRef
> next
;
13773 next
.second
= object_contexts
.lookup(begin
);
13774 next
.first
= begin
;
13776 while (more
&& next
.first
< end
) {
13777 if (next
.second
&& next
.second
->is_blocked()) {
13778 next
.second
->requeue_scrub_on_unblock
= true;
13779 dout(10) << __func__
<< ": scrub delayed, "
13780 << next
.first
<< " is blocked"
13784 more
= object_contexts
.get_next(next
.first
, &next
);
13789 static bool doing_clones(const boost::optional
<SnapSet
> &snapset
,
13790 const vector
<snapid_t
>::reverse_iterator
&curclone
) {
13791 return snapset
&& curclone
!= snapset
.get().clones
.rend();
13794 void PrimaryLogPG::log_missing(unsigned missing
,
13795 const boost::optional
<hobject_t
> &head
,
13796 LogChannelRef clog
,
13800 bool allow_incomplete_clones
)
13803 if (allow_incomplete_clones
) {
13804 dout(20) << func
<< " " << mode
<< " " << pgid
<< " " << head
.get()
13805 << " skipped " << missing
<< " clone(s) in cache tier" << dendl
;
13807 clog
->info() << mode
<< " " << pgid
<< " " << head
.get()
13808 << " " << missing
<< " missing clone(s)";
13812 unsigned PrimaryLogPG::process_clones_to(const boost::optional
<hobject_t
> &head
,
13813 const boost::optional
<SnapSet
> &snapset
,
13814 LogChannelRef clog
,
13817 bool allow_incomplete_clones
,
13818 boost::optional
<snapid_t
> target
,
13819 vector
<snapid_t
>::reverse_iterator
*curclone
,
13820 inconsistent_snapset_wrapper
&e
)
13824 unsigned missing
= 0;
13826 // NOTE: clones are in descending order, thus **curclone > target test here
13827 hobject_t
next_clone(head
.get());
13828 while(doing_clones(snapset
, *curclone
) && (!target
|| **curclone
> *target
)) {
13830 // it is okay to be missing one or more clones in a cache tier.
13831 // skip higher-numbered clones in the list.
13832 if (!allow_incomplete_clones
) {
13833 next_clone
.snap
= **curclone
;
13834 clog
->error() << mode
<< " " << pgid
<< " " << head
.get()
13835 << " expected clone " << next_clone
<< " " << missing
13837 ++scrubber
.shallow_errors
;
13838 e
.set_clone_missing(next_clone
.snap
);
13840 // Clones are descending
13847 * Validate consistency of the object info and snap sets.
13849 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13850 * the comparison of the objects is against multiple snapset.clones. There are
13851 * multiple clone lists and in between lists we expect head or snapdir.
13857 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13858 * obj2 head head/snapdir, head ok
13859 * [SnapSet clones 6 4 2 1]
13860 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13861 * obj2 snap 6 obj2 snap 6, match
13862 * obj2 snap 4 obj2 snap 4, match
13863 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13864 * [Snapset clones 3 1]
13865 * obj3 snap 3 obj3 snap 3 match
13866 * obj3 snap 1 obj3 snap 1 match
13867 * obj4 snapdir head/snapdir, snapdir ok
13868 * [Snapset clones 4]
13869 * EOL obj4 snap 4, (expected)
13871 void PrimaryLogPG::scrub_snapshot_metadata(
13872 ScrubMap
&scrubmap
,
13873 const map
<hobject_t
,
13874 pair
<boost::optional
<uint32_t>,
13875 boost::optional
<uint32_t>>> &missing_digest
)
13877 dout(10) << __func__
<< dendl
;
13879 coll_t
c(info
.pgid
);
13880 bool repair
= state_test(PG_STATE_REPAIR
);
13881 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
13882 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
13883 boost::optional
<snapid_t
> all_clones
; // Unspecified snapid_t or boost::none
13885 /// snapsets to repair
13886 map
<hobject_t
,SnapSet
> snapset_to_repair
;
13888 // traverse in reverse order.
13889 boost::optional
<hobject_t
> head
;
13890 boost::optional
<SnapSet
> snapset
; // If initialized so will head (above)
13891 vector
<snapid_t
>::reverse_iterator curclone
; // Defined only if snapset initialized
13892 unsigned missing
= 0;
13893 inconsistent_snapset_wrapper soid_error
, head_error
;
13894 unsigned soid_error_count
= 0;
13896 bufferlist last_data
;
13898 for (map
<hobject_t
,ScrubMap::object
>::reverse_iterator
13899 p
= scrubmap
.objects
.rbegin(); p
!= scrubmap
.objects
.rend(); ++p
) {
13900 const hobject_t
& soid
= p
->first
;
13901 soid_error
= inconsistent_snapset_wrapper
{soid
};
13902 object_stat_sum_t stat
;
13903 boost::optional
<object_info_t
> oi
;
13905 if (!soid
.is_snapdir())
13906 stat
.num_objects
++;
13908 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
13909 stat
.num_objects_hit_set_archive
++;
13911 if (soid
.is_snap()) {
13913 stat
.num_object_clones
++;
13917 if (p
->second
.attrs
.count(OI_ATTR
) == 0) {
13919 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13920 << " no '" << OI_ATTR
<< "' attr";
13921 ++scrubber
.shallow_errors
;
13922 soid_error
.set_info_missing();
13925 bv
.push_back(p
->second
.attrs
[OI_ATTR
]);
13927 oi
= object_info_t(); // Initialize optional<> before decode into it
13928 oi
.get().decode(bv
);
13929 } catch (buffer::error
& e
) {
13931 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13932 << " can't decode '" << OI_ATTR
<< "' attr " << e
.what();
13933 ++scrubber
.shallow_errors
;
13934 soid_error
.set_info_corrupted();
13935 soid_error
.set_info_missing(); // Not available too
13940 if (pgbackend
->be_get_ondisk_size(oi
->size
) != p
->second
.size
) {
13941 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13942 << " on disk size (" << p
->second
.size
13943 << ") does not match object info size ("
13944 << oi
->size
<< ") adjusted for ondisk to ("
13945 << pgbackend
->be_get_ondisk_size(oi
->size
)
13947 soid_error
.set_size_mismatch();
13948 ++scrubber
.shallow_errors
;
13951 dout(20) << mode
<< " " << soid
<< " " << oi
.get() << dendl
;
13953 // A clone num_bytes will be added later when we have snapset
13954 if (!soid
.is_snap()) {
13955 stat
.num_bytes
+= oi
->size
;
13957 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
13958 stat
.num_bytes_hit_set_archive
+= oi
->size
;
13960 if (!soid
.is_snapdir()) {
13961 if (oi
->is_dirty())
13962 ++stat
.num_objects_dirty
;
13963 if (oi
->is_whiteout())
13964 ++stat
.num_whiteouts
;
13966 ++stat
.num_objects_omap
;
13967 if (oi
->is_cache_pinned())
13968 ++stat
.num_objects_pinned
;
13971 // pessimistic assumption that this object might contain a
13973 stat
.num_legacy_snapsets
++;
13976 // Check for any problems while processing clones
13977 if (doing_clones(snapset
, curclone
)) {
13978 boost::optional
<snapid_t
> target
;
13979 // Expecting an object with snap for current head
13980 if (soid
.has_snapset() || soid
.get_head() != head
->get_head()) {
13982 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
<< " new object "
13983 << soid
<< " while processing " << head
.get() << dendl
;
13985 target
= all_clones
;
13987 assert(soid
.is_snap());
13988 target
= soid
.snap
;
13991 // Log any clones we were expecting to be there up to target
13992 // This will set missing, but will be a no-op if snap.soid == *curclone.
13993 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
13994 pool
.info
.allow_incomplete_clones(), target
, &curclone
,
13998 // Check doing_clones() again in case we ran process_clones_to()
13999 if (doing_clones(snapset
, curclone
)) {
14000 // A head/snapdir would have processed all clones above
14001 // or all greater than *curclone.
14002 assert(soid
.is_snap() && *curclone
<= soid
.snap
);
14004 // After processing above clone snap should match the expected curclone
14005 expected
= (*curclone
== soid
.snap
);
14007 // If we aren't doing clones any longer, then expecting head/snapdir
14008 expected
= soid
.has_snapset();
14011 // If we couldn't read the head's snapset, just ignore clones
14012 if (head
&& !snapset
) {
14013 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14014 << " clone ignored due to missing snapset";
14016 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14017 << " is an unexpected clone";
14019 ++scrubber
.shallow_errors
;
14020 soid_error
.set_headless();
14021 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
14022 ++soid_error_count
;
14023 if (head
&& soid
.get_head() == head
->get_head())
14024 head_error
.set_clone(soid
.snap
);
14029 if (soid
.has_snapset()) {
14032 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
, mode
,
14033 pool
.info
.allow_incomplete_clones());
14036 // Save previous head error information
14037 if (head
&& (head_error
.errors
|| soid_error_count
))
14038 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
14039 // Set this as a new head object
14042 head_error
= soid_error
;
14043 soid_error_count
= 0;
14045 dout(20) << __func__
<< " " << mode
<< " new head " << head
<< dendl
;
14047 if (p
->second
.attrs
.count(SS_ATTR
) == 0) {
14048 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14049 << " no '" << SS_ATTR
<< "' attr";
14050 ++scrubber
.shallow_errors
;
14051 snapset
= boost::none
;
14052 head_error
.set_snapset_missing();
14055 bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
14056 bufferlist::iterator blp
= bl
.begin();
14058 snapset
= SnapSet(); // Initialize optional<> before decoding into it
14059 ::decode(snapset
.get(), blp
);
14060 head_error
.ss_bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
14061 } catch (buffer::error
& e
) {
14062 snapset
= boost::none
;
14063 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14064 << " can't decode '" << SS_ATTR
<< "' attr " << e
.what();
14065 ++scrubber
.shallow_errors
;
14066 head_error
.set_snapset_corrupted();
14071 // what will be next?
14072 curclone
= snapset
->clones
.rbegin();
14074 if (!snapset
->clones
.empty()) {
14075 dout(20) << " snapset " << snapset
.get() << dendl
;
14076 if (snapset
->seq
== 0) {
14077 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14078 << " snaps.seq not set";
14079 ++scrubber
.shallow_errors
;
14080 head_error
.set_snapset_error();
14084 if (soid
.is_head() && !snapset
->head_exists
) {
14085 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14086 << " snapset.head_exists=false, but head exists";
14087 ++scrubber
.shallow_errors
;
14088 head_error
.set_head_mismatch();
14089 // Fix head_exists locally so is_legacy() returns correctly
14090 snapset
->head_exists
= true;
14092 if (soid
.is_snapdir() && snapset
->head_exists
) {
14093 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14094 << " snapset.head_exists=true, but snapdir exists";
14095 ++scrubber
.shallow_errors
;
14096 head_error
.set_head_mismatch();
14097 // For symmetry fix this too, but probably doesn't matter
14098 snapset
->head_exists
= false;
14101 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
14102 if (soid
.is_snapdir()) {
14103 dout(10) << " will move snapset to head from " << soid
<< dendl
;
14104 snapset_to_repair
[soid
.get_head()] = *snapset
;
14105 } else if (snapset
->is_legacy()) {
14106 dout(10) << " will convert legacy snapset on " << soid
<< " " << *snapset
14108 snapset_to_repair
[soid
.get_head()] = *snapset
;
14111 stat
.num_legacy_snapsets
++;
14114 // pessimistic assumption that this object might contain a
14116 stat
.num_legacy_snapsets
++;
14119 assert(soid
.is_snap());
14122 assert(soid
.snap
== *curclone
);
14124 dout(20) << __func__
<< " " << mode
<< " matched clone " << soid
<< dendl
;
14126 if (snapset
->clone_size
.count(soid
.snap
) == 0) {
14127 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14128 << " is missing in clone_size";
14129 ++scrubber
.shallow_errors
;
14130 soid_error
.set_size_mismatch();
14132 if (oi
&& oi
->size
!= snapset
->clone_size
[soid
.snap
]) {
14133 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14134 << " size " << oi
->size
<< " != clone_size "
14135 << snapset
->clone_size
[*curclone
];
14136 ++scrubber
.shallow_errors
;
14137 soid_error
.set_size_mismatch();
14140 if (snapset
->clone_overlap
.count(soid
.snap
) == 0) {
14141 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14142 << " is missing in clone_overlap";
14143 ++scrubber
.shallow_errors
;
14144 soid_error
.set_size_mismatch();
14146 // This checking is based on get_clone_bytes(). The first 2 asserts
14147 // can't happen because we know we have a clone_size and
14148 // a clone_overlap. Now we check that the interval_set won't
14149 // cause the last assert.
14150 uint64_t size
= snapset
->clone_size
.find(soid
.snap
)->second
;
14151 const interval_set
<uint64_t> &overlap
=
14152 snapset
->clone_overlap
.find(soid
.snap
)->second
;
14153 bool bad_interval_set
= false;
14154 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
14155 i
!= overlap
.end(); ++i
) {
14156 if (size
< i
.get_len()) {
14157 bad_interval_set
= true;
14160 size
-= i
.get_len();
14163 if (bad_interval_set
) {
14164 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14165 << " bad interval_set in clone_overlap";
14166 ++scrubber
.shallow_errors
;
14167 soid_error
.set_size_mismatch();
14169 stat
.num_bytes
+= snapset
->get_clone_bytes(soid
.snap
);
14174 // migrate legacy_snaps to snapset?
14175 auto p
= snapset_to_repair
.find(soid
.get_head());
14176 if (p
!= snapset_to_repair
.end()) {
14177 if (!oi
|| oi
->legacy_snaps
.empty()) {
14178 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
14179 << " has no oi or legacy_snaps; cannot convert "
14181 ++scrubber
.shallow_errors
;
14183 dout(20) << __func__
<< " copying legacy_snaps " << oi
->legacy_snaps
14184 << " to snapset " << p
->second
<< dendl
;
14185 p
->second
.clone_snaps
[soid
.snap
] = oi
->legacy_snaps
;
14191 if (soid_error
.errors
) {
14192 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
14193 ++soid_error_count
;
14197 scrub_cstat
.add(stat
);
14200 if (doing_clones(snapset
, curclone
)) {
14201 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
14202 << " No more objects while processing " << head
.get() << dendl
;
14204 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
14205 pool
.info
.allow_incomplete_clones(), all_clones
, &curclone
,
14208 // There could be missing found by the test above or even
14209 // before dropping out of the loop for the last head.
14211 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
,
14212 mode
, pool
.info
.allow_incomplete_clones());
14214 if (head
&& (head_error
.errors
|| soid_error_count
))
14215 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
14217 for (auto p
= missing_digest
.begin(); p
!= missing_digest
.end(); ++p
) {
14218 if (p
->first
.is_snapdir())
14220 dout(10) << __func__
<< " recording digests for " << p
->first
<< dendl
;
14221 ObjectContextRef obc
= get_object_context(p
->first
, false);
14223 osd
->clog
->error() << info
.pgid
<< " " << mode
14224 << " cannot get object context for object "
14227 } else if (obc
->obs
.oi
.soid
!= p
->first
) {
14228 osd
->clog
->error() << info
.pgid
<< " " << mode
14229 << " object " << p
->first
14230 << " has a valid oi attr with a mismatched name, "
14231 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
14234 OpContextUPtr ctx
= simple_opc_create(obc
);
14235 ctx
->at_version
= get_next_version();
14236 ctx
->mtime
= utime_t(); // do not update mtime
14237 if (p
->second
.first
) {
14238 ctx
->new_obs
.oi
.set_data_digest(*p
->second
.first
);
14240 ctx
->new_obs
.oi
.clear_data_digest();
14242 if (p
->second
.second
) {
14243 ctx
->new_obs
.oi
.set_omap_digest(*p
->second
.second
);
14245 ctx
->new_obs
.oi
.clear_omap_digest();
14247 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
14249 ctx
->register_on_success(
14251 dout(20) << "updating scrub digest" << dendl
;
14252 if (--scrubber
.num_digest_updates_pending
== 0) {
14257 simple_opc_submit(std::move(ctx
));
14258 ++scrubber
.num_digest_updates_pending
;
14260 for (auto& p
: snapset_to_repair
) {
14261 // cache pools may not have the clones, which means we won't know
14262 // what snaps they have. fake out the clone_snaps entries anyway (with
14263 // blank snap lists).
14264 p
.second
.head_exists
= true;
14265 if (pool
.info
.allow_incomplete_clones()) {
14266 for (auto s
: p
.second
.clones
) {
14267 if (p
.second
.clone_snaps
.count(s
) == 0) {
14268 dout(10) << __func__
<< " " << p
.first
<< " faking clone_snaps for "
14270 p
.second
.clone_snaps
[s
];
14274 if (p
.second
.clones
.size() != p
.second
.clone_snaps
.size() ||
14275 p
.second
.is_legacy()) {
14276 // this happens if we encounter other errors above, like a missing
14278 dout(10) << __func__
<< " not writing snapset to " << p
.first
14279 << " snapset " << p
.second
<< " clones " << p
.second
.clones
14280 << "; didn't convert fully" << dendl
;
14281 scrub_cstat
.sum
.num_legacy_snapsets
++;
14284 dout(10) << __func__
<< " writing snapset to " << p
.first
14285 << " " << p
.second
<< dendl
;
14286 ObjectContextRef obc
= get_object_context(p
.first
, true);
14288 osd
->clog
->error() << info
.pgid
<< " " << mode
14289 << " cannot get object context for object "
14292 } else if (obc
->obs
.oi
.soid
!= p
.first
) {
14293 osd
->clog
->error() << info
.pgid
<< " " << mode
14294 << " object " << p
.first
14295 << " has a valid oi attr with a mismatched name, "
14296 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
14299 ObjectContextRef snapset_obc
;
14300 if (!obc
->obs
.exists
) {
14301 snapset_obc
= get_object_context(p
.first
.get_snapdir(), false);
14302 if (!snapset_obc
) {
14303 osd
->clog
->error() << info
.pgid
<< " " << mode
14304 << " cannot get object context for "
14305 << p
.first
.get_snapdir();
14309 OpContextUPtr ctx
= simple_opc_create(obc
);
14310 PGTransaction
*t
= ctx
->op_t
.get();
14311 ctx
->snapset_obc
= snapset_obc
;
14312 ctx
->at_version
= get_next_version();
14313 ctx
->mtime
= utime_t(); // do not update mtime
14314 ctx
->new_snapset
= p
.second
;
14315 if (!ctx
->new_obs
.exists
) {
14316 dout(20) << __func__
<< " making " << p
.first
<< " a whiteout" << dendl
;
14317 ctx
->new_obs
.exists
= true;
14318 ctx
->new_snapset
.head_exists
= true;
14319 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
14320 ++ctx
->delta_stats
.num_whiteouts
;
14321 ++ctx
->delta_stats
.num_objects
;
14322 t
->create(p
.first
);
14323 if (p
.first
< scrubber
.start
) {
14324 dout(20) << __func__
<< " kludging around update outside of scrub range"
14327 scrub_cstat
.add(ctx
->delta_stats
);
14330 dout(20) << __func__
<< " final snapset " << ctx
->new_snapset
<< dendl
;
14331 assert(!ctx
->new_snapset
.is_legacy());
14332 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
14333 ctx
->register_on_success(
14335 dout(20) << "updating snapset" << dendl
;
14336 if (--scrubber
.num_digest_updates_pending
== 0) {
14341 simple_opc_submit(std::move(ctx
));
14342 ++scrubber
.num_digest_updates_pending
;
14345 dout(10) << __func__
<< " (" << mode
<< ") finish" << dendl
;
14348 void PrimaryLogPG::_scrub_clear_state()
14350 scrub_cstat
= object_stat_collection_t();
14353 void PrimaryLogPG::_scrub_finish()
14355 bool repair
= state_test(PG_STATE_REPAIR
);
14356 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
14357 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
14359 if (info
.stats
.stats_invalid
) {
14360 info
.stats
.stats
= scrub_cstat
;
14361 info
.stats
.stats_invalid
= false;
14364 agent_choose_mode();
14367 dout(10) << mode
<< " got "
14368 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
14369 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
14370 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
14371 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
14372 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
14373 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
14374 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
14375 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes."
14378 if (scrub_cstat
.sum
.num_objects
!= info
.stats
.stats
.sum
.num_objects
||
14379 scrub_cstat
.sum
.num_object_clones
!= info
.stats
.stats
.sum
.num_object_clones
||
14380 (scrub_cstat
.sum
.num_objects_dirty
!= info
.stats
.stats
.sum
.num_objects_dirty
&&
14381 !info
.stats
.dirty_stats_invalid
) ||
14382 (scrub_cstat
.sum
.num_objects_omap
!= info
.stats
.stats
.sum
.num_objects_omap
&&
14383 !info
.stats
.omap_stats_invalid
) ||
14384 (scrub_cstat
.sum
.num_objects_pinned
!= info
.stats
.stats
.sum
.num_objects_pinned
&&
14385 !info
.stats
.pin_stats_invalid
) ||
14386 (scrub_cstat
.sum
.num_objects_hit_set_archive
!= info
.stats
.stats
.sum
.num_objects_hit_set_archive
&&
14387 !info
.stats
.hitset_stats_invalid
) ||
14388 (scrub_cstat
.sum
.num_bytes_hit_set_archive
!= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
&&
14389 !info
.stats
.hitset_bytes_stats_invalid
) ||
14390 scrub_cstat
.sum
.num_whiteouts
!= info
.stats
.stats
.sum
.num_whiteouts
||
14391 scrub_cstat
.sum
.num_bytes
!= info
.stats
.stats
.sum
.num_bytes
) {
14392 osd
->clog
->error() << info
.pgid
<< " " << mode
14393 << " stat mismatch, got "
14394 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
14395 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
14396 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
14397 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
14398 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
14399 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
14400 << scrub_cstat
.sum
.num_whiteouts
<< "/" << info
.stats
.stats
.sum
.num_whiteouts
<< " whiteouts, "
14401 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
14402 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes.";
14403 ++scrubber
.shallow_errors
;
14407 info
.stats
.stats
= scrub_cstat
;
14408 info
.stats
.dirty_stats_invalid
= false;
14409 info
.stats
.omap_stats_invalid
= false;
14410 info
.stats
.hitset_stats_invalid
= false;
14411 info
.stats
.hitset_bytes_stats_invalid
= false;
14412 publish_stats_to_osd();
14415 } else if (scrub_cstat
.sum
.num_legacy_snapsets
!=
14416 info
.stats
.stats
.sum
.num_legacy_snapsets
) {
14417 osd
->clog
->info() << info
.pgid
<< " " << mode
<< " updated num_legacy_snapsets"
14418 << " from " << info
.stats
.stats
.sum
.num_legacy_snapsets
14419 << " -> " << scrub_cstat
.sum
.num_legacy_snapsets
<< "\n";
14420 info
.stats
.stats
.sum
.num_legacy_snapsets
= scrub_cstat
.sum
.num_legacy_snapsets
;
14421 publish_stats_to_osd();
14424 // Clear object context cache to get repair information
14426 object_contexts
.clear();
14429 bool PrimaryLogPG::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
14431 return osd
->check_osdmap_full(missing_on
);
14434 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpRequestRef op
)
14436 // Only supports replicated pools
14437 assert(!pool
.info
.require_rollback());
14438 assert(is_primary());
14440 dout(10) << __func__
<< " " << soid
14441 << " peers osd.{" << actingbackfill
<< "}" << dendl
;
14444 block_for_clean(soid
, op
);
14448 assert(!pg_log
.get_missing().is_missing(soid
));
14452 int r
= get_pgbackend()->objects_get_attr(soid
, OI_ATTR
, &bv
);
14454 // Leave v and try to repair without a version, getting attr failed
14455 dout(0) << __func__
<< ": Need version of replica, objects_get_attr failed: "
14456 << soid
<< " error=" << r
<< dendl
;
14458 bufferlist::iterator bliter
= bv
.begin();
14459 ::decode(oi
, bliter
);
14462 // Leave v as default constructed. This will fail when sent to older OSDs, but
14463 // not much worse than failing here.
14464 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
14467 missing_loc
.add_missing(soid
, v
, eversion_t());
14468 if (primary_error(soid
, v
)) {
14469 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
14470 // XXX: If we knew that there is no down osd which could include this
14471 // object, it would be nice if we could return EIO here.
14472 // If a "never fail" flag was available, that could be used
14473 // for rbd to NOT return EIO until object marked lost.
14475 // Drop through to save this op in case an osd comes up with the object.
14478 // Restart the op after object becomes readable again
14479 waiting_for_unreadable_object
[soid
].push_back(op
);
14480 op
->mark_delayed("waiting for missing object");
14482 if (!eio_errors_to_process
) {
14483 eio_errors_to_process
= true;
14484 assert(is_clean());
14485 queue_peering_event(
14487 std::make_shared
<CephPeeringEvt
>(
14488 get_osdmap()->get_epoch(),
14489 get_osdmap()->get_epoch(),
14492 // A prior error must have already cleared clean state and queued recovery
14493 // or a map change has triggered re-peering.
14494 // Not inlining the recovery by calling maybe_kick_recovery(soid);
14495 dout(5) << __func__
<< ": Read error on " << soid
<< ", but already seen errors" << dendl
;
14501 /*---SnapTrimmer Logging---*/
14503 #define dout_prefix *_dout << pg->gen_prefix()
14505 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
14507 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
14510 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
14512 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
14515 /*---SnapTrimmer states---*/
14517 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14518 << "SnapTrimmer state<" << get_state_name() << ">: ")
14521 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
14523 NamedState(context
< SnapTrimmer
>().pg
, "NotTrimming")
14525 context
< SnapTrimmer
>().log_enter(state_name
);
14528 void PrimaryLogPG::NotTrimming::exit()
14530 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
14533 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
14535 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
14536 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
14538 if (!(pg
->is_primary() && pg
->is_active())) {
14539 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
14540 return discard_event();
14542 if (!pg
->is_clean() ||
14543 pg
->snap_trimq
.empty()) {
14544 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
14545 return discard_event();
14547 if (pg
->scrubber
.active
) {
14548 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
14549 return transit
< WaitScrub
>();
14551 return transit
< Trimming
>();
14555 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
14557 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
14558 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
14561 if (!context
< SnapTrimmer
>().can_trim()) {
14562 post_event(KickTrim());
14563 return transit
< NotTrimming
>();
14566 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
14567 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
14568 << pg
->snap_trimq
.range_start()
14570 return transit
< AwaitAsyncWork
>();
14573 /* AwaitAsyncWork */
14574 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
14576 NamedState(context
< SnapTrimmer
>().pg
, "Trimming/AwaitAsyncWork")
14578 auto *pg
= context
< SnapTrimmer
>().pg
;
14579 context
< SnapTrimmer
>().log_enter(state_name
);
14580 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
14581 pg
->state_set(PG_STATE_SNAPTRIM
);
14582 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
14583 pg
->publish_stats_to_osd();
14586 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
14588 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
14589 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
14590 auto &in_flight
= context
<Trimming
>().in_flight
;
14591 assert(in_flight
.empty());
14593 assert(pg
->is_primary() && pg
->is_active());
14594 if (!context
< SnapTrimmer
>().can_trim()) {
14595 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
14596 post_event(KickTrim());
14597 return transit
< NotTrimming
>();
14600 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
14602 vector
<hobject_t
> to_trim
;
14603 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
14604 to_trim
.reserve(max
);
14605 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
14609 if (r
!= 0 && r
!= -ENOENT
) {
14610 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
14611 << cpp_strerror(r
) << dendl
;
14612 assert(0 == "get_next_objects_to_trim returned an invalid code");
14613 } else if (r
== -ENOENT
) {
14615 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
14617 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
14618 << " to purged_snaps"
14620 pg
->info
.purged_snaps
.insert(snap_to_trim
);
14621 pg
->snap_trimq
.erase(snap_to_trim
);
14622 ldout(pg
->cct
, 10) << "purged_snaps now "
14623 << pg
->info
.purged_snaps
<< ", snap_trimq now "
14624 << pg
->snap_trimq
<< dendl
;
14626 ObjectStore::Transaction t
;
14627 pg
->dirty_big_info
= true;
14628 pg
->write_if_dirty(t
);
14629 int tr
= pg
->osd
->store
->queue_transaction(pg
->osr
.get(), std::move(t
), NULL
);
14632 pg
->share_pg_info();
14633 post_event(KickTrim());
14634 return transit
< NotTrimming
>();
14636 assert(!to_trim
.empty());
14638 for (auto &&object
: to_trim
) {
14640 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
14642 int error
= pg
->trim_object(in_flight
.empty(), object
, &ctx
);
14644 if (error
== -ENOLCK
) {
14645 ldout(pg
->cct
, 10) << "could not get write lock on obj "
14646 << object
<< dendl
;
14648 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
14649 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
14651 if (!in_flight
.empty()) {
14652 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
14653 return transit
< WaitRepops
>();
14655 if (error
== -ENOLCK
) {
14656 ldout(pg
->cct
, 10) << "waiting for it to clear"
14658 return transit
< WaitRWLock
>();
14660 return transit
< NotTrimming
>();
14664 in_flight
.insert(object
);
14665 ctx
->register_on_success(
14666 [pg
, object
, &in_flight
]() {
14667 assert(in_flight
.find(object
) != in_flight
.end());
14668 in_flight
.erase(object
);
14669 if (in_flight
.empty()) {
14670 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
14671 pg
->snap_trimmer_machine
.process_event(Reset());
14673 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
14678 pg
->simple_opc_submit(std::move(ctx
));
14681 return transit
< WaitRepops
>();
14684 void PrimaryLogPG::setattr_maybe_cache(
14685 ObjectContextRef obc
,
14691 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
14694 void PrimaryLogPG::setattrs_maybe_cache(
14695 ObjectContextRef obc
,
14698 map
<string
, bufferlist
> &attrs
)
14700 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
14703 void PrimaryLogPG::rmattr_maybe_cache(
14704 ObjectContextRef obc
,
14709 t
->rmattr(obc
->obs
.oi
.soid
, key
);
14712 int PrimaryLogPG::getattr_maybe_cache(
14713 ObjectContextRef obc
,
14717 if (pool
.info
.require_rollback()) {
14718 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
14719 if (i
!= obc
->attr_cache
.end()) {
14727 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
14730 int PrimaryLogPG::getattrs_maybe_cache(
14731 ObjectContextRef obc
,
14732 map
<string
, bufferlist
> *out
)
14736 if (pool
.info
.require_rollback()) {
14737 *out
= obc
->attr_cache
;
14739 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
14741 map
<string
, bufferlist
> tmp
;
14742 for (map
<string
, bufferlist
>::iterator i
= out
->begin();
14745 if (i
->first
.size() > 1 && i
->first
[0] == '_')
14746 tmp
[i
->first
.substr(1, i
->first
.size())].claim(i
->second
);
14752 bool PrimaryLogPG::check_failsafe_full(ostream
&ss
) {
14753 return osd
->check_failsafe_full(ss
);
14756 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
14757 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
14759 #ifdef PG_DEBUG_REFS
14760 uint64_t get_with_id(PrimaryLogPG
*pg
) { return pg
->get_with_id(); }
14761 void put_with_id(PrimaryLogPG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
14764 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
14765 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }