1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include "boost/tuple/tuple.hpp"
19 #include "boost/intrusive_ptr.hpp"
21 #include "PrimaryLogPG.h"
23 #include "OpRequest.h"
24 #include "ScrubStore.h"
26 #include "objclass/objclass.h"
28 #include "common/errno.h"
29 #include "common/scrub_types.h"
30 #include "common/perf_counters.h"
32 #include "messages/MOSDOp.h"
33 #include "messages/MOSDBackoff.h"
34 #include "messages/MOSDSubOp.h"
35 #include "messages/MOSDSubOpReply.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDRepScrub.h"
39 #include "messages/MOSDPGBackfill.h"
40 #include "messages/MOSDPGBackfillRemove.h"
41 #include "messages/MOSDPGUpdateLogMissing.h"
42 #include "messages/MOSDPGUpdateLogMissingReply.h"
43 #include "messages/MCommandReply.h"
44 #include "messages/MOSDScrubReserve.h"
45 #include "mds/inode_backtrace.h" // Ugh
46 #include "common/EventTrace.h"
48 #include "common/config.h"
49 #include "include/compat.h"
50 #include "mon/MonClient.h"
51 #include "osdc/Objecter.h"
52 #include "json_spirit/json_spirit_value.h"
53 #include "json_spirit/json_spirit_reader.h"
54 #include "include/assert.h" // json_spirit clobbers it
55 #include "include/rados/rados_types.hpp"
58 #include "tracing/osd.h"
60 #define tracepoint(...)
63 #define dout_context cct
64 #define dout_subsys ceph_subsys_osd
65 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
67 #define dout_prefix _prefix(_dout, this)
// Logging prefix helper referenced by the dout_prefix macro above: emits
// the PG's gen_prefix() ahead of every debug line.
// NOTE(review): the template header ("template <typename T>", orig line 68)
// and the closing brace are not visible in this chunk of the file.
69 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
70 return *_dout
<< pg
->gen_prefix();
79 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
// PGLSFilter lifecycle: the base constructor leaves cct null until a
// concrete filter (e.g. PGLSParentFilter below) fills it in.
// NOTE(review): the constructor and destructor bodies (orig lines 82-88)
// are not visible in this chunk.
81 PGLSFilter::PGLSFilter() : cct(nullptr)
85 PGLSFilter::~PGLSFilter()
89 struct PrimaryLogPG::C_OSD_OnApplied
: Context
{
97 : pg(pg
), epoch(epoch
), v(v
) {}
98 void finish(int) override
{
100 if (!pg
->pg_has_reset_since(epoch
))
107 * The CopyCallback class defines an interface for completions to the
108 * copy_start code. Users of the copy infrastructure must implement
109 * one and give an instance of the class to start_copy.
111 * The implementer is responsible for making sure that the CopyCallback
112 * can associate itself with the correct copy operation.
114 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
118 * results.get<0>() is the return code: 0 for success; -ECANCELED if
119 * the operation was cancelled by the local OSD; -errno for other issues.
120 * results.get<1>() is a pointer to a CopyResults object, which you are
121 * responsible for deleting.
123 void finish(CopyCallbackResults results_
) override
= 0;
126 /// Provide the final size of the copied object to the CopyCallback
127 ~CopyCallback() override
{}
130 template <typename T
>
131 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
133 unique_ptr
<GenContext
<T
>> c
;
136 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
137 : pg(pg
), c(c
), e(e
) {}
138 void finish(T t
) override
{
140 if (pg
->pg_has_reset_since(e
))
143 c
.release()->complete(t
);
148 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
149 GenContext
<ThreadPool::TPHandle
&> *c
) {
150 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
151 this, c
, get_osdmap()->get_epoch());
// Wrapper used by bless_context(): runs the wrapped Context only if the
// PG has not been reset since epoch e. Ownership of the callback is held
// in a unique_ptr and release()d at completion time.
154 class PrimaryLogPG::BlessedContext
: public Context
{
156 unique_ptr
<Context
> c
;
159 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
160 : pg(pg
), c(c
), e(e
) {}
161 void finish(int r
) override
{
// NOTE(review): the lines between the reset check and the completion
// call (orig 162, 164-165 — presumably the PG lock scope and early
// return) are not visible in this chunk.
163 if (pg
->pg_has_reset_since(e
))
166 c
.release()->complete(r
);
172 Context
*PrimaryLogPG::bless_context(Context
*c
) {
173 return new BlessedContext(this, c
, get_osdmap()->get_epoch());
// Context that invokes the PG's object_context_destructor_callback for
// obc when completed.
// NOTE(review): the member declarations and the constructor initializer
// list (orig lines 177-181) are not visible in this chunk.
176 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
180 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
182 void finish(int r
) override
{
183 pg
->object_context_destructor_callback(obc
);
// Context that drops the ondisk write locks on up to three object
// contexts once a transaction has been applied (registered via
// register_on_applied_sync in on_local_recover below).
187 class PrimaryLogPG::C_OSD_OndiskWriteUnlock
: public Context
{
188 ObjectContextRef obc
, obc2
, obc3
;
// obc is mandatory; obc2/obc3 default to empty refs.
// NOTE(review): the first constructor parameter line (orig 191) is not
// visible in this chunk.
190 C_OSD_OndiskWriteUnlock(
192 ObjectContextRef o2
= ObjectContextRef(),
193 ObjectContextRef o3
= ObjectContextRef()) : obc(o
), obc2(o2
), obc3(o3
) {}
194 void finish(int r
) override
{
195 obc
->ondisk_write_unlock();
// NOTE(review): the guards for the optional refs (orig lines 196 and
// 198, presumably "if (obc2)" / "if (obc3)") are not visible here —
// confirm before assuming the unlocks below run unconditionally.
197 obc2
->ondisk_write_unlock();
199 obc3
->ondisk_write_unlock();
// Completion for OpContext::start_async_reads(): records the read result
// on the op context and resumes its processing via finish_read().
// NOTE(review): the "pg" member declaration and the first constructor
// parameter line (orig 204, 206-207) are not visible in this chunk.
203 struct OnReadComplete
: public Context
{
205 PrimaryLogPG::OpContext
*opcontext
;
208 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
209 void finish(int r
) override
{
// stash the aggregate read return code, then let the op context resume
211 opcontext
->async_read_result
= r
;
212 opcontext
->finish_read(pg
);
214 ~OnReadComplete() override
{}
// Context that notifies the PG that a recovered object's transaction has
// been applied locally (primary side; see on_local_recover below).
217 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
219 ObjectContextRef obc
;
// NOTE(review): the "pg" member and the constructor initializer list
// (orig 218, 221-222) are not fully visible in this chunk.
221 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
223 void finish(int r
) override
{
224 pg
->_applied_recovered_object(obc
);
// Context that tells the PG a pushed object has committed to disk,
// carrying the epoch and last_complete version captured at construction.
// NOTE(review): the "pg"/"epoch" member declarations (orig 229-230) are
// not visible in this chunk.
228 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
231 eversion_t last_complete
;
233 C_OSD_CommittedPushedObject(
234 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
235 pg(p
), epoch(epoch
), last_complete(lc
) {
237 void finish(int r
) override
{
238 pg
->_committed_pushed_object(epoch
, last_complete
);
// Replica-side counterpart of C_OSD_AppliedRecoveredObject: just pings
// the PG (no obc needed) when the recovered object's transaction applies.
242 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
245 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
247 void finish(int r
) override
{
248 pg
->_applied_recovered_object_replica();
253 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
256 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
257 pair
<bufferlist
*, Context
*> > > in
;
258 in
.swap(pending_async_reads
);
259 pg
->pgbackend
->objects_read_async(
262 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
// Called by OnReadComplete when one async read finishes. Once all reads
// are in, pops this context off the PG's in-flight queue — it must be at
// the front, since reads complete in submission order — and completes
// the op with the stored result.
264 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
266 assert(inflightreads
> 0);
// NOTE(review): the decrement of inflightreads (orig 267) is not visible
// in this chunk.
268 if (async_reads_complete()) {
269 assert(pg
->in_progress_async_reads
.size());
270 assert(pg
->in_progress_async_reads
.front().second
== this);
271 pg
->in_progress_async_reads
.pop_front();
272 pg
->complete_read_ctx(async_read_result
, this);
276 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
278 PrimaryLogPG::CopyResults
*results
;
280 PrimaryLogPG::OpContext
*ctx
;
281 explicit CopyFromCallback(PrimaryLogPG::OpContext
*ctx_
)
285 ~CopyFromCallback() override
{}
287 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
288 results
= results_
.get
<1>();
289 int r
= results_
.get
<0>();
292 // for finish_copyfrom
293 ctx
->user_at_version
= results
->user_version
;
296 ctx
->pg
->execute_ctx(ctx
);
300 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
302 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
303 } else if (results
->should_requeue
) {
305 ctx
->pg
->requeue_op(ctx
->op
);
307 ctx
->pg
->close_op_ctx(ctx
);
311 bool is_temp_obj_used() {
312 return results
->started_temp_obj
;
314 uint64_t get_data_size() {
315 return results
->object_size
;
322 // ======================
323 // PGBackend::Listener
325 void PrimaryLogPG::on_local_recover(
326 const hobject_t
&hoid
,
327 const ObjectRecoveryInfo
&_recovery_info
,
328 ObjectContextRef obc
,
329 ObjectStore::Transaction
*t
332 dout(10) << __func__
<< ": " << hoid
<< dendl
;
334 ObjectRecoveryInfo
recovery_info(_recovery_info
);
335 clear_object_snap_mapping(t
, hoid
);
336 if (recovery_info
.soid
.is_snap()) {
337 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
339 dout(20) << " snapset " << recovery_info
.ss
340 << " legacy_snaps " << recovery_info
.oi
.legacy_snaps
<< dendl
;
341 if (recovery_info
.ss
.is_legacy() ||
342 recovery_info
.ss
.seq
== 0 /* jewel osd doesn't populate this */) {
343 assert(recovery_info
.oi
.legacy_snaps
.size());
344 snaps
.insert(recovery_info
.oi
.legacy_snaps
.begin(),
345 recovery_info
.oi
.legacy_snaps
.end());
347 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
348 assert(p
!= recovery_info
.ss
.clone_snaps
.end()); // hmm, should we warn?
349 snaps
.insert(p
->second
.begin(), p
->second
.end());
351 dout(20) << " snaps " << snaps
<< dendl
;
357 if (pg_log
.get_missing().is_missing(recovery_info
.soid
) &&
358 pg_log
.get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
359 assert(is_primary());
360 const pg_log_entry_t
*latest
= pg_log
.get_log().objects
.find(recovery_info
.soid
)->second
;
361 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
362 latest
->reverting_to
== recovery_info
.version
) {
363 dout(10) << " got old revert version " << recovery_info
.version
364 << " for " << *latest
<< dendl
;
365 recovery_info
.version
= latest
->version
;
366 // update the attr to the revert event version
367 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
368 recovery_info
.oi
.version
= latest
->version
;
370 ::encode(recovery_info
.oi
, bl
,
371 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
372 assert(!pool
.info
.require_rollback());
373 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
375 obc
->attr_cache
[OI_ATTR
] = bl
;
379 // keep track of active pushes for scrub
382 if (recovery_info
.version
> pg_log
.get_can_rollback_to()) {
383 /* This can only happen during a repair, and even then, it would
384 * be one heck of a race. If we are repairing the object, the
385 * write in question must be fully committed, so it's not valid
386 * to roll it back anyway (and we'll be rolled forward shortly
388 PGLogEntryHandler h
{this, t
};
389 pg_log
.roll_forward_to(recovery_info
.version
, &h
);
391 recover_got(recovery_info
.soid
, recovery_info
.version
);
395 obc
->obs
.exists
= true;
396 obc
->ondisk_write_lock();
398 bool got
= obc
->get_recovery_read();
401 assert(recovering
.count(obc
->obs
.oi
.soid
));
402 recovering
[obc
->obs
.oi
.soid
] = obc
;
403 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
406 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
407 t
->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc
));
409 publish_stats_to_osd();
410 assert(missing_loc
.needs_recovery(hoid
));
411 missing_loc
.add_location(hoid
, pg_whoami
);
412 release_backoffs(hoid
);
413 if (!is_unreadable_object(hoid
)) {
414 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
415 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
416 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
417 requeue_ops(unreadable_object_entry
->second
);
418 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
422 t
->register_on_applied(
423 new C_OSD_AppliedRecoveredObjectReplica(this));
427 t
->register_on_commit(
428 new C_OSD_CommittedPushedObject(
430 get_osdmap()->get_epoch(),
431 info
.last_complete
));
438 void PrimaryLogPG::on_global_recover(
439 const hobject_t
&soid
,
440 const object_stat_sum_t
&stat_diff
)
442 info
.stats
.stats
.sum
.add(stat_diff
);
443 missing_loc
.recovered(soid
);
444 publish_stats_to_osd();
445 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
446 map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.find(soid
);
447 assert(i
!= recovering
.end());
449 // recover missing won't have had an obc, but it gets filled in
450 // during on_local_recover
452 list
<OpRequestRef
> requeue_list
;
453 i
->second
->drop_recovery_read(&requeue_list
);
454 requeue_ops(requeue_list
);
456 backfills_in_flight
.erase(soid
);
459 finish_recovery_op(soid
);
460 release_backoffs(soid
);
461 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
462 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
463 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
464 requeue_ops(degraded_object_entry
->second
);
465 waiting_for_degraded_object
.erase(degraded_object_entry
);
467 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
468 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
469 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
470 requeue_ops(unreadable_object_entry
->second
);
471 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
473 finish_degraded_object(soid
);
// PGBackend::Listener hook: a replica has recovered soid. Publish
// updated stats and record the recovered version in that peer's missing
// set.
// NOTE(review): the pg_shard_t peer parameter (orig line 477) is not
// visible in this chunk.
476 void PrimaryLogPG::on_peer_recover(
478 const hobject_t
&soid
,
479 const ObjectRecoveryInfo
&recovery_info
)
481 publish_stats_to_osd();
483 peer_missing
[peer
].got(soid
, recovery_info
.version
);
// PGBackend::Listener hook: a push to the peer is starting, so mark soid
// as not-held by that peer (revise_have with a null version) until the
// push completes.
// NOTE(review): the pg_shard_t peer parameter (orig line 487) is not
// visible in this chunk.
486 void PrimaryLogPG::begin_peer_recover(
488 const hobject_t soid
)
490 peer_missing
[peer
].revise_have(soid
, eversion_t());
493 void PrimaryLogPG::schedule_recovery_work(
494 GenContext
<ThreadPool::TPHandle
&> *c
)
496 osd
->recovery_gen_wq
.queue(c
);
499 void PrimaryLogPG::send_message_osd_cluster(
500 int peer
, Message
*m
, epoch_t from_epoch
)
502 osd
->send_message_osd_cluster(peer
, m
, from_epoch
);
505 void PrimaryLogPG::send_message_osd_cluster(
506 Message
*m
, Connection
*con
)
508 osd
->send_message_osd_cluster(m
, con
);
511 void PrimaryLogPG::send_message_osd_cluster(
512 Message
*m
, const ConnectionRef
& con
)
514 osd
->send_message_osd_cluster(m
, con
);
// Record a primary-side error on oid: log loudly (level 0), mark the
// object errored via primary_error(), drop it from backfills_in_flight,
// and re-register it as missing so recovery will retry it.
// NOTE(review): the eversion_t v parameter (orig line 519) is not
// visible in this chunk.
517 void PrimaryLogPG::on_primary_error(
518 const hobject_t
&oid
,
521 dout(0) << __func__
<< ": oid " << oid
<< " version " << v
<< dendl
;
523 primary_error(oid
, v
);
524 backfills_in_flight
.erase(oid
);
525 missing_loc
.add_missing(oid
, v
, eversion_t());
528 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
529 int peer
, epoch_t from_epoch
)
531 return osd
->get_con_osd_cluster(peer
, from_epoch
);
// Accessor for the OSD-wide perf counters.
// NOTE(review): the function body (orig lines 535-537, presumably
// "return osd->logger;") is not visible in this chunk — confirm against
// the full source.
534 PerfCounters
*PrimaryLogPG::get_logger()
540 // ====================
543 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
545 return pg_log
.get_missing().get_items().count(soid
);
548 void PrimaryLogPG::maybe_kick_recovery(
549 const hobject_t
&soid
)
552 if (!missing_loc
.needs_recovery(soid
, &v
))
555 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
556 if (p
!= recovering
.end()) {
557 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
558 } else if (missing_loc
.is_unfound(soid
)) {
559 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
561 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
562 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
563 if (is_missing_object(soid
)) {
564 recover_missing(soid
, v
, cct
->_conf
->osd_client_op_priority
, h
);
566 prep_object_replica_pushes(soid
, v
, h
);
568 pgbackend
->run_recovery_op(h
, cct
->_conf
->osd_client_op_priority
);
572 void PrimaryLogPG::wait_for_unreadable_object(
573 const hobject_t
& soid
, OpRequestRef op
)
575 assert(is_unreadable_object(soid
));
576 maybe_kick_recovery(soid
);
577 waiting_for_unreadable_object
[soid
].push_back(op
);
578 op
->mark_delayed("waiting for missing object");
581 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
583 /* The conditions below may clear (on_local_recover, before we queue
584 * the transaction) before we actually requeue the degraded waiters
585 * in on_global_recover after the transaction completes.
587 if (waiting_for_degraded_object
.count(soid
))
589 if (pg_log
.get_missing().get_items().count(soid
))
591 assert(!actingbackfill
.empty());
592 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
593 i
!= actingbackfill
.end();
595 if (*i
== get_primary()) continue;
596 pg_shard_t peer
= *i
;
597 auto peer_missing_entry
= peer_missing
.find(peer
);
598 if (peer_missing_entry
!= peer_missing
.end() &&
599 peer_missing_entry
->second
.get_items().count(soid
))
602 // Object is degraded if after last_backfill AND
603 // we are backfilling it
604 if (is_backfill_targets(peer
) &&
605 peer_info
[peer
].last_backfill
<= soid
&&
606 last_backfill_started
>= soid
&&
607 backfills_in_flight
.count(soid
))
613 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
615 assert(is_degraded_or_backfilling_object(soid
));
617 maybe_kick_recovery(soid
);
618 waiting_for_degraded_object
[soid
].push_back(op
);
619 op
->mark_delayed("waiting for degraded object");
622 void PrimaryLogPG::block_write_on_full_cache(
623 const hobject_t
& _oid
, OpRequestRef op
)
625 const hobject_t oid
= _oid
.get_head();
626 dout(20) << __func__
<< ": blocking object " << oid
627 << " on full cache" << dendl
;
628 objects_blocked_on_cache_full
.insert(oid
);
629 waiting_for_cache_not_full
.push_back(op
);
630 op
->mark_delayed("waiting for cache not full");
633 void PrimaryLogPG::block_for_clean(
634 const hobject_t
& oid
, OpRequestRef op
)
636 dout(20) << __func__
<< ": blocking object " << oid
637 << " on primary repair" << dendl
;
638 waiting_for_clean_to_primary_repair
.push_back(op
);
639 op
->mark_delayed("waiting for clean to repair");
642 void PrimaryLogPG::block_write_on_snap_rollback(
643 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
645 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
646 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
647 // otherwise, we'd have blocked in do_op
648 assert(oid
.is_head());
649 assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
650 objects_blocked_on_snap_promotion
[oid
] = obc
;
651 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
654 void PrimaryLogPG::block_write_on_degraded_snap(
655 const hobject_t
& snap
, OpRequestRef op
)
657 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
658 << " on degraded snap " << snap
<< dendl
;
659 // otherwise, we'd have blocked in do_op
660 assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
661 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
662 wait_for_degraded_object(snap
, op
);
665 bool PrimaryLogPG::maybe_await_blocked_snapset(
666 const hobject_t
&hoid
,
669 ObjectContextRef obc
;
670 obc
= object_contexts
.lookup(hoid
.get_head());
672 if (obc
->is_blocked()) {
673 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
679 obc
= object_contexts
.lookup(hoid
.get_snapdir());
681 if (obc
->is_blocked()) {
682 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
691 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
693 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
694 waiting_for_blocked_object
[soid
].push_back(op
);
695 op
->mark_delayed("waiting for blocked object");
698 void PrimaryLogPG::maybe_force_recovery()
700 // no force if not in degraded/recovery/backfill stats
701 if (!is_degraded() &&
702 !state_test(PG_STATE_RECOVERING
|
703 PG_STATE_RECOVERY_WAIT
|
705 PG_STATE_BACKFILL_WAIT
|
706 PG_STATE_BACKFILL_TOOFULL
))
709 if (pg_log
.get_log().approx_size() <
710 cct
->_conf
->osd_max_pg_log_entries
*
711 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
714 // find the oldest missing object
715 version_t min_version
= 0;
717 if (!pg_log
.get_missing().get_items().empty()) {
718 min_version
= pg_log
.get_missing().get_rmissing().begin()->first
;
719 soid
= pg_log
.get_missing().get_rmissing().begin()->second
;
721 assert(!actingbackfill
.empty());
722 for (set
<pg_shard_t
>::iterator it
= actingbackfill
.begin();
723 it
!= actingbackfill
.end();
725 if (*it
== get_primary()) continue;
726 pg_shard_t peer
= *it
;
727 if (peer_missing
.count(peer
) &&
728 !peer_missing
[peer
].get_items().empty() &&
729 min_version
> peer_missing
[peer
].get_rmissing().begin()->first
) {
730 min_version
= peer_missing
[peer
].get_rmissing().begin()->first
;
731 soid
= peer_missing
[peer
].get_rmissing().begin()->second
;
736 if (soid
!= hobject_t())
737 maybe_kick_recovery(soid
);
740 class PGLSPlainFilter
: public PGLSFilter
{
743 int init(bufferlist::iterator
¶ms
) override
746 ::decode(xattr
, params
);
747 ::decode(val
, params
);
748 } catch (buffer::error
&e
) {
754 ~PGLSPlainFilter() override
{}
755 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
756 bufferlist
& outdata
) override
;
759 class PGLSParentFilter
: public PGLSFilter
{
760 inodeno_t parent_ino
;
763 PGLSParentFilter(CephContext
* cct
) : cct(cct
) {
766 int init(bufferlist::iterator
¶ms
) override
769 ::decode(parent_ino
, params
);
770 } catch (buffer::error
&e
) {
773 generic_dout(0) << "parent_ino=" << parent_ino
<< dendl
;
777 ~PGLSParentFilter() override
{}
778 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
779 bufferlist
& outdata
) override
;
782 bool PGLSParentFilter::filter(const hobject_t
&obj
,
783 bufferlist
& xattr_data
, bufferlist
& outdata
)
785 bufferlist::iterator iter
= xattr_data
.begin();
786 inode_backtrace_t bt
;
788 generic_dout(0) << "PGLSParentFilter::filter" << dendl
;
792 vector
<inode_backpointer_t
>::iterator vi
;
793 for (vi
= bt
.ancestors
.begin(); vi
!= bt
.ancestors
.end(); ++vi
) {
794 generic_dout(0) << "vi->dirino=" << vi
->dirino
<< " parent_ino=" << parent_ino
<< dendl
;
795 if (vi
->dirino
== parent_ino
) {
796 ::encode(*vi
, outdata
);
804 bool PGLSPlainFilter::filter(const hobject_t
&obj
,
805 bufferlist
& xattr_data
, bufferlist
& outdata
)
807 if (val
.size() != xattr_data
.length())
810 if (memcmp(val
.c_str(), xattr_data
.c_str(), val
.size()))
816 bool PrimaryLogPG::pgls_filter(PGLSFilter
*filter
, hobject_t
& sobj
, bufferlist
& outdata
)
820 // If filter has expressed an interest in an xattr, load it.
821 if (!filter
->get_xattr().empty()) {
822 int ret
= pgbackend
->objects_get_attr(
826 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
->get_xattr() << ") returned " << ret
<< dendl
;
828 if (ret
!= -ENODATA
|| filter
->reject_empty_xattr()) {
834 return filter
->filter(sobj
, bl
, outdata
);
837 int PrimaryLogPG::get_pgls_filter(bufferlist::iterator
& iter
, PGLSFilter
**pfilter
)
843 ::decode(type
, iter
);
845 catch (buffer::error
& e
) {
849 if (type
.compare("parent") == 0) {
850 filter
= new PGLSParentFilter(cct
);
851 } else if (type
.compare("plain") == 0) {
852 filter
= new PGLSPlainFilter();
854 std::size_t dot
= type
.find(".");
855 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
859 const std::string class_name
= type
.substr(0, dot
);
860 const std::string filter_name
= type
.substr(dot
+ 1);
861 ClassHandler::ClassData
*cls
= NULL
;
862 int r
= osd
->class_handler
->open_class(class_name
, &cls
);
864 derr
<< "Error opening class '" << class_name
<< "': "
865 << cpp_strerror(r
) << dendl
;
866 if (r
!= -EPERM
) // propogate permission error
873 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
874 if (class_filter
== NULL
) {
875 derr
<< "Error finding filter '" << filter_name
<< "' in class "
876 << class_name
<< dendl
;
879 filter
= class_filter
->fn();
881 // Object classes are obliged to return us something, but let's
882 // give an error rather than asserting out.
883 derr
<< "Buggy class " << class_name
<< " failed to construct "
884 "filter " << filter_name
<< dendl
;
890 int r
= filter
->init(iter
);
892 derr
<< "Error initializing filter " << type
<< ": "
893 << cpp_strerror(r
) << dendl
;
897 // Successfully constructed and initialized, return it.
904 // ==========================================================
906 int PrimaryLogPG::do_command(
914 const pg_missing_t
&missing
= pg_log
.get_missing();
918 cmd_getval(cct
, cmdmap
, "format", format
);
919 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json"));
922 cmd_getval(cct
, cmdmap
, "cmd", command
);
923 if (command
== "query") {
924 f
->open_object_section("pg");
925 f
->dump_string("state", pg_state_string(get_state()));
926 f
->dump_stream("snap_trimq") << snap_trimq
;
927 f
->dump_unsigned("epoch", get_osdmap()->get_epoch());
928 f
->open_array_section("up");
929 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
930 f
->dump_unsigned("osd", *p
);
932 f
->open_array_section("acting");
933 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
934 f
->dump_unsigned("osd", *p
);
936 if (!backfill_targets
.empty()) {
937 f
->open_array_section("backfill_targets");
938 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
939 p
!= backfill_targets
.end();
941 f
->dump_stream("shard") << *p
;
944 if (!actingbackfill
.empty()) {
945 f
->open_array_section("actingbackfill");
946 for (set
<pg_shard_t
>::iterator p
= actingbackfill
.begin();
947 p
!= actingbackfill
.end();
949 f
->dump_stream("shard") << *p
;
952 f
->open_object_section("info");
953 _update_calc_stats();
957 f
->open_array_section("peer_info");
958 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
959 p
!= peer_info
.end();
961 f
->open_object_section("info");
962 f
->dump_stream("peer") << p
->first
;
963 p
->second
.dump(f
.get());
968 f
->open_array_section("recovery_state");
969 handle_query_state(f
.get());
972 f
->open_object_section("agent_state");
974 agent_state
->dump(f
.get());
981 else if (command
== "mark_unfound_lost") {
983 cmd_getval(cct
, cmdmap
, "mulcmd", mulcmd
);
985 if (mulcmd
== "revert") {
986 if (pool
.info
.ec_pool()) {
987 ss
<< "mode must be 'delete' for ec pool";
990 mode
= pg_log_entry_t::LOST_REVERT
;
991 } else if (mulcmd
== "delete") {
992 mode
= pg_log_entry_t::LOST_DELETE
;
994 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
997 assert(mode
== pg_log_entry_t::LOST_REVERT
||
998 mode
== pg_log_entry_t::LOST_DELETE
);
1000 if (!is_primary()) {
1001 ss
<< "not primary";
1005 uint64_t unfound
= missing_loc
.num_unfound();
1007 ss
<< "pg has no unfound objects";
1008 return 0; // make command idempotent
1011 if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1012 ss
<< "pg has " << unfound
1013 << " unfound objects but we haven't probed all sources, not marking lost";
1017 mark_all_unfound_lost(mode
, con
, tid
);
1020 else if (command
== "list_missing") {
1023 if (cmd_getval(cct
, cmdmap
, "offset", offset_json
)) {
1024 json_spirit::Value v
;
1026 if (!json_spirit::read(offset_json
, v
))
1027 throw std::runtime_error("bad json");
1029 } catch (std::runtime_error
& e
) {
1030 ss
<< "error parsing offset: " << e
.what();
1034 f
->open_object_section("missing");
1036 f
->open_object_section("offset");
1037 offset
.dump(f
.get());
1040 f
->dump_int("num_missing", missing
.num_missing());
1041 f
->dump_int("num_unfound", get_num_unfound());
1042 const map
<hobject_t
, pg_missing_item
> &needs_recovery_map
=
1043 missing_loc
.get_needs_recovery();
1044 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1045 needs_recovery_map
.upper_bound(offset
);
1047 f
->open_array_section("objects");
1049 for (; p
!= needs_recovery_map
.end() && num
< cct
->_conf
->osd_command_max_records
; ++p
) {
1050 if (missing_loc
.is_unfound(p
->first
)) {
1051 f
->open_object_section("object");
1053 f
->open_object_section("oid");
1054 p
->first
.dump(f
.get());
1057 p
->second
.dump(f
.get()); // have, need keys
1059 f
->open_array_section("locations");
1060 for (set
<pg_shard_t
>::iterator r
=
1061 missing_loc
.get_locations(p
->first
).begin();
1062 r
!= missing_loc
.get_locations(p
->first
).end();
1064 f
->dump_stream("shard") << *r
;
1073 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1079 ss
<< "unknown pg command " << prefix
;
1083 // ==========================================================
1085 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1087 // NOTE: this is non-const because we modify the OSDOp.outdata in
1089 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1090 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1091 dout(10) << "do_pg_op " << *m
<< dendl
;
1096 string cname
, mname
;
1097 PGLSFilter
*filter
= NULL
;
1098 bufferlist filter_out
;
1100 snapid_t snapid
= m
->get_snapid();
1102 vector
<OSDOp
> ops
= m
->ops
;
1104 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1106 bufferlist::iterator bp
= p
->indata
.begin();
1108 case CEPH_OSD_OP_PGNLS_FILTER
:
1110 ::decode(cname
, bp
);
1111 ::decode(mname
, bp
);
1113 catch (const buffer::error
& e
) {
1114 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1122 result
= get_pgls_filter(bp
, &filter
);
1130 case CEPH_OSD_OP_PGNLS
:
1131 if (snapid
!= CEPH_NOSNAP
) {
1135 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1136 dout(10) << " pgnls pg=" << m
->get_pg()
1137 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1138 << " != " << info
.pgid
<< dendl
;
1141 unsigned list_size
= MIN(cct
->_conf
->osd_max_pgls
, p
->op
.pgls
.count
);
1143 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1144 // read into a buffer
1145 vector
<hobject_t
> sentries
;
1146 pg_nls_response_t response
;
1148 ::decode(response
.handle
, bp
);
1150 catch (const buffer::error
& e
) {
1151 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1157 hobject_t lower_bound
= response
.handle
;
1158 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1159 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1160 dout(10) << " pgnls lower_bound " << lower_bound
1161 << " pg_end " << pg_end
<< dendl
;
1162 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1163 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1164 // this should only happen with a buggy client.
1165 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1171 hobject_t current
= lower_bound
;
1173 int r
= pgbackend
->objects_list_partial(
1184 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1185 pg_log
.get_missing().get_items().lower_bound(current
);
1186 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1187 hobject_t _max
= hobject_t::get_max();
1189 const hobject_t
&mcand
=
1190 missing_iter
== pg_log
.get_missing().get_items().end() ?
1192 missing_iter
->first
;
1193 const hobject_t
&lcand
=
1194 ls_iter
== sentries
.end() ?
1198 hobject_t candidate
;
1199 if (mcand
== lcand
) {
1201 if (!mcand
.is_max()) {
1205 } else if (mcand
< lcand
) {
1207 assert(!mcand
.is_max());
1211 assert(!lcand
.is_max());
1215 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1216 << " vs lower bound 0x" << lower_bound
.get_hash() << dendl
;
1218 if (candidate
>= next
) {
1222 if (response
.entries
.size() == list_size
) {
1227 // skip snapdir objects
1228 if (candidate
.snap
== CEPH_SNAPDIR
)
1231 if (candidate
.snap
!= CEPH_NOSNAP
)
1234 // skip internal namespace
1235 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1238 // skip wrong namespace
1239 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1240 candidate
.get_namespace() != m
->get_hobj().nspace
)
1243 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1246 dout(20) << "pgnls item 0x" << std::hex
1247 << candidate
.get_hash()
1248 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1250 << candidate
.oid
.name
<< dendl
;
1252 librados::ListObjectImpl item
;
1253 item
.nspace
= candidate
.get_namespace();
1254 item
.oid
= candidate
.oid
.name
;
1255 item
.locator
= candidate
.get_key();
1256 response
.entries
.push_back(item
);
1259 if (next
.is_max() &&
1260 missing_iter
== pg_log
.get_missing().get_items().end() &&
1261 ls_iter
== sentries
.end()) {
1264 // Set response.handle to the start of the next PG according
1265 // to the object sort order.
1266 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1268 response
.handle
= next
;
1270 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1271 ::encode(response
, osd_op
.outdata
);
1273 ::encode(filter_out
, osd_op
.outdata
);
1274 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1275 << osd_op
.outdata
.length() << dendl
;
1279 case CEPH_OSD_OP_PGLS_FILTER
:
1281 ::decode(cname
, bp
);
1282 ::decode(mname
, bp
);
1284 catch (const buffer::error
& e
) {
1285 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1293 result
= get_pgls_filter(bp
, &filter
);
1301 case CEPH_OSD_OP_PGLS
:
1302 if (snapid
!= CEPH_NOSNAP
) {
1306 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1307 dout(10) << " pgls pg=" << m
->get_pg()
1308 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1309 << " != " << info
.pgid
<< dendl
;
1312 unsigned list_size
= MIN(cct
->_conf
->osd_max_pgls
, p
->op
.pgls
.count
);
1314 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1315 // read into a buffer
1316 vector
<hobject_t
> sentries
;
1317 pg_ls_response_t response
;
1319 ::decode(response
.handle
, bp
);
1321 catch (const buffer::error
& e
) {
1322 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1328 hobject_t current
= response
.handle
;
1330 int r
= pgbackend
->objects_list_partial(
1341 assert(snapid
== CEPH_NOSNAP
|| pg_log
.get_missing().get_items().empty());
1343 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1344 pg_log
.get_missing().get_items().lower_bound(current
);
1345 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1346 hobject_t _max
= hobject_t::get_max();
1348 const hobject_t
&mcand
=
1349 missing_iter
== pg_log
.get_missing().get_items().end() ?
1351 missing_iter
->first
;
1352 const hobject_t
&lcand
=
1353 ls_iter
== sentries
.end() ?
1357 hobject_t candidate
;
1358 if (mcand
== lcand
) {
1360 if (!mcand
.is_max()) {
1364 } else if (mcand
< lcand
) {
1366 assert(!mcand
.is_max());
1370 assert(!lcand
.is_max());
1374 if (candidate
>= next
) {
1378 if (response
.entries
.size() == list_size
) {
1383 // skip snapdir objects
1384 if (candidate
.snap
== CEPH_SNAPDIR
)
1387 if (candidate
.snap
!= CEPH_NOSNAP
)
1390 // skip wrong namespace
1391 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1394 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1397 response
.entries
.push_back(make_pair(candidate
.oid
,
1398 candidate
.get_key()));
1400 if (next
.is_max() &&
1401 missing_iter
== pg_log
.get_missing().get_items().end() &&
1402 ls_iter
== sentries
.end()) {
1405 response
.handle
= next
;
1406 ::encode(response
, osd_op
.outdata
);
1408 ::encode(filter_out
, osd_op
.outdata
);
1409 dout(10) << " pgls result=" << result
<< " outdata.length()="
1410 << osd_op
.outdata
.length() << dendl
;
1414 case CEPH_OSD_OP_PG_HITSET_LS
:
1416 list
< pair
<utime_t
,utime_t
> > ls
;
1417 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1418 p
!= info
.hit_set
.history
.end();
1420 ls
.push_back(make_pair(p
->begin
, p
->end
));
1422 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1423 ::encode(ls
, osd_op
.outdata
);
1427 case CEPH_OSD_OP_PG_HITSET_GET
:
1429 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1430 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1431 // read the current in-memory HitSet, not the version we've
1437 ::encode(*hit_set
, osd_op
.outdata
);
1438 result
= osd_op
.outdata
.length();
1440 // read an archived HitSet.
1442 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1443 p
!= info
.hit_set
.history
.end();
1445 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1446 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1450 if (oid
== hobject_t()) {
1454 if (!pool
.info
.is_replicated()) {
1455 // FIXME: EC not supported yet
1456 result
= -EOPNOTSUPP
;
1459 if (is_unreadable_object(oid
)) {
1460 wait_for_unreadable_object(oid
, op
);
1464 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1469 case CEPH_OSD_OP_SCRUBLS
:
1470 result
= do_scrub_ls(m
, &osd_op
);
1483 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(),
1484 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1486 reply
->claim_op_out_data(ops
);
1487 reply
->set_result(result
);
1488 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1489 osd
->send_message_osd_client(reply
, m
->get_connection());
1493 int PrimaryLogPG::do_scrub_ls(MOSDOp
*m
, OSDOp
*osd_op
)
1495 if (m
->get_pg() != info
.pgid
.pgid
) {
1496 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1497 return -EINVAL
; // hmm?
1499 auto bp
= osd_op
->indata
.begin();
1503 } catch (buffer::error
&) {
1504 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1508 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1509 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1511 } else if (!scrubber
.store
) {
1513 } else if (arg
.get_snapsets
) {
1514 result
.vals
= scrubber
.store
->get_snap_errors(osd
->store
,
1519 result
.vals
= scrubber
.store
->get_object_errors(osd
->store
,
1524 ::encode(result
, osd_op
->outdata
);
1528 void PrimaryLogPG::calc_trim_to()
1530 size_t target
= cct
->_conf
->osd_min_pg_log_entries
;
1531 if (is_degraded() ||
1532 state_test(PG_STATE_RECOVERING
|
1533 PG_STATE_RECOVERY_WAIT
|
1535 PG_STATE_BACKFILL_WAIT
|
1536 PG_STATE_BACKFILL_TOOFULL
)) {
1537 target
= cct
->_conf
->osd_max_pg_log_entries
;
1540 eversion_t limit
= MIN(
1541 min_last_complete_ondisk
,
1542 pg_log
.get_can_rollback_to());
1543 if (limit
!= eversion_t() &&
1544 limit
!= pg_trim_to
&&
1545 pg_log
.get_log().approx_size() > target
) {
1546 size_t num_to_trim
= pg_log
.get_log().approx_size() - target
;
1547 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
) {
1550 list
<pg_log_entry_t
>::const_iterator it
= pg_log
.get_log().log
.begin();
1551 eversion_t new_trim_to
;
1552 for (size_t i
= 0; i
< num_to_trim
; ++i
) {
1553 new_trim_to
= it
->version
;
1555 if (new_trim_to
> limit
) {
1556 new_trim_to
= limit
;
1557 dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl
;
1561 dout(10) << "calc_trim_to " << pg_trim_to
<< " -> " << new_trim_to
<< dendl
;
1562 pg_trim_to
= new_trim_to
;
1563 assert(pg_trim_to
<= pg_log
.get_head());
1564 assert(pg_trim_to
<= min_last_complete_ondisk
);
1568 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1569 const PGPool
&_pool
, spg_t p
) :
1570 PG(o
, curmap
, _pool
, p
),
1572 PGBackend::build_pg_backend(
1573 _pool
.info
, curmap
, this, coll_t(p
), ch
, o
->store
, cct
)),
1574 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1575 snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
1576 new_backfill(false),
1578 snap_trimmer_machine(this)
1580 missing_loc
.set_backend_predicates(
1581 pgbackend
->get_is_readable_predicate(),
1582 pgbackend
->get_is_recoverable_predicate());
1583 snap_trimmer_machine
.initiate();
1586 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1589 if (oloc
.key
.empty())
1590 src_oloc
.key
= oid
.name
;
1593 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1595 const MOSDBackoff
*m
= static_cast<const MOSDBackoff
*>(op
->get_req());
1596 SessionRef session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1599 session
->put(); // get_priv takes a ref, and so does the SessionRef
1600 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1601 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1602 if (begin
< m
->begin
) {
1608 dout(10) << __func__
<< " backoff ack id " << m
->id
1609 << " [" << begin
<< "," << end
<< ")" << dendl
;
1610 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1613 void PrimaryLogPG::do_request(
1615 ThreadPool::TPHandle
&handle
)
1617 if (op
->osd_trace
) {
1618 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1619 op
->pg_trace
.event("do request");
1621 // make sure we have a new enough map
1622 auto p
= waiting_for_map
.find(op
->get_source());
1623 if (p
!= waiting_for_map
.end()) {
1624 // preserve ordering
1625 dout(20) << __func__
<< " waiting_for_map "
1626 << p
->first
<< " not empty, queueing" << dendl
;
1627 p
->second
.push_back(op
);
1628 op
->mark_delayed("waiting_for_map not empty");
1631 if (!have_same_or_newer_map(op
->min_epoch
)) {
1632 dout(20) << __func__
<< " min " << op
->min_epoch
1633 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1634 waiting_for_map
[op
->get_source()].push_back(op
);
1635 op
->mark_delayed("op must wait for map");
1639 if (can_discard_request(op
)) {
1644 const Message
*m
= op
->get_req();
1645 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1646 SessionRef session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1649 session
->put(); // get_priv takes a ref, and so does the SessionRef
1651 if (op
->get_req()->get_type() == CEPH_MSG_OSD_OP
) {
1652 if (session
->check_backoff(cct
, info
.pgid
,
1653 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1660 (!is_active() && is_peered());
1661 if (g_conf
->osd_backoff_on_peering
&& !backoff
) {
1667 add_pg_backoff(session
);
1671 // pg backoff acks at pg-level
1672 if (op
->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF
) {
1673 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1674 if (ba
->begin
!= ba
->end
) {
1681 if (flushes_in_progress
> 0) {
1682 dout(20) << flushes_in_progress
1683 << " flushes_in_progress pending "
1684 << "waiting for active on " << op
<< dendl
;
1685 waiting_for_peered
.push_back(op
);
1686 op
->mark_delayed("waiting for peered");
1691 // Delay unless PGBackend says it's ok
1692 if (pgbackend
->can_handle_while_inactive(op
)) {
1693 bool handled
= pgbackend
->handle_message(op
);
1697 waiting_for_peered
.push_back(op
);
1698 op
->mark_delayed("waiting for peered");
1703 assert(is_peered() && flushes_in_progress
== 0);
1704 if (pgbackend
->handle_message(op
))
1707 switch (op
->get_req()->get_type()) {
1708 case CEPH_MSG_OSD_OP
:
1709 case CEPH_MSG_OSD_BACKOFF
:
1711 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1712 waiting_for_active
.push_back(op
);
1713 op
->mark_delayed("waiting for active");
1716 switch (op
->get_req()->get_type()) {
1717 case CEPH_MSG_OSD_OP
:
1718 // verify client features
1719 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1720 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1721 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1726 case CEPH_MSG_OSD_BACKOFF
:
1727 // object-level backoff acks handled in osdop context
1737 case MSG_OSD_SUBOPREPLY
:
1738 do_sub_op_reply(op
);
1741 case MSG_OSD_PG_SCAN
:
1742 do_scan(op
, handle
);
1745 case MSG_OSD_PG_BACKFILL
:
1749 case MSG_OSD_PG_BACKFILL_REMOVE
:
1750 do_backfill_remove(op
);
1753 case MSG_OSD_SCRUB_RESERVE
:
1755 const MOSDScrubReserve
*m
=
1756 static_cast<const MOSDScrubReserve
*>(op
->get_req());
1758 case MOSDScrubReserve::REQUEST
:
1759 handle_scrub_reserve_request(op
);
1761 case MOSDScrubReserve::GRANT
:
1762 handle_scrub_reserve_grant(op
, m
->from
);
1764 case MOSDScrubReserve::REJECT
:
1765 handle_scrub_reserve_reject(op
, m
->from
);
1767 case MOSDScrubReserve::RELEASE
:
1768 handle_scrub_reserve_release(op
);
1774 case MSG_OSD_REP_SCRUB
:
1775 replica_scrub(op
, handle
);
1778 case MSG_OSD_REP_SCRUBMAP
:
1779 do_replica_scrub_map(op
);
1782 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1783 do_update_log_missing(op
);
1786 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1787 do_update_log_missing_reply(op
);
1791 assert(0 == "bad message type in do_request");
1795 hobject_t
PrimaryLogPG::earliest_backfill() const
1797 hobject_t e
= hobject_t::get_max();
1798 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
1799 i
!= backfill_targets
.end();
1802 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(bt
);
1803 assert(iter
!= peer_info
.end());
1804 if (iter
->second
.last_backfill
< e
)
1805 e
= iter
->second
.last_backfill
;
1810 /** do_op - do an op
1811 * pg lock will be held (if multithreaded)
1812 * osd_lock NOT held.
1814 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1817 // NOTE: take a non-const pointer here; we must be careful not to
1818 // change anything that will break other reads on m (operator<<).
1819 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1820 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1821 if (m
->finish_decode()) {
1822 op
->reset_desc(); // for TrackedOp
1826 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1828 hobject_t head
= m
->get_hobj();
1829 head
.snap
= CEPH_NOSNAP
;
1831 if (!info
.pgid
.pgid
.contains(
1832 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1833 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1834 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1835 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1836 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1838 assert(!cct
->_conf
->osd_debug_misdirected_ops
);
1843 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
1846 session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1847 if (!session
.get()) {
1848 dout(10) << __func__
<< " no session" << dendl
;
1851 session
->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1853 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
1858 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
1860 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
1861 osd
->reply_op_error(op
, -EINVAL
);
1865 if (op
->rmw_flags
== 0) {
1866 int r
= osd
->osd
->init_op_flags(op
);
1868 osd
->reply_op_error(op
, r
);
1873 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
1874 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
1876 !(op
->may_write() || op
->may_cache())) {
1877 // balanced reads; any replica will do
1878 if (!(is_primary() || is_replica())) {
1879 osd
->handle_misdirected_op(this, op
);
1883 // normal case; must be primary
1884 if (!is_primary()) {
1885 osd
->handle_misdirected_op(this, op
);
1890 if (!op_has_sufficient_caps(op
)) {
1891 osd
->reply_op_error(op
, -EPERM
);
1895 if (op
->includes_pg_op()) {
1896 return do_pg_op(op
);
1899 // object name too long?
1900 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
1901 dout(4) << "do_op name is longer than "
1902 << cct
->_conf
->osd_max_object_name_len
1903 << " bytes" << dendl
;
1904 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1907 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
1908 dout(4) << "do_op locator is longer than "
1909 << cct
->_conf
->osd_max_object_name_len
1910 << " bytes" << dendl
;
1911 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1914 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
1915 dout(4) << "do_op namespace is longer than "
1916 << cct
->_conf
->osd_max_object_namespace_len
1917 << " bytes" << dendl
;
1918 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1922 if (int r
= osd
->store
->validate_hobject_key(head
)) {
1923 dout(4) << "do_op object " << head
<< " invalid for backing store: "
1925 osd
->reply_op_error(op
, r
);
1930 if (get_osdmap()->is_blacklisted(m
->get_source_addr())) {
1931 dout(10) << "do_op " << m
->get_source_addr() << " is blacklisted" << dendl
;
1932 osd
->reply_op_error(op
, -EBLACKLISTED
);
1936 // order this op as a write?
1937 bool write_ordered
= op
->rwordered();
1939 // discard due to cluster full transition? (we discard any op that
1940 // originates before the cluster or pool is marked full; the client
1941 // will resend after the full flag is removed or if they expect the
1942 // op to succeed despite being full). The except is FULL_FORCE and
1943 // FULL_TRY ops, which there is no reason to discard because they
1944 // bypass all full checks anyway. If this op isn't write or
1945 // read-ordered, we skip.
1946 // FIXME: we exclude mds writes for now.
1947 if (write_ordered
&& !(m
->get_source().is_mds() ||
1948 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
1949 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
1950 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
1951 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
1955 // mds should have stopped writing before this point.
1956 // We can't allow OSD to become non-startable even if mds
1957 // could be writing as part of file removals.
1959 if (write_ordered
&& osd
->check_failsafe_full(ss
)) {
1960 dout(10) << __func__
<< " fail-safe full check failed, dropping request"
1965 int64_t poolid
= get_pgid().pool();
1966 if (op
->may_write()) {
1968 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
1974 if (m
->get_snapid() != CEPH_NOSNAP
) {
1975 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
1976 osd
->reply_op_error(op
, -EINVAL
);
1981 if (cct
->_conf
->osd_max_write_size
&&
1982 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
1983 // journal can't hold commit!
1984 derr
<< "do_op msg data len " << m
->get_data_len()
1985 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
1986 << " on " << *m
<< dendl
;
1987 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
1992 dout(10) << "do_op " << *m
1993 << (op
->may_write() ? " may_write" : "")
1994 << (op
->may_read() ? " may_read" : "")
1995 << (op
->may_cache() ? " may_cache" : "")
1996 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
1997 << " flags " << ceph_osd_flag_string(m
->get_flags())
2001 if (is_unreadable_object(head
)) {
2002 if (!is_primary()) {
2003 osd
->reply_op_error(op
, -EAGAIN
);
2007 (g_conf
->osd_backoff_on_degraded
||
2008 (g_conf
->osd_backoff_on_unfound
&& missing_loc
.is_unfound(head
)))) {
2009 add_backoff(session
, head
, head
);
2010 maybe_kick_recovery(head
);
2012 wait_for_unreadable_object(head
, op
);
2018 if (write_ordered
&& is_degraded_or_backfilling_object(head
)) {
2019 if (can_backoff
&& g_conf
->osd_backoff_on_degraded
) {
2020 add_backoff(session
, head
, head
);
2022 wait_for_degraded_object(head
, op
);
2027 if (write_ordered
&&
2028 scrubber
.write_blocked_by_scrub(head
)) {
2029 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2030 waiting_for_scrub
.push_back(op
);
2031 op
->mark_delayed("waiting for scrub");
2036 map
<hobject_t
, snapid_t
>::iterator blocked_iter
=
2037 objects_blocked_on_degraded_snap
.find(head
);
2038 if (write_ordered
&& blocked_iter
!= objects_blocked_on_degraded_snap
.end()) {
2039 hobject_t
to_wait_on(head
);
2040 to_wait_on
.snap
= blocked_iter
->second
;
2041 wait_for_degraded_object(to_wait_on
, op
);
2044 map
<hobject_t
, ObjectContextRef
>::iterator blocked_snap_promote_iter
=
2045 objects_blocked_on_snap_promotion
.find(head
);
2046 if (write_ordered
&&
2047 blocked_snap_promote_iter
!= objects_blocked_on_snap_promotion
.end()) {
2048 wait_for_blocked_object(
2049 blocked_snap_promote_iter
->second
->obs
.oi
.soid
,
2053 if (write_ordered
&& objects_blocked_on_cache_full
.count(head
)) {
2054 block_write_on_full_cache(head
, op
);
2059 hobject_t snapdir
= head
.get_snapdir();
2061 if (is_unreadable_object(snapdir
)) {
2062 wait_for_unreadable_object(snapdir
, op
);
2067 if (write_ordered
&& is_degraded_or_backfilling_object(snapdir
)) {
2068 wait_for_degraded_object(snapdir
, op
);
2073 if (op
->may_write() || op
->may_cache()) {
2074 // warning: we will get back *a* request for this reqid, but not
2075 // necessarily the most recent. this happens with flush and
2076 // promote ops, but we can't possible have both in our log where
2077 // the original request is still not stable on disk, so for our
2078 // purposes here it doesn't matter which one we get.
2080 version_t user_version
;
2081 int return_code
= 0;
2082 bool got
= check_in_progress_op(
2083 m
->get_reqid(), &version
, &user_version
, &return_code
);
2085 dout(3) << __func__
<< " dup " << m
->get_reqid()
2086 << " version " << version
<< dendl
;
2087 if (already_complete(version
)) {
2088 osd
->reply_op_error(op
, return_code
, version
, user_version
);
2090 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2091 // always queue ondisk waiters, so that we can requeue if needed
2092 waiting_for_ondisk
[version
].push_back(make_pair(op
, user_version
));
2093 op
->mark_delayed("waiting for ondisk");
2099 ObjectContextRef obc
;
2100 bool can_create
= op
->may_write() || op
->may_cache();
2101 hobject_t missing_oid
;
2102 const hobject_t
& oid
= m
->get_hobj();
2104 // io blocked on obc?
2105 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2106 maybe_await_blocked_snapset(oid
, op
)) {
2110 int r
= find_object_context(
2111 oid
, &obc
, can_create
,
2112 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2116 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2117 // we have to wait for the object.
2119 // missing the specific snap we need; requeue and wait.
2120 assert(!op
->may_write()); // only happens on a read/cache
2121 wait_for_unreadable_object(missing_oid
, op
);
2124 } else if (r
== 0) {
2125 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2126 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2127 << " is unreadable, waiting" << dendl
;
2128 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2132 // degraded object? (the check above was for head; this could be a clone)
2133 if (write_ordered
&&
2134 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2135 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2136 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2137 << " is degraded, waiting" << dendl
;
2138 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2143 bool in_hit_set
= false;
2146 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2149 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2152 if (!op
->hitset_inserted
) {
2153 hit_set
->insert(oid
);
2154 op
->hitset_inserted
= true;
2155 if (hit_set
->is_full() ||
2156 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2163 if (agent_choose_mode(false, op
))
2167 if (obc
.get() && obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2168 if (maybe_handle_manifest(op
,
2174 if (maybe_handle_cache(op
,
2183 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2184 // copy the reqids for copy get on ENOENT
2186 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2187 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2190 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2191 if (op
->may_write() &&
2192 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2193 record_write_error(op
, oid
, nullptr, r
);
2195 osd
->reply_op_error(op
, r
);
2200 // make sure locator is consistent
2201 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2202 if (m
->get_object_locator() != oloc
) {
2203 dout(10) << " provided locator " << m
->get_object_locator()
2204 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2205 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2206 << " on object " << oloc
2210 // io blocked on obc?
2211 if (obc
->is_blocked() &&
2212 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2213 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2217 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2219 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2222 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2223 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
&&
2224 m
->get_snapid() != CEPH_SNAPDIR
) {
2225 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2226 osd
->reply_op_error(op
, -EINVAL
);
2231 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), m
->ops
, obc
, this);
2233 if (!obc
->obs
.exists
)
2234 ctx
->snapset_obc
= get_object_context(obc
->obs
.oi
.soid
.get_snapdir(), false);
2236 /* Due to obc caching, we might have a cached non-existent snapset_obc
2237 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2238 * do_op pipeline make decisions based on whether snapset_obc is
2241 if (ctx
->snapset_obc
&& !ctx
->snapset_obc
->obs
.exists
)
2242 ctx
->snapset_obc
= ObjectContextRef();
2244 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2245 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2246 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2247 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2249 // verify there is in fact a flush in progress
2250 // FIXME: we could make this a stronger test.
2251 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2252 if (p
== flush_ops
.end()) {
2253 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2254 reply_ctx(ctx
, -EINVAL
);
2257 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2258 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2259 op
->mark_delayed("waiting for rw locks");
2263 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2266 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2268 if (op
->may_write() &&
2269 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2270 record_write_error(op
, oid
, nullptr, r
);
2272 osd
->reply_op_error(op
, r
);
2277 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2278 ctx
->ignore_cache
= true;
2281 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2282 // This object is lost. Reading from it returns an error.
2283 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2284 << " is lost" << dendl
;
2285 reply_ctx(ctx
, -ENFILE
);
2288 if (!op
->may_write() &&
2290 (!obc
->obs
.exists
||
2291 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2292 obc
->obs
.oi
.is_whiteout()))) {
2293 // copy the reqids for copy get on ENOENT
2294 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2295 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2299 reply_ctx(ctx
, -ENOENT
);
2306 utime_t prepare_latency
= ceph_clock_now();
2307 prepare_latency
-= op
->get_dequeued_time();
2308 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2309 if (op
->may_read() && op
->may_write()) {
2310 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2311 } else if (op
->may_read()) {
2312 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2313 } else if (op
->may_write() || op
->may_cache()) {
2314 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2317 // force recovery of the oldest missing object if too many logs
2318 maybe_force_recovery();
2320 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2323 ObjectContextRef obc
)
2325 if (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2326 CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2327 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2328 return cache_result_t::NOOP
;
2332 dout(10) << __func__
<< " " << obc
->obs
.oi
<< " "
2333 << (obc
->obs
.exists
? "exists" : "DNE")
2336 // if it is write-ordered and blocked, stop now
2337 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2338 // we're already doing something with this object
2339 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2340 return cache_result_t::NOOP
;
2343 vector
<OSDOp
> ops
= static_cast<const MOSDOp
*>(op
->get_req())->ops
;
2344 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2346 ceph_osd_op
& op
= osd_op
.op
;
2347 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
) {
2348 return cache_result_t::NOOP
;
2352 switch (obc
->obs
.oi
.manifest
.type
) {
2353 case object_manifest_t::TYPE_REDIRECT
:
2354 if (op
->may_write() || write_ordered
) {
2355 do_proxy_write(op
, obc
->obs
.oi
.soid
, obc
);
2357 do_proxy_read(op
, obc
);
2359 return cache_result_t::HANDLED_PROXY
;
2360 case object_manifest_t::TYPE_CHUNKED
:
2362 assert(0 == "unrecognized manifest type");
2365 return cache_result_t::NOOP
;
2368 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2369 MOSDOpReply
*orig_reply
, int r
)
2371 dout(20) << __func__
<< " r=" << r
<< dendl
;
2372 assert(op
->may_write());
2373 const osd_reqid_t
&reqid
= static_cast<const MOSDOp
*>(op
->get_req())->get_reqid();
2374 ObjectContextRef obc
;
2375 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2376 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2377 get_next_version(), eversion_t(), 0,
2378 reqid
, utime_t(), r
));
2383 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2388 MOSDOpReply
*orig_reply
,
2391 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2394 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2395 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2396 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
2397 MOSDOpReply
*reply
= orig_reply
.detach();
2398 if (reply
== nullptr) {
2399 reply
= new MOSDOpReply(m
, r
, pg
->get_osdmap()->get_epoch(),
2402 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2403 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2407 ObcLockManager lock_manager
;
2410 std::move(lock_manager
),
2411 boost::optional
<std::function
<void(void)> >(
2412 OnComplete(this, op
, orig_reply
, r
)),
2417 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2420 ObjectContextRef obc
,
2421 int r
, hobject_t missing_oid
,
2424 ObjectContextRef
*promote_obc
)
2428 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2429 (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2430 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2431 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2432 return cache_result_t::NOOP
;
2434 // return quickly if caching is not enabled
2435 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2436 return cache_result_t::NOOP
;
2438 must_promote
= must_promote
|| op
->need_promote();
2441 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2442 << (obc
->obs
.exists
? "exists" : "DNE")
2443 << " missing_oid " << missing_oid
2444 << " must_promote " << (int)must_promote
2445 << " in_hit_set " << (int)in_hit_set
2448 dout(25) << __func__
<< " (no obc)"
2449 << " missing_oid " << missing_oid
2450 << " must_promote " << (int)must_promote
2451 << " in_hit_set " << (int)in_hit_set
2454 // if it is write-ordered and blocked, stop now
2455 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2456 // we're already doing something with this object
2457 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2458 return cache_result_t::NOOP
;
2461 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2462 // we know this object is logically absent (e.g., an undefined clone)
2463 return cache_result_t::NOOP
;
2466 if (obc
.get() && obc
->obs
.exists
) {
2467 osd
->logger
->inc(l_osd_op_cache_hit
);
2468 return cache_result_t::NOOP
;
2471 if (missing_oid
== hobject_t() && obc
.get()) {
2472 missing_oid
= obc
->obs
.oi
.soid
;
2475 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2476 const object_locator_t oloc
= m
->get_object_locator();
2478 if (op
->need_skip_handle_cache()) {
2479 return cache_result_t::NOOP
;
2482 // older versions do not proxy the feature bits.
2483 bool can_proxy_write
= get_osdmap()->get_up_osd_features() &
2484 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES
;
2485 OpRequestRef promote_op
;
2487 switch (pool
.info
.cache_mode
) {
2488 case pg_pool_t::CACHEMODE_WRITEBACK
:
2490 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2491 if (!op
->may_write() && !op
->may_cache() &&
2492 !write_ordered
&& !must_promote
) {
2493 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2495 return cache_result_t::HANDLED_PROXY
;
2497 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2498 block_write_on_full_cache(missing_oid
, op
);
2499 return cache_result_t::BLOCKED_FULL
;
2502 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2503 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2504 return cache_result_t::BLOCKED_PROMOTE
;
2507 if (op
->may_write() || op
->may_cache()) {
2508 if (can_proxy_write
) {
2509 do_proxy_write(op
, missing_oid
);
2511 // promote if can't proxy the write
2512 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2513 return cache_result_t::BLOCKED_PROMOTE
;
2517 if (!op
->need_skip_promote() &&
2518 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2519 pool
.info
.min_write_recency_for_promote
,
2522 return cache_result_t::BLOCKED_PROMOTE
;
2524 return cache_result_t::HANDLED_PROXY
;
2528 // Avoid duplicate promotion
2529 if (obc
.get() && obc
->is_blocked()) {
2532 return cache_result_t::BLOCKED_PROMOTE
;
2536 if (!op
->need_skip_promote()) {
2537 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2538 pool
.info
.min_read_recency_for_promote
,
2539 promote_op
, promote_obc
);
2542 return cache_result_t::HANDLED_PROXY
;
2544 assert(0 == "unreachable");
2545 return cache_result_t::NOOP
;
2547 case pg_pool_t::CACHEMODE_FORWARD
:
2548 // FIXME: this mode allows requests to be reordered.
2549 do_cache_redirect(op
);
2550 return cache_result_t::HANDLED_REDIRECT
;
2552 case pg_pool_t::CACHEMODE_READONLY
:
2553 // TODO: clean this case up
2554 if (!obc
.get() && r
== -ENOENT
) {
2555 // we don't have the object and op's a read
2556 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2557 return cache_result_t::BLOCKED_PROMOTE
;
2559 if (!r
) { // it must be a write
2560 do_cache_redirect(op
);
2561 return cache_result_t::HANDLED_REDIRECT
;
2563 // crap, there was a failure of some kind
2564 return cache_result_t::NOOP
;
2566 case pg_pool_t::CACHEMODE_READFORWARD
:
2567 // Do writeback to the cache tier for writes
2568 if (op
->may_write() || write_ordered
|| must_promote
) {
2570 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2571 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2572 block_write_on_full_cache(missing_oid
, op
);
2573 return cache_result_t::BLOCKED_FULL
;
2575 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2576 return cache_result_t::BLOCKED_PROMOTE
;
2579 // If it is a read, we can read, we need to forward it
2580 do_cache_redirect(op
);
2581 return cache_result_t::HANDLED_REDIRECT
;
2583 case pg_pool_t::CACHEMODE_PROXY
:
2584 if (!must_promote
) {
2585 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2586 if (can_proxy_write
) {
2587 do_proxy_write(op
, missing_oid
);
2588 return cache_result_t::HANDLED_PROXY
;
2592 return cache_result_t::HANDLED_PROXY
;
2595 // ugh, we're forced to promote.
2597 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2598 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2599 block_write_on_full_cache(missing_oid
, op
);
2600 return cache_result_t::BLOCKED_FULL
;
2602 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2603 return cache_result_t::BLOCKED_PROMOTE
;
2605 case pg_pool_t::CACHEMODE_READPROXY
:
2606 // Do writeback to the cache tier for writes
2607 if (op
->may_write() || write_ordered
|| must_promote
) {
2609 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2610 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2611 block_write_on_full_cache(missing_oid
, op
);
2612 return cache_result_t::BLOCKED_FULL
;
2614 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2615 return cache_result_t::BLOCKED_PROMOTE
;
2618 // If it is a read, we can read, we need to proxy it
2620 return cache_result_t::HANDLED_PROXY
;
2623 assert(0 == "unrecognized cache_mode");
2625 return cache_result_t::NOOP
;
// Decide whether an object should be promoted into the cache tier.
// Counts how many recent hit sets contain the object (starting from
// in_hit_set) and promotes via promote_object() once the count reaches
// `recency`; a throttled promote returns early. Returns false when not
// promoting.
// NOTE(review): this region of the file is corrupted — logical lines are
// split and some original lines (e.g. the in_hit_set/recency parameter
// lines) are missing. Code left byte-identical; restore from the
// canonical source before building.
2628 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2629 const hobject_t
& missing_oid
,
2630 const object_locator_t
& oloc
,
2633 OpRequestRef promote_op
,
2634 ObjectContextRef
*promote_obc
)
2636 dout(20) << __func__
<< " missing_oid " << missing_oid
2637 << " in_hit_set " << in_hit_set
<< dendl
;
2643 // Check if in the current hit set
// seed the recency count with the current-hit-set membership flag
2653 unsigned count
= (int)in_hit_set
;
2655 // Check if in other hit sets
// use the object's canonical oid when we have an obc, else the missing oid
2656 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
// walk archived hit sets newest-first, stopping at the first miss
2657 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2658 agent_state
->hit_set_map
.rbegin();
2659 itor
!= agent_state
->hit_set_map
.rend();
2661 if (!itor
->second
->contains(oid
)) {
2665 if (count
>= recency
) {
2670 if (count
>= recency
) {
2673 return false; // not promoting
// promote_throttle() limits promotion rate; presumably returns true to skip
2678 if (osd
->promote_throttle()) {
2679 dout(10) << __func__
<< " promote throttled" << dendl
;
2682 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
// Reply to a client op with -ENOENT plus a redirect to the base pool
// (pool.info.tier_of), preserving only the ACK/ONDISK flags from the
// original request. Used by FORWARD/READONLY cache modes.
// NOTE(review): lines here are split/missing due to extraction damage
// (e.g. original line 2695); code left byte-identical.
2686 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2688 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2689 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
// -ENOENT reply carrying the redirect; 'false' presumably suppresses op decoding
2690 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
,
2691 get_osdmap()->get_epoch(), flags
, false);
2692 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2693 reply
->set_redirect(redir
);
2694 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2696 m
->get_connection()->send_message(reply
);
// Completion context for a proxied read issued via the Objecter.
// On finish: bails if the proxy op was canceled, and only under the PG
// lock path calls finish_proxy_read() when no peering reset intervened,
// then records tier read latency.
// NOTE(review): member declarations (pg/oid/tid/start) are missing from
// this corrupted extraction; code left byte-identical.
2700 struct C_ProxyRead
: public Context
{
2703 epoch_t last_peering_reset
;
2705 PrimaryLogPG::ProxyReadOpRef prdop
;
2707 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2708 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2709 : pg(p
), oid(o
), last_peering_reset(lpr
),
2710 tid(0), prdop(prd
), start(ceph_clock_now())
2712 void finish(int r
) override
{
// canceled is checked twice — presumably once before and once after
// taking the PG lock; confirm against the canonical source
2713 if (prdop
->canceled
)
2716 if (prdop
->canceled
) {
2720 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2721 pg
->finish_proxy_read(oid
, tid
, r
);
2722 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
// Proxy a read to the base tier without promoting. Resolves the target
// (following a manifest redirect if present), copies the client's ops,
// optionally rewrites fadvise flags for cache-friendly reads, then issues
// an Objecter read whose completion is routed to finish_proxy_read().
// Bookkeeping: proxyread_ops[tid] and in_progress_proxy_ops[soid].
// NOTE(review): corrupted extraction — switch header/breaks and some
// lines are missing; code left byte-identical.
2728 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
2730 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2731 // stash the result in the request's OSDOp vector
2732 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2733 object_locator_t oloc
;
2735 /* extensible tier */
// a manifest-bearing head redirects the proxy target to redirect_target
2736 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2737 switch (obc
->obs
.oi
.manifest
.type
) {
2738 case object_manifest_t::TYPE_REDIRECT
:
2739 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2740 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2742 case object_manifest_t::TYPE_CHUNKED
:
2744 assert(0 == "unrecognized manifest type");
// non-manifest path: proxy the request's own object into the base tier pool
2748 soid
= m
->get_hobj();
2749 oloc
= object_locator_t(m
->get_object_locator());
2750 oloc
.pool
= pool
.info
.tier_of
;
2752 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
2754 // pass through some original flags that make sense.
2755 //  - leave out redirection and balancing flags since we are
2756 //    already proxying through the primary
2757 //  - leave off read/write/exec flags that are derived from the op
2758 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
2759 CEPH_OSD_FLAG_ORDERSNAP
|
2760 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
2761 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
2763 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
2765 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
2767 ObjectOperation obj_op
;
2768 obj_op
.dup(prdop
->ops
);
// in writeback mode (and not evict-full), mark reads SEQUENTIAL and strip
// DONTNEED/NOCACHE so the base tier keeps data warm for a later promote
2770 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
2771 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
2772 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
2773 ceph_osd_op op
= obj_op
.ops
[i
].op
;
2775 case CEPH_OSD_OP_READ
:
2776 case CEPH_OSD_OP_SYNC_READ
:
2777 case CEPH_OSD_OP_SPARSE_READ
:
2778 case CEPH_OSD_OP_CHECKSUM
:
2779 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
2780 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
2785 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
// completion runs on the objecter finisher, then back into this PG
2787 ceph_tid_t tid
= osd
->objecter
->read(
2788 soid
.oid
, oloc
, obj_op
,
2789 m
->get_snapid(), NULL
,
2790 flags
, new C_OnFinisher(fin
, &osd
->objecter_finisher
),
2791 &prdop
->user_version
,
2792 &prdop
->data_offset
,
2795 prdop
->objecter_tid
= tid
;
2796 proxyread_ops
[tid
] = prdop
;
2797 in_progress_proxy_ops
[soid
].push_back(op
);
// Completion path for a proxied read: validates the tid/oid against the
// recorded ProxyReadOp, removes the bookkeeping entries, bumps the
// tier-proxy-read counter, then builds an OpContext carrying the proxied
// results and completes the read back to the client.
// NOTE(review): corrupted extraction — several lines (returns, std::find
// arguments) are missing; code left byte-identical.
2800 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
2802 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2803 << " " << cpp_strerror(r
) << dendl
;
// stale-completion guards: op may have been canceled/replaced
2805 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
2806 if (p
== proxyread_ops
.end()) {
2807 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
2810 ProxyReadOpRef prdop
= p
->second
;
2811 if (tid
!= prdop
->objecter_tid
) {
2812 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
2813 << " tid " << prdop
->objecter_tid
<< dendl
;
2816 if (oid
!= prdop
->soid
) {
2817 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
2818 << " soid " << prdop
->soid
<< dendl
;
2821 proxyread_ops
.erase(tid
);
// drop this request from the per-object in-progress list
2823 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
2824 if (q
== in_progress_proxy_ops
.end()) {
2825 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
2828 assert(q
->second
.size());
2829 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
2832 assert(it
!= q
->second
.end());
2833 OpRequestRef op
= *it
;
2834 q
->second
.erase(it
);
2835 if (q
->second
.size() == 0) {
2836 in_progress_proxy_ops
.erase(oid
);
2839 osd
->logger
->inc(l_osd_tier_proxy_read
);
// hand the proxied results back to the client via a fresh OpContext;
// ignore_log_op_stats: stats were accounted by the base tier, presumably
2841 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2842 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), prdop
->ops
, this);
2843 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, false);
2844 ctx
->user_at_version
= prdop
->user_version
;
2845 ctx
->data_off
= prdop
->data_offset
;
2846 ctx
->ignore_log_op_stats
= true;
2847 complete_read_ctx(r
, ctx
);
// Requeue every client op that was waiting behind an in-flight proxy op
// on `soid`, then drop the per-object entry.
// NOTE(review): corrupted extraction — the line that actually requeues
// the ops (original line 2858) is missing from this chunk; code left
// byte-identical. Restore from the canonical source.
2850 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
2852 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
2853 if (p
== in_progress_proxy_ops
.end())
2856 list
<OpRequestRef
>& ls
= p
->second
;
2857 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
2859 in_progress_proxy_ops
.erase(p
);
2862 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
)
2864 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
2865 prdop
->canceled
= true;
2867 // cancel objecter op, if we can
2868 if (prdop
->objecter_tid
) {
2869 osd
->objecter
->op_cancel(prdop
->objecter_tid
, -ECANCELED
);
2870 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
2871 prdop
->ops
[i
].outdata
.clear();
2873 proxyread_ops
.erase(prdop
->objecter_tid
);
2874 prdop
->objecter_tid
= 0;
// Cancel every outstanding proxy read and write, and (per `requeue`)
// requeue or drop the client ops that were tracked per object, finally
// clearing in_progress_proxy_ops.
// NOTE(review): corrupted extraction — the requeue call (original line
// 2901) is missing from this chunk; code left byte-identical.
2878 void PrimaryLogPG::cancel_proxy_ops(bool requeue
)
2880 dout(10) << __func__
<< dendl
;
2882 // cancel proxy reads
// post-increment inside the call keeps the iterator valid while
// cancel_proxy_read() erases the current entry
2883 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
2884 while (p
!= proxyread_ops
.end()) {
2885 cancel_proxy_read((p
++)->second
);
2888 // cancel proxy writes
2889 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
2890 while (q
!= proxywrite_ops
.end()) {
2891 cancel_proxy_write((q
++)->second
);
2895 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
2896 in_progress_proxy_ops
.begin();
2897 while (p
!= in_progress_proxy_ops
.end()) {
2898 list
<OpRequestRef
>& ls
= p
->second
;
2899 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
2900 << " requests" << dendl
;
2902 in_progress_proxy_ops
.erase(p
++);
2905 in_progress_proxy_ops
.clear();
// Completion context for a proxied write's commit. Ignores canceled ops
// and calls finish_proxy_write() only if no peering reset happened since
// the write was issued.
// NOTE(review): member declarations and part of the ctor initializer are
// missing from this corrupted extraction; code left byte-identical.
2909 struct C_ProxyWrite_Commit
: public Context
{
2912 epoch_t last_peering_reset
;
2914 PrimaryLogPG::ProxyWriteOpRef pwop
;
2915 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2916 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
2917 : pg(p
), oid(o
), last_peering_reset(lpr
),
2920 void finish(int r
) override
{
2924 if (pwop
->canceled
) {
2928 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2929 pg
->finish_proxy_write(oid
, tid
, r
);
// Proxy a write to the base tier without promoting. Resolves the target
// (manifest redirect aware), builds a ProxyWriteOp with its own OpContext,
// duplicates the client's ops, and issues an Objecter mutate whose commit
// is handled by C_ProxyWrite_Commit/finish_proxy_write().
// Bookkeeping: proxywrite_ops[tid] and in_progress_proxy_ops[soid].
// NOTE(review): corrupted extraction — break/closing lines are missing;
// code left byte-identical.
2935 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, const hobject_t
& missing_oid
, ObjectContextRef obc
)
2937 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2938 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2939 object_locator_t oloc
;
// snap context comes from the client request
2940 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
2942 /* extensible tier */
2943 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2944 switch (obc
->obs
.oi
.manifest
.type
) {
2945 case object_manifest_t::TYPE_REDIRECT
:
2946 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2947 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2949 case object_manifest_t::TYPE_CHUNKED
:
2951 assert(0 == "unrecognized manifest type");
2955 soid
= m
->get_hobj();
2956 oloc
= object_locator_t(m
->get_object_locator());
2957 oloc
.pool
= pool
.info
.tier_of
;
2960 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
// a proxied non-write (e.g. cache op) still needs rw ordering downstream
2961 if (!(op
->may_write() || op
->may_cache())) {
2962 flags
|= CEPH_OSD_FLAG_RWORDERED
;
2964 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
2966 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
2967 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), pwop
->ops
, this);
2968 pwop
->mtime
= m
->get_mtime();
2970 ObjectOperation obj_op
;
2971 obj_op
.dup(pwop
->ops
);
2973 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
2974 this, soid
, get_last_peering_reset(), pwop
);
// forward the client reqid so the base tier can do dup-op detection
2975 ceph_tid_t tid
= osd
->objecter
->mutate(
2976 soid
.oid
, oloc
, obj_op
, snapc
,
2977 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
2978 flags
, new C_OnFinisher(fin
, &osd
->objecter_finisher
),
2979 &pwop
->user_version
, pwop
->reqid
);
2981 pwop
->objecter_tid
= tid
;
2982 proxywrite_ops
[tid
] = pwop
;
2983 in_progress_proxy_ops
[soid
].push_back(op
);
// Commit path for a proxied write: validates tid/oid, removes bookkeeping,
// bumps the tier-proxy-write counter, and (if not already sent) sends the
// client an ONDISK reply carrying the base tier's user_version.
// NOTE(review): corrupted extraction — several lines are missing; code
// left byte-identical.
2986 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
2988 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2989 << " " << cpp_strerror(r
) << dendl
;
2991 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
2992 if (p
== proxywrite_ops
.end()) {
2993 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
2996 ProxyWriteOpRef pwop
= p
->second
;
// unlike the read path these are hard asserts, not soft mismatch logs
2997 assert(tid
== pwop
->objecter_tid
);
2998 assert(oid
== pwop
->soid
);
3000 proxywrite_ops
.erase(tid
);
3002 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3003 if (q
== in_progress_proxy_ops
.end()) {
3004 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3009 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3010 assert(in_progress_op
.size());
3011 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3012 in_progress_op
.end(),
3014 assert(it
!= in_progress_op
.end());
3015 in_progress_op
.erase(it
);
3016 if (in_progress_op
.size() == 0) {
3017 in_progress_proxy_ops
.erase(oid
);
3020 osd
->logger
->inc(l_osd_tier_proxy_write
);
3022 const MOSDOp
*m
= static_cast<const MOSDOp
*>(pwop
->op
->get_req());
// send the commit reply to the client exactly once
3025 if (!pwop
->sent_reply
) {
3027 MOSDOpReply
*reply
= pwop
->ctx
->reply
;
3029 pwop
->ctx
->reply
= NULL
;
3031 reply
= new MOSDOpReply(m
, r
, get_osdmap()->get_epoch(), 0, true);
3032 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3034 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3035 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3036 osd
->send_message_osd_client(reply
, m
->get_connection());
3037 pwop
->sent_reply
= true;
3038 pwop
->ctx
->op
->mark_commit_sent();
// Cancel an in-flight proxied write: mark the ProxyWriteOp canceled so a
// late commit is ignored, try to cancel the Objecter op, and drop the tid
// bookkeeping.
// NOTE(review): corrupted extraction — original lines 3053-3054 (likely
// the ctx cleanup) are missing; code left byte-identical. Restore from
// the canonical source.
3045 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
)
3047 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3048 pwop
->canceled
= true;
3050 // cancel objecter op, if we can
3051 if (pwop
->objecter_tid
) {
3052 osd
->objecter
->op_cancel(pwop
->objecter_tid
, -ECANCELED
);
3055 proxywrite_ops
.erase(pwop
->objecter_tid
);
3056 pwop
->objecter_tid
= 0;
// Copy-completion callback for a promotion: unpacks the copy result and
// hands it to finish_promote(), then records promote latency.
// NOTE(review): member declarations and ctor initializer lines are
// missing from this corrupted extraction; code left byte-identical.
3060 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3061 ObjectContextRef obc
;
3065 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3068 start(ceph_clock_now()) {}
3070 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
// results is a tuple of (return code, CopyResults*)
3071 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3072 int r
= results
.get
<0>();
3073 pg
->finish_promote(r
, results_data
, obc
);
3074 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
// Kick off promotion of an object from the base tier into this cache
// tier: defers if a scrub blocks writes on the object, creates an obc if
// needed, then starts an async copy (start_copy) with a PromoteCallback;
// the triggering op waits on the now-blocked object.
// NOTE(review): corrupted extraction — the `op` parameter line and
// several body lines are missing; code left byte-identical.
3078 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
3079 const hobject_t
& missing_oid
,
3080 const object_locator_t
& oloc
,
3082 ObjectContextRef
*promote_obc
)
3084 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
3085 assert(hoid
!= hobject_t());
// a scrub in progress on this object forces the op to wait (or be dropped
// when there is no op)
3086 if (scrubber
.write_blocked_by_scrub(hoid
)) {
3087 dout(10) << __func__
<< " " << hoid
3088 << " blocked by scrub" << dendl
;
3090 waiting_for_scrub
.push_back(op
);
3091 op
->mark_delayed("waiting for scrub");
3092 dout(10) << __func__
<< " " << hoid
3093 << " placing op in waiting_for_scrub" << dendl
;
3095 dout(10) << __func__
<< " " << hoid
3096 << " no op, dropping on the floor" << dendl
;
3100 if (!obc
) { // we need to create an ObjectContext
3101 assert(missing_oid
!= hobject_t());
3102 obc
= get_object_context(missing_oid
, true);
3108 * Before promote complete, if there are proxy-reads for the object,
3109 * for this case we don't use DONTNEED.
3111 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
3112 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
3113 if (q
== in_progress_proxy_ops
.end()) {
3114 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
3117 PromoteCallback
*cb
= new PromoteCallback(obc
, this);
// copy source is the base tier pool for this cache tier
3118 object_locator_t my_oloc
= oloc
;
3119 my_oloc
.pool
= pool
.info
.tier_of
;
3121 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
3122 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
3123 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
3124 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
// head objects (CEPH_NOSNAP) get the mirror-snapset behavior
3125 start_copy(cb
, obc
, obc
->obs
.oi
.soid
, my_oloc
, 0, flags
,
3126 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
3127 src_fadvise_flags
, 0);
3129 assert(obc
->is_blocked());
3132 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
3133 info
.stats
.stats
.sum
.num_promote
++;
// Main execution path for a client op on this PG. Idempotent by design
// (may be re-run, e.g. by finish_copyfrom): resets the obs/op_t, resolves
// the snap context for writes, prepares the transaction, then either
// completes a read, records a log-only error, or issues replica writes
// via new_repop()/issue_repop().
// NOTE(review): this block is heavily corrupted — many original lines
// (guards, lambda bodies at register_on_commit/success/finish, closing
// braces) are missing. Code left byte-identical; annotate only.
3136 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
3139 dout(10) << __func__
<< " " << ctx
<< dendl
;
3140 ctx
->reset_obs(ctx
->obc
);
3141 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
3142 OpRequestRef op
= ctx
->op
;
3143 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3144 ObjectContextRef obc
= ctx
->obc
;
3145 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
3147 // this method must be idempotent since we may call it several times
3148 // before we finally apply the resulting transaction.
3149 ctx
->op_t
.reset(new PGTransaction
);
// -------- snap context selection for writes --------
3151 if (op
->may_write() || op
->may_cache()) {
3153 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
3154 pool
.info
.is_pool_snaps_mode()) {
3156 ctx
->snapc
= pool
.snapc
;
3158 // client specified snapc
3159 ctx
->snapc
.seq
= m
->get_snap_seq();
3160 ctx
->snapc
.snaps
= m
->get_snaps();
3161 filter_snapc(ctx
->snapc
.snaps
);
// ORDERSNAP: reject writes whose snapc is older than the object's snapset
3163 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
3164 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
3165 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
3166 << " < snapset seq " << obc
->ssc
->snapset
.seq
3167 << " on " << obc
->obs
.oi
.soid
<< dendl
;
3168 reply_ctx(ctx
, -EOLDSNAPC
);
3173 ctx
->at_version
= get_next_version();
3174 ctx
->mtime
= m
->get_mtime();
3176 dout(10) << __func__
<< " " << soid
<< " " << ctx
->ops
3177 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
3178 << " snapc " << ctx
->snapc
3179 << " snapset " << obc
->ssc
->snapset
3182 dout(10) << __func__
<< " " << soid
<< " " << ctx
->ops
3183 << " ov " << obc
->obs
.oi
.version
3187 if (!ctx
->user_at_version
)
3188 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
3189 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
// -------- prepare the transaction (read lock held across prepare) --------
3191 if (op
->may_read()) {
3192 dout(10) << " taking ondisk_read_lock" << dendl
;
3193 obc
->ondisk_read_lock();
3198 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3200 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
3201 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3204 int result
= prepare_transaction(ctx
);
3208 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3210 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
3211 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3214 if (op
->may_read()) {
3215 dout(10) << " dropping ondisk_read_lock" << dendl
;
3216 obc
->ondisk_read_unlock();
// -EINPROGRESS / -EAGAIN: op is parked or retried elsewhere
3219 if (result
== -EINPROGRESS
) {
3224 if (result
== -EAGAIN
) {
3225 // clean up after the ctx
3230 bool successful_write
= !ctx
->op_t
->empty() && op
->may_write() && result
>= 0;
3231 // prepare the reply
3232 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0,
3235 // Write operations aren't allowed to return a data payload because
3236 // we can't do so reliably. If the client has to resend the request
3237 // and it has already been applied, we will return 0 with no
3238 // payload.  Non-deterministic behavior is no good.  However, it is
3239 // possible to construct an operation that does a read, does a guard
3240 // check (e.g., CMPXATTR), and then a write.  Then we either succeed
3241 // with the write, or return a CMPXATTR and the read value.
3242 if (successful_write
) {
3243 // write.  normalize the result code.
3244 dout(20) << " zeroing write result code " << result
<< dendl
;
3247 ctx
->reply
->set_result(result
);
// -------- read-or-error fast path (no transaction to submit) --------
3250 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
3251 // finish side-effects
3253 do_osd_op_effects(ctx
, m
->get_connection());
3255 if (ctx
->pending_async_reads
.empty()) {
3256 complete_read_ctx(result
, ctx
);
3258 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
3259 ctx
->start_async_reads(this);
3265 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
3267 assert(op
->may_write() || op
->may_cache());
3272 // verify that we are doing this in order?
3273 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
3274 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
3275 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
3276 ceph_tid_t t
= m
->get_tid();
3277 client_t n
= m
->get_source().num();
3278 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
3279 if (p
== cm
.end()) {
3280 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
3283 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
3284 if (p
->second
> t
) {
3285 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
3286 assert(0 == "out of order op");
// -------- log-only update path (e.g. error to record, no data change) ---
3292 if (ctx
->update_log_only
) {
3294 do_osd_op_effects(ctx
, m
->get_connection());
3296 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
3297 // save just what we need from ctx
3298 MOSDOpReply
*reply
= ctx
->reply
;
3299 ctx
->reply
= nullptr;
3300 reply
->claim_op_out_data(ctx
->ops
);
3301 reply
->get_header().data_off
= ctx
->data_off
;
3304 if (result
== -ENOENT
) {
3305 reply
->set_enoent_reply_versions(info
.last_update
,
3306 info
.last_user_version
);
3308 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3309 // append to pg log for dup detection - don't save buffers for now
3310 record_write_error(op
, soid
, reply
, result
);
3314 // no need to capture PG ref, repop cancel will handle that
3315 // Can capture the ctx by pointer, it's owned by the repop
3316 ctx
->register_on_commit(
3322 if (m
&& !ctx
->sent_reply
) {
3323 MOSDOpReply
*reply
= ctx
->reply
;
3325 ctx
->reply
= nullptr;
3327 reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, true);
3328 reply
->set_reply_versions(ctx
->at_version
,
3329 ctx
->user_at_version
);
3331 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3332 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
3333 osd
->send_message_osd_client(reply
, m
->get_connection());
3334 ctx
->sent_reply
= true;
3335 ctx
->op
->mark_commit_sent();
3338 ctx
->register_on_success(
3342 ctx
->op
? ctx
->op
->get_req()->get_connection() :
3345 ctx
->register_on_finish(
// -------- normal write path: replicate via repop --------
3350 // issue replica writes
3351 ceph_tid_t rep_tid
= osd
->get_tid();
3353 RepGather
*repop
= new_repop(ctx
, obc
, rep_tid
);
3355 issue_repop(repop
, ctx
);
// Reply to the ctx's op with error `r` and tear down the ctx.
// NOTE(review): corrupted extraction — the ctx-cleanup line(s) between
// these two are missing; code left byte-identical.
3360 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
3363 osd
->reply_op_error(ctx
->op
, r
);
// Overload: reply with error `r` plus explicit reply versions (v, uv).
// NOTE(review): corrupted extraction — the ctx-cleanup line(s) between
// these two are missing; code left byte-identical.
3367 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
, eversion_t v
, version_t uv
)
3370 osd
->reply_op_error(ctx
->op
, r
, v
, uv
);
// Account perf counters for a completed client op: overall op count,
// bytes in/out, end-to-end latency (since receive) and processing latency
// (since dequeue), broken down into rw / read-only / write classes.
// NOTE(review): corrupted extraction — a few dout continuation lines are
// missing; code left byte-identical.
3374 void PrimaryLogPG::log_op_stats(OpContext
*ctx
)
3376 OpRequestRef op
= ctx
->op
;
3377 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3379 utime_t now
= ceph_clock_now();
// latency = now - receive stamp; process_latency = now - dequeue time
3380 utime_t latency
= now
;
3381 latency
-= ctx
->op
->get_req()->get_recv_stamp();
3382 utime_t process_latency
= now
;
3383 process_latency
-= ctx
->op
->get_dequeued_time();
3385 uint64_t inb
= ctx
->bytes_written
;
3386 uint64_t outb
= ctx
->bytes_read
;
3388 osd
->logger
->inc(l_osd_op
);
3390 osd
->logger
->inc(l_osd_op_outb
, outb
);
3391 osd
->logger
->inc(l_osd_op_inb
, inb
);
3392 osd
->logger
->tinc(l_osd_op_lat
, latency
);
3393 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
// classify: read+write, pure read, or write/cache
3395 if (op
->may_read() && op
->may_write()) {
3396 osd
->logger
->inc(l_osd_op_rw
);
3397 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
3398 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
3399 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
3400 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
3401 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
3402 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
3403 } else if (op
->may_read()) {
3404 osd
->logger
->inc(l_osd_op_r
);
3405 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
3406 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
3407 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
3408 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
3409 } else if (op
->may_write() || op
->may_cache()) {
3410 osd
->logger
->inc(l_osd_op_w
);
3411 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
3412 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
3413 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
3414 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
3418 dout(15) << "log_op_stats " << *m
3421 << " lat " << latency
<< dendl
;
// Handle a legacy MOSDSubOp: queue until peered if needed, then dispatch
// on the first OSDOp's opcode (scrub reserve/unreserve/map, delete).
// NOTE(review): corrupted extraction — the is_peered() guard and the
// `first` assignment lines are missing; code left byte-identical.
3424 void PrimaryLogPG::do_sub_op(OpRequestRef op
)
3426 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
3427 assert(have_same_or_newer_map(m
->map_epoch
));
3428 assert(m
->get_type() == MSG_OSD_SUBOP
);
3429 dout(15) << "do_sub_op " << *op
->get_req() << dendl
;
// not peered yet: park the op until the PG becomes active
3432 waiting_for_peered
.push_back(op
);
3433 op
->mark_delayed("waiting for active");
3437 const OSDOp
*first
= NULL
;
3438 if (m
->ops
.size() >= 1) {
3443 switch (first
->op
.op
) {
3444 case CEPH_OSD_OP_DELETE
:
3447 case CEPH_OSD_OP_SCRUB_RESERVE
:
3448 handle_scrub_reserve_request(op
);
3450 case CEPH_OSD_OP_SCRUB_UNRESERVE
:
3451 handle_scrub_reserve_release(op
);
3453 case CEPH_OSD_OP_SCRUB_MAP
:
3454 sub_op_scrub_map(op
);
// Handle a legacy MOSDSubOpReply: for SCRUB_RESERVE replies, decode the
// grant/reject flag from the payload and route to the matching handler.
// NOTE(review): corrupted extraction — the `bool reserved;` declaration
// and if/else lines around the handlers are missing; code left
// byte-identical.
3460 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op
)
3462 const MOSDSubOpReply
*r
= static_cast<const MOSDSubOpReply
*>(op
->get_req());
3463 assert(r
->get_type() == MSG_OSD_SUBOPREPLY
);
3464 if (r
->ops
.size() >= 1) {
3465 const OSDOp
& first
= r
->ops
[0];
3466 switch (first
.op
.op
) {
3467 case CEPH_OSD_OP_SCRUB_RESERVE
:
3469 pg_shard_t from
= r
->from
;
// const_cast: bufferlist iterators need a non-const list to walk
3470 bufferlist::iterator p
= const_cast<bufferlist
&>(r
->get_data()).begin();
3472 ::decode(reserved
, p
);
3474 handle_scrub_reserve_grant(op
, from
);
3476 handle_scrub_reserve_reject(op
, from
);
// Handle MOSDPGScan during backfill. GET_DIGEST: abort backfill if the
// target would be too full, otherwise scan a range of objects and reply
// with an OP_SCAN_DIGEST carrying the interval. SCAN_DIGEST: record the
// peer's interval and, once all backfill targets have answered, finish
// the recovery op.
// NOTE(review): corrupted extraction — scan_range() call line, loop
// closers and several guards are missing; code left byte-identical.
3484 void PrimaryLogPG::do_scan(
3486 ThreadPool::TPHandle
&handle
)
3488 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
3489 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
3490 dout(10) << "do_scan " << *m
<< dendl
;
3495 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
// refuse to continue backfill if this OSD is (nearly) full
3498 if (osd
->check_backfill_full(ss
)) {
3499 dout(1) << __func__
<< ": Canceling backfill, " << ss
.str() << dendl
;
3500 queue_peering_event(
3502 std::make_shared
<CephPeeringEvt
>(
3503 get_osdmap()->get_epoch(),
3504 get_osdmap()->get_epoch(),
3505 BackfillTooFull())));
3509 BackfillInterval bi
;
3510 bi
.begin
= m
->begin
;
3511 // No need to flush, there won't be any in progress writes occuring
// scan bounds come from osd_backfill_scan_min/max config
3514 cct
->_conf
->osd_backfill_scan_min
,
3515 cct
->_conf
->osd_backfill_scan_max
,
3518 MOSDPGScan
*reply
= new MOSDPGScan(
3519 MOSDPGScan::OP_SCAN_DIGEST
,
3521 get_osdmap()->get_epoch(), m
->query_epoch
,
3522 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
3523 ::encode(bi
.objects
, reply
->get_data());
3524 osd
->send_message_osd_cluster(reply
, m
->get_connection());
3528 case MOSDPGScan::OP_SCAN_DIGEST
:
3530 pg_shard_t from
= m
->from
;
3532 // Check that from is in backfill_targets vector
3533 assert(is_backfill_targets(from
));
3535 BackfillInterval
& bi
= peer_backfill_info
[from
];
3536 bi
.begin
= m
->begin
;
3538 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3540 // take care to preserve ordering!
3542 ::decode_noclear(bi
.objects
, p
);
3544 if (waiting_on_backfill
.erase(from
)) {
3545 if (waiting_on_backfill
.empty()) {
3546 assert(peer_backfill_info
.size() == backfill_targets
.size());
3547 finish_recovery_op(hobject_t::get_max());
3550 // we canceled backfill for a while due to a too full, and this
3551 // is an extra response from a non-too-full peer
// Handle MOSDPGBackfill on a backfill target / primary. FINISH: ack back
// to the primary and queue a peering event. PROGRESS: persist the updated
// last_backfill and stats. FINISH_ACK (primary): complete the recovery
// op.
// NOTE(review): corrupted extraction — the peering-event payload for
// FINISH and the write_info/transaction lines are missing; code left
// byte-identical.
3558 void PrimaryLogPG::do_backfill(OpRequestRef op
)
3560 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
3561 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
3562 dout(10) << "do_backfill " << *m
<< dendl
;
3567 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
// osd_kill_backfill_at is a test-injection config knob
3569 assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
3571 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
3572 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
3573 get_osdmap()->get_epoch(),
3575 spg_t(info
.pgid
.pgid
, get_primary().shard
));
3576 reply
->set_priority(get_recovery_op_priority());
3577 osd
->send_message_osd_cluster(reply
, m
->get_connection());
3578 queue_peering_event(
3580 std::make_shared
<CephPeeringEvt
>(
3581 get_osdmap()->get_epoch(),
3582 get_osdmap()->get_epoch(),
3587 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
3589 assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
3591 info
.set_last_backfill(m
->last_backfill
);
3592 info
.stats
= m
->stats
;
3594 ObjectStore::Transaction t
;
3597 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3602 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
3604 assert(is_primary());
3605 assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
3606 finish_recovery_op(hobject_t::get_max());
// Handle MOSDPGBackfillRemove: delete the listed (snap-mapped) objects
// in one ObjectStore transaction.
// NOTE(review): corrupted extraction — the guard/assert lines before the
// transaction and the result check after queue_transaction are missing;
// code left byte-identical.
3612 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
3614 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
3616 assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
3617 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
3621 ObjectStore::Transaction t
;
// p.first is the hobject; remove_snap_mapped_object also clears snap maps
3622 for (auto& p
: m
->ls
) {
3623 remove_snap_mapped_object(t
, p
.first
);
3625 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3629 int PrimaryLogPG::trim_object(
3630 bool first
, const hobject_t
&coid
, PrimaryLogPG::OpContextUPtr
*ctxp
)
3635 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
3636 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
3637 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
3638 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
3643 coid
.oid
, coid
.get_key(),
3644 obc
->ssc
->snapset
.head_exists
? CEPH_NOSNAP
:CEPH_SNAPDIR
, coid
.get_hash(),
3645 info
.pgid
.pool(), coid
.get_namespace());
3646 ObjectContextRef snapset_obc
= get_object_context(snapoid
, false);
3648 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
3649 << " repair needed, no snapset obc for " << snapoid
;
3653 SnapSet
& snapset
= obc
->ssc
->snapset
;
3655 bool legacy
= snapset
.is_legacy() ||
3656 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
3658 object_info_t
&coi
= obc
->obs
.oi
;
3659 set
<snapid_t
> old_snaps
;
3661 old_snaps
.insert(coi
.legacy_snaps
.begin(), coi
.legacy_snaps
.end());
3663 auto p
= snapset
.clone_snaps
.find(coid
.snap
);
3664 if (p
== snapset
.clone_snaps
.end()) {
3665 osd
->clog
->error() << __func__
<< " No clone_snaps in snapset " << snapset
3666 << " for " << coid
<< "\n";
3669 old_snaps
.insert(snapset
.clone_snaps
[coid
.snap
].begin(),
3670 snapset
.clone_snaps
[coid
.snap
].end());
3672 if (old_snaps
.empty()) {
3673 osd
->clog
->error() << __func__
<< " No object info snaps for " << coid
;
3677 dout(10) << coid
<< " old_snaps " << old_snaps
3678 << " old snapset " << snapset
<< dendl
;
3679 if (snapset
.seq
== 0) {
3680 osd
->clog
->error() << __func__
<< " No snapset.seq for " << coid
;
3684 set
<snapid_t
> new_snaps
;
3685 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
3686 i
!= old_snaps
.end();
3688 if (!pool
.info
.is_removed_snap(*i
))
3689 new_snaps
.insert(*i
);
3692 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
3694 if (new_snaps
.empty()) {
3695 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
3696 if (p
== snapset
.clones
.end()) {
3697 osd
->clog
->error() << __func__
<< " Snap " << coid
.snap
<< " not in clones";
3702 OpContextUPtr ctx
= simple_opc_create(obc
);
3703 ctx
->snapset_obc
= snapset_obc
;
3705 if (!ctx
->lock_manager
.get_snaptrimmer_write(
3709 close_op_ctx(ctx
.release());
3710 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
3714 if (!ctx
->lock_manager
.get_snaptrimmer_write(
3718 close_op_ctx(ctx
.release());
3719 dout(10) << __func__
<< ": Unable to get a wlock on " << snapoid
<< dendl
;
3723 ctx
->at_version
= get_next_version();
3725 PGTransaction
*t
= ctx
->op_t
.get();
3727 if (new_snaps
.empty()) {
3729 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
3730 << new_snaps
<< " ... deleting" << dendl
;
3733 assert(p
!= snapset
.clones
.end());
3735 snapid_t last
= coid
.snap
;
3736 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
3738 if (p
!= snapset
.clones
.begin()) {
3739 // not the oldest... merge overlap into next older clone
3740 vector
<snapid_t
>::iterator n
= p
- 1;
3741 hobject_t prev_coid
= coid
;
3742 prev_coid
.snap
= *n
;
3743 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
3745 if (adjust_prev_bytes
)
3746 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
3748 snapset
.clone_overlap
[*n
].intersection_of(
3749 snapset
.clone_overlap
[*p
]);
3751 if (adjust_prev_bytes
)
3752 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
3754 ctx
->delta_stats
.num_objects
--;
3756 ctx
->delta_stats
.num_objects_dirty
--;
3758 ctx
->delta_stats
.num_objects_omap
--;
3759 if (coi
.is_whiteout()) {
3760 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
3761 ctx
->delta_stats
.num_whiteouts
--;
3763 ctx
->delta_stats
.num_object_clones
--;
3764 if (coi
.is_cache_pinned())
3765 ctx
->delta_stats
.num_objects_pinned
--;
3766 obc
->obs
.exists
= false;
3768 snapset
.clones
.erase(p
);
3769 snapset
.clone_overlap
.erase(last
);
3770 snapset
.clone_size
.erase(last
);
3771 snapset
.clone_snaps
.erase(last
);
3775 pg_log_entry_t::DELETE
,
3778 ctx
->obs
->oi
.version
,
3790 coi
= object_info_t(coid
);
3792 ctx
->at_version
.version
++;
3794 // save adjusted snaps for this object
3795 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
3797 coi
.legacy_snaps
= vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
3799 snapset
.clone_snaps
[coid
.snap
] = vector
<snapid_t
>(new_snaps
.rbegin(),
3801 // we still do a 'modify' event on this object just to trigger a
3802 // snapmapper.update ... :(
3805 coi
.prior_version
= coi
.version
;
3806 coi
.version
= ctx
->at_version
;
3808 ::encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
3809 t
->setattr(coid
, OI_ATTR
, bl
);
3813 pg_log_entry_t::MODIFY
,
3822 ctx
->at_version
.version
++;
3830 // save head snapset
3831 dout(10) << coid
<< " new snapset " << snapset
<< " on "
3832 << snapset_obc
->obs
.oi
<< dendl
;
3833 if (snapset
.clones
.empty() &&
3834 (!snapset
.head_exists
||
3835 (snapset_obc
->obs
.oi
.is_whiteout() &&
3836 !(snapset_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
3837 !snapset_obc
->obs
.oi
.is_cache_pinned()))) {
3838 // NOTE: this arguably constitutes minor interference with the
3839 // tiering agent if this is a cache tier since a snap trim event
3840 // is effectively evicting a whiteout we might otherwise want to
3842 dout(10) << coid
<< " removing " << snapoid
<< dendl
;
3845 pg_log_entry_t::DELETE
,
3848 ctx
->snapset_obc
->obs
.oi
.version
,
3854 if (snapoid
.is_head()) {
3855 derr
<< "removing snap head" << dendl
;
3856 object_info_t
& oi
= ctx
->snapset_obc
->obs
.oi
;
3857 ctx
->delta_stats
.num_objects
--;
3858 if (oi
.is_dirty()) {
3859 ctx
->delta_stats
.num_objects_dirty
--;
3862 ctx
->delta_stats
.num_objects_omap
--;
3863 if (oi
.is_whiteout()) {
3864 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
3865 ctx
->delta_stats
.num_whiteouts
--;
3867 if (oi
.is_cache_pinned()) {
3868 ctx
->delta_stats
.num_objects_pinned
--;
3871 ctx
->snapset_obc
->obs
.exists
= false;
3872 ctx
->snapset_obc
->obs
.oi
= object_info_t(snapoid
);
3875 dout(10) << coid
<< " filtering snapset on " << snapoid
<< dendl
;
3876 snapset
.filter(pool
.info
);
3877 dout(10) << coid
<< " writing updated snapset on " << snapoid
3878 << ", snapset is " << snapset
<< dendl
;
3881 pg_log_entry_t::MODIFY
,
3884 ctx
->snapset_obc
->obs
.oi
.version
,
3891 ctx
->snapset_obc
->obs
.oi
.prior_version
=
3892 ctx
->snapset_obc
->obs
.oi
.version
;
3893 ctx
->snapset_obc
->obs
.oi
.version
= ctx
->at_version
;
3895 map
<string
, bufferlist
> attrs
;
3897 ::encode(snapset
, bl
);
3898 attrs
[SS_ATTR
].claim(bl
);
3901 ::encode(ctx
->snapset_obc
->obs
.oi
, bl
,
3902 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
3903 attrs
[OI_ATTR
].claim(bl
);
3904 t
->setattrs(snapoid
, attrs
);
3907 *ctxp
= std::move(ctx
);
3911 void PrimaryLogPG::kick_snap_trim()
3913 assert(is_active());
3914 assert(is_primary());
3915 if (is_clean() && !snap_trimq
.empty()) {
3916 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
3917 snap_trimmer_machine
.process_event(KickTrim());
3921 void PrimaryLogPG::snap_trimmer_scrub_complete()
3923 if (is_primary() && is_active() && is_clean()) {
3924 assert(!snap_trimq
.empty());
3925 snap_trimmer_machine
.process_event(ScrubComplete());
3929 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
3931 if (deleting
|| pg_has_reset_since(queued
)) {
3935 assert(is_primary());
3937 dout(10) << "snap_trimmer posting" << dendl
;
3938 snap_trimmer_machine
.process_event(DoSnapWork());
3939 dout(10) << "snap_trimmer complete" << dendl
;
3943 int PrimaryLogPG::do_xattr_cmp_u64(int op
, __u64 v1
, bufferlist
& xattr
)
3947 string
v2s(xattr
.c_str(), xattr
.length());
3949 v2
= strtoull(v2s
.c_str(), NULL
, 10);
3953 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
3956 case CEPH_OSD_CMPXATTR_OP_EQ
:
3958 case CEPH_OSD_CMPXATTR_OP_NE
:
3960 case CEPH_OSD_CMPXATTR_OP_GT
:
3962 case CEPH_OSD_CMPXATTR_OP_GTE
:
3964 case CEPH_OSD_CMPXATTR_OP_LT
:
3966 case CEPH_OSD_CMPXATTR_OP_LTE
:
3973 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
3975 string
v2s(xattr
.c_str(), xattr
.length());
3977 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
3980 case CEPH_OSD_CMPXATTR_OP_EQ
:
3981 return (v1s
.compare(v2s
) == 0);
3982 case CEPH_OSD_CMPXATTR_OP_NE
:
3983 return (v1s
.compare(v2s
) != 0);
3984 case CEPH_OSD_CMPXATTR_OP_GT
:
3985 return (v1s
.compare(v2s
) > 0);
3986 case CEPH_OSD_CMPXATTR_OP_GTE
:
3987 return (v1s
.compare(v2s
) >= 0);
3988 case CEPH_OSD_CMPXATTR_OP_LT
:
3989 return (v1s
.compare(v2s
) < 0);
3990 case CEPH_OSD_CMPXATTR_OP_LTE
:
3991 return (v1s
.compare(v2s
) <= 0);
3997 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
3999 ceph_osd_op
& op
= osd_op
.op
;
4000 vector
<OSDOp
> read_ops(1);
4001 OSDOp
& read_op
= read_ops
[0];
4004 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
4005 read_op
.op
.extent
.offset
= op
.extent
.offset
;
4006 read_op
.op
.extent
.length
= op
.extent
.length
;
4007 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
4008 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
4010 result
= do_osd_ops(ctx
, read_ops
);
4012 derr
<< "do_extent_cmp do_osd_ops failed " << result
<< dendl
;
4016 if (read_op
.outdata
.length() != osd_op
.indata
.length())
4019 for (uint64_t p
= 0; p
< osd_op
.indata
.length(); p
++) {
4020 if (read_op
.outdata
[p
] != osd_op
.indata
[p
]) {
4021 return (-MAX_ERRNO
- p
);
4028 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4030 ceph_osd_op
& op
= osd_op
.op
;
4031 vector
<OSDOp
> write_ops(1);
4032 OSDOp
& write_op
= write_ops
[0];
4033 uint64_t write_length
= op
.writesame
.length
;
4039 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4042 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4043 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4047 while (write_length
) {
4048 write_op
.indata
.append(osd_op
.indata
);
4049 write_length
-= op
.writesame
.data_length
;
4052 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
4053 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
4054 write_op
.op
.extent
.length
= op
.writesame
.length
;
4055 result
= do_osd_ops(ctx
, write_ops
);
4057 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
4062 // ========================================================================
4063 // low level osd ops
4065 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
4067 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
4068 bufferlist header
, vals
;
4069 int r
= _get_tmap(ctx
, &header
, &vals
);
4071 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
4076 vector
<OSDOp
> ops(3);
4078 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
4079 ops
[0].op
.extent
.offset
= 0;
4080 ops
[0].op
.extent
.length
= 0;
4082 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
4083 ops
[1].indata
.claim(header
);
4085 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
4086 ops
[2].indata
.claim(vals
);
4088 return do_osd_ops(ctx
, ops
);
4091 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::iterator
& bp
, OSDOp
& osd_op
,
4096 map
<string
, bufferlist
> m
;
4098 bufferlist::iterator p
= bl
.begin();
4099 ::decode(header
, p
);
4111 case CEPH_OSD_TMAP_SET
: // insert key
4119 case CEPH_OSD_TMAP_RM
: // remove key
4121 if (!m
.count(key
)) {
4126 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
4130 case CEPH_OSD_TMAP_HDR
: // update header
4132 ::decode(header
, bp
);
4142 ::encode(header
, obl
);
4146 vector
<OSDOp
> nops(1);
4147 OSDOp
& newop
= nops
[0];
4148 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4149 newop
.op
.extent
.offset
= 0;
4150 newop
.op
.extent
.length
= obl
.length();
4152 do_osd_ops(ctx
, nops
);
4153 osd_op
.outdata
.claim(newop
.outdata
);
4157 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::iterator
& bp
, OSDOp
& osd_op
)
4159 bufferlist::iterator orig_bp
= bp
;
4162 dout(10) << "tmapup is a no-op" << dendl
;
4164 // read the whole object
4165 vector
<OSDOp
> nops(1);
4166 OSDOp
& newop
= nops
[0];
4167 newop
.op
.op
= CEPH_OSD_OP_READ
;
4168 newop
.op
.extent
.offset
= 0;
4169 newop
.op
.extent
.length
= 0;
4170 result
= do_osd_ops(ctx
, nops
);
4172 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
4174 dout(30) << " starting is \n";
4175 newop
.outdata
.hexdump(*_dout
);
4178 bufferlist::iterator ip
= newop
.outdata
.begin();
4181 dout(30) << "the update command is: \n";
4182 osd_op
.indata
.hexdump(*_dout
);
4188 if (newop
.outdata
.length()) {
4189 ::decode(header
, ip
);
4190 ::decode(nkeys
, ip
);
4192 dout(10) << "tmapup header " << header
.length() << dendl
;
4194 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
4196 ::decode(header
, bp
);
4197 dout(10) << "tmapup new header " << header
.length() << dendl
;
4200 ::encode(header
, obl
);
4202 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
4205 bufferlist newkeydata
;
4206 string nextkey
, last_in_key
;
4208 bool have_next
= false;
4211 ::decode(nextkey
, ip
);
4212 ::decode(nextval
, ip
);
4214 while (!bp
.end() && !result
) {
4221 catch (buffer::error
& e
) {
4224 if (key
< last_in_key
) {
4225 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
4226 << "', falling back to an inefficient (unsorted) update" << dendl
;
4228 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
4232 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
4234 // skip existing intervening keys
4235 bool key_exists
= false;
4236 while (have_next
&& !key_exists
) {
4237 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
4240 if (nextkey
< key
) {
4242 ::encode(nextkey
, newkeydata
);
4243 ::encode(nextval
, newkeydata
);
4244 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4246 // don't copy; discard old value. and stop.
4247 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
4252 ::decode(nextkey
, ip
);
4253 ::decode(nextval
, ip
);
4259 if (op
== CEPH_OSD_TMAP_SET
) {
4264 catch (buffer::error
& e
) {
4267 ::encode(key
, newkeydata
);
4268 ::encode(val
, newkeydata
);
4269 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
4271 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
4279 catch (buffer::error
& e
) {
4282 ::encode(key
, newkeydata
);
4283 ::encode(val
, newkeydata
);
4284 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
4286 } else if (op
== CEPH_OSD_TMAP_RM
) {
4291 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
4294 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
4301 ::encode(nextkey
, newkeydata
);
4302 ::encode(nextval
, newkeydata
);
4303 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4307 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
4308 dout(20) << " keep trailing " << rest
.length()
4309 << " at " << newkeydata
.length() << dendl
;
4310 newkeydata
.claim_append(rest
);
4313 // encode final key count + key data
4314 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
4315 ::encode(nkeys
, obl
);
4316 obl
.claim_append(newkeydata
);
4319 dout(30) << " final is \n";
4320 obl
.hexdump(*_dout
);
4324 bufferlist::iterator tp
= obl
.begin();
4327 map
<string
,bufferlist
> d
;
4330 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
4335 dout(20) << "tmapput write " << obl
.length() << dendl
;
4336 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4337 newop
.op
.extent
.offset
= 0;
4338 newop
.op
.extent
.length
= obl
.length();
4340 do_osd_ops(ctx
, nops
);
4341 osd_op
.outdata
.claim(newop
.outdata
);
4347 static int check_offset_and_length(uint64_t offset
, uint64_t length
, uint64_t max
)
4349 if (offset
>= max
||
4351 offset
+ length
> max
)
4357 struct FillInVerifyExtent
: public Context
{
4360 bufferlist
*outdatap
;
4361 boost::optional
<uint32_t> maybe_crc
;
4366 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
4367 boost::optional
<uint32_t> mc
, uint64_t size
,
4368 OSDService
*osd
, hobject_t soid
, __le32 flags
) :
4369 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
4370 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
4371 void finish(int len
) override
{
4376 // whole object? can we verify the checksum?
4377 if (maybe_crc
&& *r
== size
) {
4378 uint32_t crc
= outdatap
->crc32c(-1);
4379 if (maybe_crc
!= crc
) {
4380 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
4381 << " != expected 0x" << *maybe_crc
4382 << std::dec
<< " on " << soid
;
4383 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
4392 struct ToSparseReadResult
: public Context
{
4393 bufferlist
& data_bl
;
4394 uint64_t data_offset
;
4396 ToSparseReadResult(bufferlist
& bl
, uint64_t offset
, ceph_le64
& len
):
4397 data_bl(bl
), data_offset(offset
),len(len
) {}
4398 void finish(int r
) override
{
4402 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
4403 ::encode(extents
, outdata
);
4404 ::encode_destructively(data_bl
, outdata
);
4405 data_bl
.swap(outdata
);
4409 template<typename V
>
4410 static string
list_keys(const map
<string
, V
>& m
) {
4412 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
4416 s
.append(itr
->first
);
4421 template<typename T
>
4422 static string
list_entries(const T
& m
) {
4424 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
4433 void PrimaryLogPG::maybe_create_new_object(
4435 bool ignore_transaction
)
4437 ObjectState
& obs
= ctx
->new_obs
;
4439 ctx
->delta_stats
.num_objects
++;
4441 assert(!obs
.oi
.is_whiteout());
4442 obs
.oi
.new_object();
4443 if (!ignore_transaction
)
4444 ctx
->op_t
->create(obs
.oi
.soid
);
4445 } else if (obs
.oi
.is_whiteout()) {
4446 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
4447 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
4448 --ctx
->delta_stats
.num_whiteouts
;
4452 struct C_ChecksumRead
: public Context
{
4453 PrimaryLogPG
*primary_log_pg
;
4455 Checksummer::CSumType csum_type
;
4456 bufferlist init_value_bl
;
4457 ceph_le64 read_length
;
4459 Context
*fill_extent_ctx
;
4461 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
4462 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
4463 boost::optional
<uint32_t> maybe_crc
, uint64_t size
,
4464 OSDService
*osd
, hobject_t soid
, __le32 flags
)
4465 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
4466 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
4467 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
4468 &read_bl
, maybe_crc
, size
,
4469 osd
, soid
, flags
)) {
4472 void finish(int r
) override
{
4473 fill_extent_ctx
->complete(r
);
4475 if (osd_op
.rval
>= 0) {
4476 bufferlist::iterator init_value_bl_it
= init_value_bl
.begin();
4477 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
4484 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
4485 bufferlist::iterator
*bl_it
, bool *async_read
)
4487 dout(20) << __func__
<< dendl
;
4489 auto& op
= osd_op
.op
;
4490 if (op
.checksum
.chunk_size
> 0) {
4491 if (op
.checksum
.length
== 0) {
4492 dout(10) << __func__
<< ": length required when chunk size provided"
4496 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
4497 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
4502 auto& oi
= ctx
->new_obs
.oi
;
4503 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
4504 // zeroed offset+length implies checksum whole object
4505 op
.checksum
.length
= oi
.size
;
4506 } else if (op
.checksum
.offset
+ op
.checksum
.length
> oi
.size
) {
4510 Checksummer::CSumType csum_type
;
4511 switch (op
.checksum
.type
) {
4512 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
4513 csum_type
= Checksummer::CSUM_XXHASH32
;
4515 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
4516 csum_type
= Checksummer::CSUM_XXHASH64
;
4518 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
4519 csum_type
= Checksummer::CSUM_CRC32C
;
4522 dout(10) << __func__
<< ": unknown crc type ("
4523 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
4527 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
4528 if (bl_it
->get_remaining() < csum_init_value_size
) {
4529 dout(10) << __func__
<< ": init value not provided" << dendl
;
4533 bufferlist init_value_bl
;
4534 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
4535 csum_init_value_size
);
4536 bl_it
->advance(csum_init_value_size
);
4538 if (pool
.info
.require_rollback() && op
.checksum
.length
> 0) {
4539 // If there is a data digest and it is possible we are reading
4540 // entire object, pass the digest.
4541 boost::optional
<uint32_t> maybe_crc
;
4542 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
4543 op
.checksum
.length
>= oi
.size
) {
4544 maybe_crc
= oi
.data_digest
;
4548 auto& soid
= oi
.soid
;
4549 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
4550 std::move(init_value_bl
), maybe_crc
,
4551 oi
.size
, osd
, soid
, op
.flags
);
4552 ctx
->pending_async_reads
.push_back({
4553 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
4554 {&checksum_ctx
->read_bl
, checksum_ctx
}});
4556 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
4562 *async_read
= false;
4563 std::vector
<OSDOp
> read_ops(1);
4564 auto& read_op
= read_ops
[0];
4565 if (op
.checksum
.length
> 0) {
4566 read_op
.op
.op
= CEPH_OSD_OP_READ
;
4567 read_op
.op
.flags
= op
.flags
;
4568 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
4569 read_op
.op
.extent
.length
= op
.checksum
.length
;
4570 read_op
.op
.extent
.truncate_size
= 0;
4571 read_op
.op
.extent
.truncate_seq
= 0;
4573 int r
= do_osd_ops(ctx
, read_ops
);
4575 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
4580 bufferlist::iterator init_value_bl_it
= init_value_bl
.begin();
4581 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
4585 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
4586 Checksummer::CSumType csum_type
,
4587 bufferlist::iterator
*init_value_bl_it
,
4588 const bufferlist
&read_bl
) {
4589 dout(20) << __func__
<< dendl
;
4591 auto& op
= osd_op
.op
;
4593 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
4594 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
4595 << op
.checksum
.length
<< dendl
;
4599 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
4600 op
.checksum
.chunk_size
: read_bl
.length());
4601 uint32_t csum_count
= (csum_chunk_size
> 0 ?
4602 read_bl
.length() / csum_chunk_size
: 0);
4605 bufferptr csum_data
;
4606 if (csum_count
> 0) {
4607 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
4608 csum_data
= buffer::create(csum_value_size
* csum_count
);
4610 csum
.append(csum_data
);
4612 switch (csum_type
) {
4613 case Checksummer::CSUM_XXHASH32
:
4615 Checksummer::xxhash32::init_value_t init_value
;
4616 ::decode(init_value
, *init_value_bl_it
);
4617 Checksummer::calculate
<Checksummer::xxhash32
>(
4618 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4622 case Checksummer::CSUM_XXHASH64
:
4624 Checksummer::xxhash64::init_value_t init_value
;
4625 ::decode(init_value
, *init_value_bl_it
);
4626 Checksummer::calculate
<Checksummer::xxhash64
>(
4627 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4631 case Checksummer::CSUM_CRC32C
:
4633 Checksummer::crc32c::init_value_t init_value
;
4634 ::decode(init_value
, *init_value_bl_it
);
4635 Checksummer::calculate
<Checksummer::crc32c
>(
4636 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4645 ::encode(csum_count
, osd_op
.outdata
);
4646 osd_op
.outdata
.claim_append(csum
);
4650 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
4653 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
4654 ObjectState
& obs
= ctx
->new_obs
;
4655 object_info_t
& oi
= obs
.oi
;
4656 const hobject_t
& soid
= oi
.soid
;
4658 bool first_read
= true;
4660 PGTransaction
* t
= ctx
->op_t
.get();
4662 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
4664 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++) {
4666 ceph_osd_op
& op
= osd_op
.op
;
4668 // TODO: check endianness (__le32 vs uint32_t, etc.)
4669 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
4670 // but the code in this function seems to treat them as native-endian. What should the
4672 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
4674 dout(10) << "do_osd_op " << osd_op
<< dendl
;
4676 bufferlist::iterator bp
= osd_op
.indata
.begin();
4678 // user-visible modifcation?
4680 // non user-visible modifications
4681 case CEPH_OSD_OP_WATCH
:
4682 case CEPH_OSD_OP_CACHE_EVICT
:
4683 case CEPH_OSD_OP_CACHE_FLUSH
:
4684 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
4685 case CEPH_OSD_OP_UNDIRTY
:
4686 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
4687 case CEPH_OSD_OP_CACHE_PIN
:
4688 case CEPH_OSD_OP_CACHE_UNPIN
:
4689 case CEPH_OSD_OP_SET_REDIRECT
:
4692 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
4693 ctx
->user_modify
= true;
4696 // munge -1 truncate to 0 truncate
4697 if (ceph_osd_op_uses_extent(op
.op
) &&
4698 op
.extent
.truncate_seq
== 1 &&
4699 op
.extent
.truncate_size
== (-1ULL)) {
4700 op
.extent
.truncate_size
= 0;
4701 op
.extent
.truncate_seq
= 0;
4704 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
4705 if (op
.op
== CEPH_OSD_OP_ZERO
&&
4707 op
.extent
.offset
< cct
->_conf
->osd_max_object_size
&&
4708 op
.extent
.length
>= 1 &&
4709 op
.extent
.length
<= cct
->_conf
->osd_max_object_size
&&
4710 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
4711 if (op
.extent
.offset
>= oi
.size
) {
4715 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
4716 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
4717 op
.op
= CEPH_OSD_OP_TRUNCATE
;
4724 case CEPH_OSD_OP_CMPEXT
:
4726 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
4727 result
= do_extent_cmp(ctx
, osd_op
);
4730 case CEPH_OSD_OP_SYNC_READ
:
4731 if (pool
.info
.require_rollback()) {
4732 result
= -EOPNOTSUPP
;
4736 case CEPH_OSD_OP_READ
:
4739 __u32 seq
= oi
.truncate_seq
;
4740 uint64_t size
= oi
.size
;
4741 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(), soid
.snap
.val
, size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
4742 bool trimmed_read
= false;
4743 // are we beyond truncate_size?
4744 if ( (seq
< op
.extent
.truncate_seq
) &&
4745 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) )
4746 size
= op
.extent
.truncate_size
;
4748 if (op
.extent
.length
== 0) //length is zero mean read the whole object
4749 op
.extent
.length
= size
;
4751 if (op
.extent
.offset
>= size
) {
4752 op
.extent
.length
= 0;
4753 trimmed_read
= true;
4754 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
4755 op
.extent
.length
= size
- op
.extent
.offset
;
4756 trimmed_read
= true;
4759 // read into a buffer
4761 if (trimmed_read
&& op
.extent
.length
== 0) {
4762 // read size was trimmed to zero and it is expected to do nothing
4763 // a read operation of 0 bytes does *not* do nothing, this is why
4764 // the trimmed_read boolean is needed
4765 } else if (pool
.info
.require_rollback()) {
4767 boost::optional
<uint32_t> maybe_crc
;
4768 // If there is a data digest and it is possible we are reading
4769 // entire object, pass the digest. FillInVerifyExtent will
4770 // will check the oi.size again.
4771 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
4772 op
.extent
.length
>= oi
.size
)
4773 maybe_crc
= oi
.data_digest
;
4774 ctx
->pending_async_reads
.push_back(
4776 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
4777 make_pair(&osd_op
.outdata
,
4778 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
4779 &osd_op
.outdata
, maybe_crc
, oi
.size
, osd
,
4781 dout(10) << " async_read noted for " << soid
<< dendl
;
4783 int r
= pgbackend
->objects_read_sync(
4784 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
4786 r
= rep_repair_primary_object(soid
, ctx
->op
);
4789 op
.extent
.length
= r
;
4792 op
.extent
.length
= 0;
4794 dout(10) << " read got " << r
<< " / " << op
.extent
.length
4795 << " bytes from obj " << soid
<< dendl
;
4797 // whole object? can we verify the checksum?
4798 if (op
.extent
.length
== oi
.size
&& oi
.is_data_digest()) {
4799 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
4800 if (oi
.data_digest
!= crc
) {
4801 osd
->clog
->error() << info
.pgid
<< std::hex
4802 << " full-object read crc 0x" << crc
4803 << " != expected 0x" << oi
.data_digest
4804 << std::dec
<< " on " << soid
;
4805 // FIXME fall back to replica or something?
4812 ctx
->data_off
= op
.extent
.offset
;
4814 // XXX the op.extent.length is the requested length for async read
4815 // On error this length is changed to 0 after the error comes back.
4816 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(op
.extent
.length
, 10);
4817 ctx
->delta_stats
.num_rd
++;
4819 // Skip checking the result and just proceed to the next operation
4826 case CEPH_OSD_OP_CHECKSUM
:
4829 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
4830 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
4831 op
.checksum
.offset
, op
.checksum
.length
,
4832 op
.checksum
.chunk_size
);
4835 result
= do_checksum(ctx
, osd_op
, &bp
, &async_read
);
4836 if (result
== 0 && async_read
) {
4843 case CEPH_OSD_OP_MAPEXT
:
4844 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
4845 if (pool
.info
.require_rollback()) {
4846 result
= -EOPNOTSUPP
;
4851 // read into a buffer
4853 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
4855 op
.extent
.offset
, op
.extent
.length
, bl
);
4856 osd_op
.outdata
.claim(bl
);
4860 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(bl
.length(), 10);
4861 ctx
->delta_stats
.num_rd
++;
4862 dout(10) << " map_extents done on object " << soid
<< dendl
;
4867 case CEPH_OSD_OP_SPARSE_READ
:
4868 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
4869 if (op
.extent
.truncate_seq
) {
4870 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
4875 if (pool
.info
.ec_pool()) {
4876 // translate sparse read to a normal one if not supported
4877 uint64_t offset
= op
.extent
.offset
;
4878 uint64_t length
= op
.extent
.length
;
4879 if (offset
> oi
.size
) {
4881 } else if (offset
+ length
> oi
.size
) {
4882 length
= oi
.size
- offset
;
4885 ctx
->pending_async_reads
.push_back(
4887 boost::make_tuple(offset
, length
, op
.flags
),
4890 new ToSparseReadResult(
4891 osd_op
.outdata
, offset
,
4892 op
.extent
.length
/* updated by the callback */))));
4893 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
4895 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
4896 map
<uint64_t, uint64_t> extents
;
4897 ::encode(extents
, osd_op
.outdata
);
4900 // read into a buffer
4901 map
<uint64_t, uint64_t> m
;
4902 uint32_t total_read
= 0;
4903 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
4905 op
.extent
.offset
, op
.extent
.length
, m
);
4910 map
<uint64_t, uint64_t>::iterator miter
;
4912 uint64_t last
= op
.extent
.offset
;
4913 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
4915 if (cct
->_conf
->osd_verify_sparse_read_holes
&&
4916 last
< miter
->first
) {
4918 uint64_t len
= miter
->first
- last
;
4919 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
4921 r
= rep_repair_primary_object(soid
, ctx
->op
);
4924 osd
->clog
->error() << coll
<< " " << soid
4925 << " sparse-read failed to read: "
4927 } else if (!t
.is_zero()) {
4928 osd
->clog
->error() << coll
<< " " << soid
<< " sparse-read found data in hole "
4929 << last
<< "~" << len
;
4934 r
= pgbackend
->objects_read_sync(soid
, miter
->first
, miter
->second
, op
.flags
, &tmpbl
);
4940 if (r
< (int)miter
->second
) /* this is usually happen when we get extent that exceeds the actual file size */
4943 dout(10) << "sparse-read " << miter
->first
<< "@" << miter
->second
<< dendl
;
4944 data_bl
.claim_append(tmpbl
);
4945 last
= miter
->first
+ r
;
4953 // verify trailing hole?
4954 if (cct
->_conf
->osd_verify_sparse_read_holes
) {
4955 uint64_t end
= MIN(op
.extent
.offset
+ op
.extent
.length
, oi
.size
);
4958 uint64_t len
= end
- last
;
4959 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
4961 osd
->clog
->error() << coll
<< " " << soid
4962 << " sparse-read failed to read: "
4964 } else if (!t
.is_zero()) {
4965 osd
->clog
->error() << coll
<< " " << soid
<< " sparse-read found data in hole "
4966 << last
<< "~" << len
;
4971 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
4972 // Maybe at first, there is no much whole objects. With continued use, more and more whole object exist.
4973 // So from this point, for spare-read add checksum make sense.
4974 if (total_read
== oi
.size
&& oi
.is_data_digest()) {
4975 uint32_t crc
= data_bl
.crc32c(-1);
4976 if (oi
.data_digest
!= crc
) {
4977 osd
->clog
->error() << info
.pgid
<< std::hex
4978 << " full-object read crc 0x" << crc
4979 << " != expected 0x" << oi
.data_digest
4980 << std::dec
<< " on " << soid
;
4981 // FIXME fall back to replica or something?
4987 op
.extent
.length
= total_read
;
4989 ::encode(m
, osd_op
.outdata
); // re-encode since it might be modified
4990 ::encode_destructively(data_bl
, osd_op
.outdata
);
4992 dout(10) << " sparse_read got " << total_read
<< " bytes from object " << soid
<< dendl
;
4994 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(op
.extent
.length
, 10);
4995 ctx
->delta_stats
.num_rd
++;
4998 case CEPH_OSD_OP_CALL
:
5000 string cname
, mname
;
5003 bp
.copy(op
.cls
.class_len
, cname
);
5004 bp
.copy(op
.cls
.method_len
, mname
);
5005 bp
.copy(op
.cls
.indata_len
, indata
);
5006 } catch (buffer::error
& e
) {
5007 dout(10) << "call unable to decode class + method + indata" << dendl
;
5008 dout(30) << "in dump: ";
5009 osd_op
.indata
.hexdump(*_dout
);
5012 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
5015 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
5017 ClassHandler::ClassData
*cls
;
5018 result
= osd
->class_handler
->open_class(cname
, &cls
);
5019 assert(result
== 0); // init_op_flags() already verified this works.
5021 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
.c_str());
5023 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
5024 result
= -EOPNOTSUPP
;
5028 int flags
= method
->get_flags();
5029 if (flags
& CLS_METHOD_WR
)
5030 ctx
->user_modify
= true;
5033 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
5034 int prev_rd
= ctx
->num_read
;
5035 int prev_wr
= ctx
->num_write
;
5036 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
5038 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
5039 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
5043 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
5044 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
5049 dout(10) << "method called response length=" << outdata
.length() << dendl
;
5050 op
.extent
.length
= outdata
.length();
5051 osd_op
.outdata
.claim_append(outdata
);
5052 dout(30) << "out dump: ";
5053 osd_op
.outdata
.hexdump(*_dout
);
5058 case CEPH_OSD_OP_STAT
:
5059 // note: stat does not require RD
5061 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5063 if (obs
.exists
&& !oi
.is_whiteout()) {
5064 ::encode(oi
.size
, osd_op
.outdata
);
5065 ::encode(oi
.mtime
, osd_op
.outdata
);
5066 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
5069 dout(10) << "stat oi object does not exist" << dendl
;
5072 ctx
->delta_stats
.num_rd
++;
5076 case CEPH_OSD_OP_ISDIRTY
:
5079 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5080 bool is_dirty
= obs
.oi
.is_dirty();
5081 ::encode(is_dirty
, osd_op
.outdata
);
5082 ctx
->delta_stats
.num_rd
++;
5087 case CEPH_OSD_OP_UNDIRTY
:
5090 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5091 if (oi
.is_dirty()) {
5092 ctx
->undirty
= true; // see make_writeable()
5094 ctx
->delta_stats
.num_wr
++;
5100 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5103 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5104 if (ctx
->lock_type
!= ObjectContext::RWState::RWNONE
) {
5105 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
5109 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5117 if (oi
.is_cache_pinned()) {
5118 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
5122 if (oi
.is_dirty()) {
5123 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, boost::none
);
5124 if (result
== -EINPROGRESS
)
5132 case CEPH_OSD_OP_CACHE_FLUSH
:
5135 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5136 if (ctx
->lock_type
== ObjectContext::RWState::RWNONE
) {
5137 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
5141 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5149 if (oi
.is_cache_pinned()) {
5150 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
5155 if (oi
.is_dirty()) {
5156 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, boost::none
);
5157 if (result
== -EINPROGRESS
)
5162 // Check special return value which has set missing_return
5163 if (result
== -ENOENT
) {
5164 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
5165 assert(!missing
.is_min());
5166 wait_for_unreadable_object(missing
, ctx
->op
);
5167 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5173 case CEPH_OSD_OP_CACHE_EVICT
:
5176 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5177 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5185 if (oi
.is_cache_pinned()) {
5186 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
5190 if (oi
.is_dirty()) {
5194 if (!oi
.watchers
.empty()) {
5198 if (soid
.snap
== CEPH_NOSNAP
) {
5199 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
5203 result
= _delete_oid(ctx
, true, false);
5205 // mark that this is a cache eviction to avoid triggering normal
5206 // make_writeable() clone or snapdir object creation in finish_ctx()
5207 ctx
->cache_evict
= true;
5209 osd
->logger
->inc(l_osd_tier_evict
);
5213 case CEPH_OSD_OP_GETXATTR
:
5217 bp
.copy(op
.xattr
.name_len
, aname
);
5218 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
5219 string name
= "_" + aname
;
5220 int r
= getattr_maybe_cache(
5225 op
.xattr
.value_len
= osd_op
.outdata
.length();
5227 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
5231 ctx
->delta_stats
.num_rd
++;
5235 case CEPH_OSD_OP_GETXATTRS
:
5238 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5239 map
<string
, bufferlist
> out
;
5240 result
= getattrs_maybe_cache(
5247 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(bl
.length(), 10);
5248 ctx
->delta_stats
.num_rd
++;
5249 osd_op
.outdata
.claim_append(bl
);
5253 case CEPH_OSD_OP_CMPXATTR
:
5257 bp
.copy(op
.xattr
.name_len
, aname
);
5258 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
5259 string name
= "_" + aname
;
5260 name
[op
.xattr
.name_len
+ 1] = 0;
5263 result
= getattr_maybe_cache(
5267 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
5270 ctx
->delta_stats
.num_rd
++;
5271 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(xattr
.length(), 10);
5273 switch (op
.xattr
.cmp_mode
) {
5274 case CEPH_OSD_CMPXATTR_MODE_STRING
:
5277 bp
.copy(op
.xattr
.value_len
, val
);
5278 val
[op
.xattr
.value_len
] = 0;
5279 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
5280 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
5281 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
5285 case CEPH_OSD_CMPXATTR_MODE_U64
:
5289 ::decode(u64val
, bp
);
5291 catch (buffer::error
& e
) {
5295 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
5296 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
5297 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
5302 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
5307 dout(10) << "comparison returned false" << dendl
;
5308 result
= -ECANCELED
;
5312 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
5316 dout(10) << "comparison returned true" << dendl
;
5320 case CEPH_OSD_OP_ASSERT_VER
:
5323 uint64_t ver
= op
.assert_ver
.ver
;
5324 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
5327 else if (ver
< oi
.user_version
)
5329 else if (ver
> oi
.user_version
)
5330 result
= -EOVERFLOW
;
5334 case CEPH_OSD_OP_LIST_WATCHERS
:
5337 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5338 obj_list_watch_response_t resp
;
5340 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
5341 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
5343 dout(20) << "key cookie=" << oi_iter
->first
.first
5344 << " entity=" << oi_iter
->first
.second
<< " "
5345 << oi_iter
->second
<< dendl
;
5346 assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
5347 assert(oi_iter
->first
.second
.is_client());
5349 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
5350 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
5351 resp
.entries
.push_back(wi
);
5354 resp
.encode(osd_op
.outdata
, ctx
->get_features());
5357 ctx
->delta_stats
.num_rd
++;
5361 case CEPH_OSD_OP_LIST_SNAPS
:
5364 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5365 obj_list_snap_response_t resp
;
5368 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
5372 int clonecount
= ssc
->snapset
.clones
.size();
5373 if (ssc
->snapset
.head_exists
)
5375 resp
.clones
.reserve(clonecount
);
5376 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
5377 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
5379 ci
.cloneid
= *clone_iter
;
5381 hobject_t clone_oid
= soid
;
5382 clone_oid
.snap
= *clone_iter
;
5384 if (!ssc
->snapset
.is_legacy()) {
5385 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
5386 if (p
== ssc
->snapset
.clone_snaps
.end()) {
5387 osd
->clog
->error() << "osd." << osd
->whoami
5388 << ": inconsistent clone_snaps found for oid "
5389 << soid
<< " clone " << *clone_iter
5390 << " snapset " << ssc
->snapset
;
5394 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
5395 ci
.snaps
.push_back(*q
);
5398 /* No need to take a lock here. We are only inspecting state cached on
5399 * in the ObjectContext, so we aren't performing an actual read unless
5400 * the clone obc is not already loaded (in which case, it cannot have
5401 * an in progress write). We also do not risk exposing uncommitted
5402 * state since we do have a read lock on the head object or snapdir,
5403 * which we would have to write lock in order to make user visible
5404 * modifications to the snapshot state (snap trim related mutations
5405 * are not user visible).
5407 if (is_missing_object(clone_oid
)) {
5408 dout(20) << "LIST_SNAPS " << clone_oid
<< " missing" << dendl
;
5409 wait_for_unreadable_object(clone_oid
, ctx
->op
);
5414 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
5416 if (maybe_handle_cache(
5417 ctx
->op
, true, clone_obc
, -ENOENT
, clone_oid
, true)) {
5418 // promoting the clone
5421 osd
->clog
->error() << "osd." << osd
->whoami
5422 << ": missing clone " << clone_oid
5425 // should not happen
5430 for (vector
<snapid_t
>::reverse_iterator p
=
5431 clone_obc
->obs
.oi
.legacy_snaps
.rbegin();
5432 p
!= clone_obc
->obs
.oi
.legacy_snaps
.rend();
5434 ci
.snaps
.push_back(*p
);
5438 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
5440 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
5441 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
5442 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
5443 osd
->clog
->error() << "osd." << osd
->whoami
5444 << ": inconsistent clone_overlap found for oid "
5445 << soid
<< " clone " << *clone_iter
;
5449 const interval_set
<uint64_t> &o
= coi
->second
;
5450 ci
.overlap
.reserve(o
.num_intervals());
5451 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
5452 r
!= o
.end(); ++r
) {
5453 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
5457 map
<snapid_t
, uint64_t>::const_iterator si
;
5458 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
5459 if (si
== ssc
->snapset
.clone_size
.end()) {
5460 osd
->clog
->error() << "osd." << osd
->whoami
5461 << ": inconsistent clone_size found for oid "
5462 << soid
<< " clone " << *clone_iter
;
5466 ci
.size
= si
->second
;
5468 resp
.clones
.push_back(ci
);
5473 if (ssc
->snapset
.head_exists
&&
5474 !ctx
->obc
->obs
.oi
.is_whiteout()) {
5477 ci
.cloneid
= CEPH_NOSNAP
;
5479 //Size for HEAD is oi.size
5482 resp
.clones
.push_back(ci
);
5484 resp
.seq
= ssc
->snapset
.seq
;
5486 resp
.encode(osd_op
.outdata
);
5489 ctx
->delta_stats
.num_rd
++;
5493 case CEPH_OSD_OP_NOTIFY
:
5500 uint32_t ver
; // obsolete
5502 ::decode(timeout
, bp
);
5504 } catch (const buffer::error
&e
) {
5507 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
5509 timeout
= cct
->_conf
->osd_default_notify_timeout
;
5512 n
.timeout
= timeout
;
5513 n
.notify_id
= osd
->get_next_id(get_osdmap()->get_epoch());
5514 n
.cookie
= op
.watch
.cookie
;
5516 ctx
->notifies
.push_back(n
);
5518 // return our unique notify id to the client
5519 ::encode(n
.notify_id
, osd_op
.outdata
);
5523 case CEPH_OSD_OP_NOTIFY_ACK
:
5527 uint64_t notify_id
= 0;
5528 uint64_t watch_cookie
= 0;
5529 ::decode(notify_id
, bp
);
5530 ::decode(watch_cookie
, bp
);
5531 bufferlist reply_bl
;
5533 ::decode(reply_bl
, bp
);
5535 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
5536 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
5537 ctx
->notify_acks
.push_back(ack
);
5538 } catch (const buffer::error
&e
) {
5539 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
5540 OpContext::NotifyAck
ack(
5541 // op.watch.cookie is actually the notify_id for historical reasons
5544 ctx
->notify_acks
.push_back(ack
);
5549 case CEPH_OSD_OP_SETALLOCHINT
:
5552 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
5553 maybe_create_new_object(ctx
);
5554 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
5555 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
5556 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
5557 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
5558 op
.alloc_hint
.expected_write_size
,
5559 op
.alloc_hint
.flags
);
5560 ctx
->delta_stats
.num_wr
++;
5568 // -- object data --
5570 case CEPH_OSD_OP_WRITE
:
5573 __u32 seq
= oi
.truncate_seq
;
5574 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
5575 if (op
.extent
.length
!= osd_op
.indata
.length()) {
5580 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
5581 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
5583 if (pool
.info
.requires_aligned_append() &&
5584 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
5585 result
= -EOPNOTSUPP
;
5590 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
5591 result
= -EOPNOTSUPP
;
5594 } else if (op
.extent
.offset
!= oi
.size
&&
5595 pool
.info
.requires_aligned_append()) {
5596 result
= -EOPNOTSUPP
;
5600 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
5601 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
5602 // old write, arrived after trimtrunc
5603 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
5604 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
5605 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
5607 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
5608 osd_op
.indata
.swap(t
);
5610 if (op
.extent
.truncate_seq
> seq
) {
5611 // write arrives before trimtrunc
5612 if (obs
.exists
&& !oi
.is_whiteout()) {
5613 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
5614 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
5615 t
->truncate(soid
, op
.extent
.truncate_size
);
5616 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5617 oi
.truncate_size
= op
.extent
.truncate_size
;
5618 if (op
.extent
.truncate_size
!= oi
.size
) {
5619 ctx
->delta_stats
.num_bytes
-= oi
.size
;
5620 ctx
->delta_stats
.num_bytes
+= op
.extent
.truncate_size
;
5621 oi
.size
= op
.extent
.truncate_size
;
5624 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
5625 << ", but object is new" << dendl
;
5626 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5627 oi
.truncate_size
= op
.extent
.truncate_size
;
5630 result
= check_offset_and_length(op
.extent
.offset
, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5634 maybe_create_new_object(ctx
);
5636 if (op
.extent
.length
== 0) {
5637 if (op
.extent
.offset
> oi
.size
) {
5639 soid
, op
.extent
.offset
);
5645 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
5648 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
)
5649 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
5650 else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest())
5651 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
5653 obs
.oi
.clear_data_digest();
5654 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
5655 op
.extent
.offset
, op
.extent
.length
);
5660 case CEPH_OSD_OP_WRITEFULL
:
5662 { // write full object
5663 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
5665 if (op
.extent
.length
!= osd_op
.indata
.length()) {
5669 result
= check_offset_and_length(0, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5673 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
5674 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
5676 maybe_create_new_object(ctx
);
5677 if (pool
.info
.require_rollback()) {
5678 t
->truncate(soid
, 0);
5679 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
5680 t
->truncate(soid
, op
.extent
.length
);
5682 if (op
.extent
.length
) {
5683 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
5685 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
5687 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
5688 0, op
.extent
.length
, true);
5692 case CEPH_OSD_OP_WRITESAME
:
5694 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
5695 result
= do_writesame(ctx
, osd_op
);
5698 case CEPH_OSD_OP_ROLLBACK
:
5700 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5701 result
= _rollback_to(ctx
, op
);
5704 case CEPH_OSD_OP_ZERO
:
5705 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5706 if (pool
.info
.requires_aligned_append()) {
5707 result
= -EOPNOTSUPP
;
5712 result
= check_offset_and_length(op
.extent
.offset
, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5715 assert(op
.extent
.length
);
5716 if (obs
.exists
&& !oi
.is_whiteout()) {
5717 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
5718 interval_set
<uint64_t> ch
;
5719 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
5720 ctx
->modified_ranges
.union_of(ch
);
5721 ctx
->delta_stats
.num_wr
++;
5722 oi
.clear_data_digest();
5728 case CEPH_OSD_OP_CREATE
:
5731 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5732 int flags
= le32_to_cpu(op
.flags
);
5733 if (obs
.exists
&& !oi
.is_whiteout() &&
5734 (flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
5735 result
= -EEXIST
; /* this is an exclusive create */
5737 if (osd_op
.indata
.length()) {
5738 bufferlist::iterator p
= osd_op
.indata
.begin();
5741 ::decode(category
, p
);
5743 catch (buffer::error
& e
) {
5747 // category is no longer implemented.
5750 maybe_create_new_object(ctx
);
5757 case CEPH_OSD_OP_TRIMTRUNC
:
5758 op
.extent
.offset
= op
.extent
.truncate_size
;
5761 case CEPH_OSD_OP_TRUNCATE
:
5762 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
5763 if (pool
.info
.requires_aligned_append()) {
5764 result
= -EOPNOTSUPP
;
5770 if (!obs
.exists
|| oi
.is_whiteout()) {
5771 dout(10) << " object dne, truncate is a no-op" << dendl
;
5775 if (op
.extent
.offset
> cct
->_conf
->osd_max_object_size
) {
5780 if (op
.extent
.truncate_seq
) {
5781 assert(op
.extent
.offset
== op
.extent
.truncate_size
);
5782 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
5783 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
5784 << ", no-op" << dendl
;
5787 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
5788 << ", truncating" << dendl
;
5789 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5790 oi
.truncate_size
= op
.extent
.truncate_size
;
5793 maybe_create_new_object(ctx
);
5794 t
->truncate(soid
, op
.extent
.offset
);
5795 if (oi
.size
> op
.extent
.offset
) {
5796 interval_set
<uint64_t> trim
;
5797 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
5798 ctx
->modified_ranges
.union_of(trim
);
5800 if (op
.extent
.offset
!= oi
.size
) {
5801 ctx
->delta_stats
.num_bytes
-= oi
.size
;
5802 ctx
->delta_stats
.num_bytes
+= op
.extent
.offset
;
5803 oi
.size
= op
.extent
.offset
;
5805 ctx
->delta_stats
.num_wr
++;
5806 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
5808 oi
.clear_data_digest();
5812 case CEPH_OSD_OP_DELETE
:
5814 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5816 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
5820 case CEPH_OSD_OP_WATCH
:
5823 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
5824 op
.watch
.cookie
, op
.watch
.op
);
5829 uint64_t cookie
= op
.watch
.cookie
;
5830 entity_name_t entity
= ctx
->reqid
.name
;
5831 ObjectContextRef obc
= ctx
->obc
;
5833 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
5834 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
5835 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
5836 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
5837 dout(10) << "watch: peer_addr="
5838 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
5840 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
5841 if (op
.watch
.timeout
!= 0) {
5842 timeout
= op
.watch
.timeout
;
5845 watch_info_t
w(cookie
, timeout
,
5846 ctx
->op
->get_req()->get_connection()->get_peer_addr());
5847 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
5848 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
5849 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
5850 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
5852 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
5853 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
5854 t
->nop(soid
); // make sure update the object_info on disk!
5856 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
5857 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
5858 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
5859 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
5863 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
5864 ctx
->watch_connects
.push_back(make_pair(w
, true));
5865 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
5866 /* Note: WATCH with PING doesn't cause may_write() to return true,
5867 * so if there is nothing else in the transaction, this is going
5868 * to run do_osd_op_effects, but not write out a log entry */
5869 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
5873 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
5874 obc
->watchers
.find(make_pair(cookie
, entity
));
5875 if (p
== obc
->watchers
.end() ||
5876 !p
->second
->is_connected()) {
5877 // client needs to reconnect
5878 result
= -ETIMEDOUT
;
5881 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
5882 p
->second
->got_ping(ceph_clock_now());
5884 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
5885 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
5886 oi
.watchers
.find(make_pair(cookie
, entity
));
5887 if (oi_iter
!= oi
.watchers
.end()) {
5888 dout(10) << " removed watch " << oi_iter
->second
<< " by "
5890 oi
.watchers
.erase(oi_iter
);
5891 t
->nop(soid
); // update oi on disk
5892 ctx
->watch_disconnects
.push_back(
5893 watch_disconnect_t(cookie
, entity
, false));
5895 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
5901 case CEPH_OSD_OP_CACHE_PIN
:
5902 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5903 if ((!pool
.info
.is_tier() ||
5904 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
5906 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
5911 if (!obs
.exists
|| oi
.is_whiteout()) {
5916 if (!oi
.is_cache_pinned()) {
5917 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
5919 ctx
->delta_stats
.num_objects_pinned
++;
5920 ctx
->delta_stats
.num_wr
++;
5926 case CEPH_OSD_OP_CACHE_UNPIN
:
5927 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5928 if ((!pool
.info
.is_tier() ||
5929 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
5931 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
5936 if (!obs
.exists
|| oi
.is_whiteout()) {
5941 if (oi
.is_cache_pinned()) {
5942 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
5944 ctx
->delta_stats
.num_objects_pinned
--;
5945 ctx
->delta_stats
.num_wr
++;
5951 case CEPH_OSD_OP_SET_REDIRECT
:
5954 if (pool
.info
.is_tier()) {
5962 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
5963 result
= -EOPNOTSUPP
;
5967 object_t target_name
;
5968 object_locator_t target_oloc
;
5969 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
5970 version_t target_version
= op
.copy_from
.src_version
;
5972 ::decode(target_name
, bp
);
5973 ::decode(target_oloc
, bp
);
5975 catch (buffer::error
& e
) {
5980 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
5981 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
5982 raw_pg
.ps(), raw_pg
.pool(),
5983 target_oloc
.nspace
);
5984 if (target
== soid
) {
5985 dout(20) << " set-redirect self is invalid" << dendl
;
5989 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
5990 oi
.manifest
.redirect_target
= target
;
5991 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
5992 t
->truncate(soid
, 0);
5993 if (oi
.is_omap() && pool
.info
.supports_omap()) {
5994 t
->omap_clear(soid
);
5995 obs
.oi
.clear_omap_digest();
5996 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
5998 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6001 oi
.user_version
= target_version
;
6002 ctx
->user_at_version
= target_version
;
6004 map
<string
,bufferlist
> rmattrs
;
6005 result
= getattrs_maybe_cache(ctx
->obc
,
6011 map
<string
, bufferlist
>::iterator iter
;
6012 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
6013 const string
& name
= iter
->first
;
6014 t
->rmattr(soid
, name
);
6016 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
6021 // -- object attrs --
6023 case CEPH_OSD_OP_SETXATTR
:
6026 if (cct
->_conf
->osd_max_attr_size
> 0 &&
6027 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
6028 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6032 unsigned max_name_len
= MIN(osd
->store
->get_max_attr_name_length(),
6033 cct
->_conf
->osd_max_attr_name_len
);
6034 if (op
.xattr
.name_len
> max_name_len
) {
6035 result
= -ENAMETOOLONG
;
6038 maybe_create_new_object(ctx
);
6040 bp
.copy(op
.xattr
.name_len
, aname
);
6041 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6042 string name
= "_" + aname
;
6044 bp
.copy(op
.xattr
.value_len
, bl
);
6045 t
->setattr(soid
, name
, bl
);
6046 ctx
->delta_stats
.num_wr
++;
6050 case CEPH_OSD_OP_RMXATTR
:
6054 bp
.copy(op
.xattr
.name_len
, aname
);
6055 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6056 if (!obs
.exists
|| oi
.is_whiteout()) {
6060 string name
= "_" + aname
;
6061 t
->rmattr(soid
, name
);
6062 ctx
->delta_stats
.num_wr
++;
6067 // -- fancy writers --
6068 case CEPH_OSD_OP_APPEND
:
6070 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6071 // just do it inline; this works because we are happy to execute
6072 // fancy op on replicas as well.
6073 vector
<OSDOp
> nops(1);
6074 OSDOp
& newop
= nops
[0];
6075 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
6076 newop
.op
.extent
.offset
= oi
.size
;
6077 newop
.op
.extent
.length
= op
.extent
.length
;
6078 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
6079 newop
.indata
= osd_op
.indata
;
6080 result
= do_osd_ops(ctx
, nops
);
6081 osd_op
.outdata
.claim(newop
.outdata
);
6085 case CEPH_OSD_OP_STARTSYNC
:
6086 tracepoint(osd
, do_osd_op_pre_startsync
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6091 // -- trivial map --
6092 case CEPH_OSD_OP_TMAPGET
:
6093 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6094 if (pool
.info
.require_rollback()) {
6095 result
= -EOPNOTSUPP
;
6099 vector
<OSDOp
> nops(1);
6100 OSDOp
& newop
= nops
[0];
6101 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
6102 newop
.op
.extent
.offset
= 0;
6103 newop
.op
.extent
.length
= 0;
6104 do_osd_ops(ctx
, nops
);
6105 osd_op
.outdata
.claim(newop
.outdata
);
6109 case CEPH_OSD_OP_TMAPPUT
:
6110 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6111 if (pool
.info
.require_rollback()) {
6112 result
= -EOPNOTSUPP
;
6116 //_dout_lock.Lock();
6117 //osd_op.data.hexdump(*_dout);
6118 //_dout_lock.Unlock();
6120 // verify sort order
6121 bool unsorted
= false;
6124 ::decode(header
, bp
);
6131 dout(10) << "tmapput key " << key
<< dendl
;
6134 if (key
< last_key
) {
6135 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
6144 vector
<OSDOp
> nops(1);
6145 OSDOp
& newop
= nops
[0];
6146 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
6147 newop
.op
.extent
.offset
= 0;
6148 newop
.op
.extent
.length
= osd_op
.indata
.length();
6149 newop
.indata
= osd_op
.indata
;
6152 bp
= osd_op
.indata
.begin();
6154 map
<string
, bufferlist
> m
;
6155 ::decode(header
, bp
);
6159 ::encode(header
, newbl
);
6161 newop
.indata
= newbl
;
6163 result
= do_osd_ops(ctx
, nops
);
6164 assert(result
== 0);
6168 case CEPH_OSD_OP_TMAPUP
:
6169 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6170 if (pool
.info
.require_rollback()) {
6171 result
= -EOPNOTSUPP
;
6175 result
= do_tmapup(ctx
, bp
, osd_op
);
6178 case CEPH_OSD_OP_TMAP2OMAP
:
6180 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6181 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
6185 case CEPH_OSD_OP_OMAPGETKEYS
:
6189 uint64_t max_return
;
6191 ::decode(start_after
, bp
);
6192 ::decode(max_return
, bp
);
6194 catch (buffer::error
& e
) {
6196 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
6199 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
6200 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
6202 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
6206 bool truncated
= false;
6208 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
6209 coll
, ghobject_t(soid
)
6212 iter
->upper_bound(start_after
);
6213 for (num
= 0; iter
->valid(); ++num
, iter
->next(false)) {
6214 if (num
>= max_return
||
6215 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
6219 ::encode(iter
->key(), bl
);
6221 } // else return empty out_set
6222 ::encode(num
, osd_op
.outdata
);
6223 osd_op
.outdata
.claim_append(bl
);
6224 ::encode(truncated
, osd_op
.outdata
);
6225 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6226 ctx
->delta_stats
.num_rd
++;
6230 case CEPH_OSD_OP_OMAPGETVALS
:
6234 uint64_t max_return
;
6235 string filter_prefix
;
6237 ::decode(start_after
, bp
);
6238 ::decode(max_return
, bp
);
6239 ::decode(filter_prefix
, bp
);
6241 catch (buffer::error
& e
) {
6243 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
6246 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
6247 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
6249 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
6252 bool truncated
= false;
6255 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
6256 coll
, ghobject_t(soid
)
6262 iter
->upper_bound(start_after
);
6263 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
6266 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
6267 ++num
, iter
->next(false)) {
6268 dout(20) << "Found key " << iter
->key() << dendl
;
6269 if (num
>= max_return
||
6270 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
6274 ::encode(iter
->key(), bl
);
6275 ::encode(iter
->value(), bl
);
6277 } // else return empty out_set
6278 ::encode(num
, osd_op
.outdata
);
6279 osd_op
.outdata
.claim_append(bl
);
6280 ::encode(truncated
, osd_op
.outdata
);
6281 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6282 ctx
->delta_stats
.num_rd
++;
6286 case CEPH_OSD_OP_OMAPGETHEADER
:
6287 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6288 if (!oi
.is_omap()) {
6289 // return empty header
6294 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
6295 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6296 ctx
->delta_stats
.num_rd
++;
6300 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
6303 set
<string
> keys_to_get
;
6305 ::decode(keys_to_get
, bp
);
6307 catch (buffer::error
& e
) {
6309 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6312 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
6313 map
<string
, bufferlist
> out
;
6315 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
6316 } // else return empty omap entries
6317 ::encode(out
, osd_op
.outdata
);
6318 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6319 ctx
->delta_stats
.num_rd
++;
6323 case CEPH_OSD_OP_OMAP_CMP
:
6326 if (!obs
.exists
|| oi
.is_whiteout()) {
6328 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6331 map
<string
, pair
<bufferlist
, int> > assertions
;
6333 ::decode(assertions
, bp
);
6335 catch (buffer::error
& e
) {
6337 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6340 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
6342 map
<string
, bufferlist
> out
;
6346 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
6347 i
!= assertions
.end();
6349 to_get
.insert(i
->first
);
6350 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
6356 } // else leave out empty
6358 //Should set num_rd_kb based on encode length of map
6359 ctx
->delta_stats
.num_rd
++;
6363 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
6364 i
!= assertions
.end();
6366 auto out_entry
= out
.find(i
->first
);
6367 bufferlist
&bl
= (out_entry
!= out
.end()) ?
6368 out_entry
->second
: empty
;
6369 switch (i
->second
.second
) {
6370 case CEPH_OSD_CMPXATTR_OP_EQ
:
6371 if (!(bl
== i
->second
.first
)) {
6375 case CEPH_OSD_CMPXATTR_OP_LT
:
6376 if (!(bl
< i
->second
.first
)) {
6380 case CEPH_OSD_CMPXATTR_OP_GT
:
6381 if (!(bl
> i
->second
.first
)) {
6399 case CEPH_OSD_OP_OMAPSETVALS
:
6400 if (!pool
.info
.supports_omap()) {
6401 result
= -EOPNOTSUPP
;
6402 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6407 maybe_create_new_object(ctx
);
6408 bufferlist to_set_bl
;
6410 decode_str_str_map_to_bl(bp
, &to_set_bl
);
6412 catch (buffer::error
& e
) {
6414 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6417 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6418 if (cct
->_conf
->subsys
.should_gather(dout_subsys
, 20)) {
6419 dout(20) << "setting vals: " << dendl
;
6420 map
<string
,bufferlist
> to_set
;
6421 bufferlist::iterator pt
= to_set_bl
.begin();
6422 ::decode(to_set
, pt
);
6423 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
6426 dout(20) << "\t" << i
->first
<< dendl
;
6429 t
->omap_setkeys(soid
, to_set_bl
);
6430 ctx
->delta_stats
.num_wr
++;
6432 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6433 obs
.oi
.clear_omap_digest();
6436 case CEPH_OSD_OP_OMAPSETHEADER
:
6437 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6438 if (!pool
.info
.supports_omap()) {
6439 result
= -EOPNOTSUPP
;
6444 maybe_create_new_object(ctx
);
6445 t
->omap_setheader(soid
, osd_op
.indata
);
6446 ctx
->delta_stats
.num_wr
++;
6448 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6449 obs
.oi
.clear_omap_digest();
6452 case CEPH_OSD_OP_OMAPCLEAR
:
6453 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6454 if (!pool
.info
.supports_omap()) {
6455 result
= -EOPNOTSUPP
;
6460 if (!obs
.exists
|| oi
.is_whiteout()) {
6465 t
->omap_clear(soid
);
6466 ctx
->delta_stats
.num_wr
++;
6467 obs
.oi
.clear_omap_digest();
6468 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6473 case CEPH_OSD_OP_OMAPRMKEYS
:
6474 if (!pool
.info
.supports_omap()) {
6475 result
= -EOPNOTSUPP
;
6476 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6481 if (!obs
.exists
|| oi
.is_whiteout()) {
6483 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6486 bufferlist to_rm_bl
;
6488 decode_str_set_to_bl(bp
, &to_rm_bl
);
6490 catch (buffer::error
& e
) {
6492 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6495 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6496 t
->omap_rmkeys(soid
, to_rm_bl
);
6497 ctx
->delta_stats
.num_wr
++;
6499 obs
.oi
.clear_omap_digest();
6502 case CEPH_OSD_OP_COPY_GET
:
6504 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6505 result
= fill_in_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
6508 case CEPH_OSD_OP_COPY_FROM
:
6512 object_locator_t src_oloc
;
6513 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
6514 version_t src_version
= op
.copy_from
.src_version
;
6516 ::decode(src_name
, bp
);
6517 ::decode(src_oloc
, bp
);
6519 catch (buffer::error
& e
) {
6522 do_osd_op_pre_copy_from
,
6523 soid
.oid
.name
.c_str(),
6535 do_osd_op_pre_copy_from
,
6536 soid
.oid
.name
.c_str(),
6538 src_name
.name
.c_str(),
6540 src_oloc
.key
.c_str(),
6541 src_oloc
.nspace
.c_str(),
6545 if (!ctx
->copy_cb
) {
6548 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
6549 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
6550 raw_pg
.ps(), raw_pg
.pool(),
6553 dout(20) << " copy from self is invalid" << dendl
;
6557 CopyFromCallback
*cb
= new CopyFromCallback(ctx
);
6559 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
6562 op
.copy_from
.src_fadvise_flags
,
6564 result
= -EINPROGRESS
;
6567 assert(ctx
->copy_cb
->get_result() >= 0);
6568 finish_copyfrom(ctx
);
6575 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
6576 dout(1) << "unrecognized osd op " << op
.op
6577 << " " << ceph_osd_op_name(op
.op
)
6579 result
= -EOPNOTSUPP
;
6583 osd_op
.rval
= result
;
6584 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
6585 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
))
6594 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
6596 if (ctx
->new_obs
.oi
.size
== 0) {
6597 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
6600 vector
<OSDOp
> nops(1);
6601 OSDOp
&newop
= nops
[0];
6602 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
6603 do_osd_ops(ctx
, nops
);
6605 bufferlist::iterator i
= newop
.outdata
.begin();
6606 ::decode(*header
, i
);
6607 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
6609 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
6613 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
6618 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
6621 // verify that all clones have been evicted
6622 dout(20) << __func__
<< " verifying clones are absent "
6624 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
6625 p
!= ss
.clones
.end();
6627 hobject_t clone_oid
= soid
;
6628 clone_oid
.snap
= *p
;
6629 if (is_missing_object(clone_oid
))
6631 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
6632 if (clone_obc
&& clone_obc
->obs
.exists
) {
6633 dout(10) << __func__
<< " cannot evict head before clone "
6634 << clone_oid
<< dendl
;
6637 if (copy_ops
.count(clone_oid
)) {
6638 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
6639 << clone_oid
<< dendl
;
6646 inline int PrimaryLogPG::_delete_oid(
6648 bool no_whiteout
, // no whiteouts, no matter what.
6649 bool try_no_whiteout
) // try not to whiteout
6651 SnapSet
& snapset
= ctx
->new_snapset
;
6652 ObjectState
& obs
= ctx
->new_obs
;
6653 object_info_t
& oi
= obs
.oi
;
6654 const hobject_t
& soid
= oi
.soid
;
6655 PGTransaction
* t
= ctx
->op_t
.get();
6657 // cache: cache: set whiteout on delete?
6658 bool whiteout
= false;
6659 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
6661 && !try_no_whiteout
) {
6665 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
6667 // in luminous or later, we can't delete the head if there are
6668 // clones. we trust the caller passing no_whiteout has already
6669 // verified they don't exist.
6670 if (!snapset
.clones
.empty() ||
6671 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
6673 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
6676 dout(20) << __func__
<< " has or will have clones; will whiteout"
6684 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
6685 << " no_whiteout=" << (int)no_whiteout
6686 << " try_no_whiteout=" << (int)try_no_whiteout
6688 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
6694 interval_set
<uint64_t> ch
;
6695 ch
.insert(0, oi
.size
);
6696 ctx
->modified_ranges
.union_of(ch
);
6699 ctx
->delta_stats
.num_wr
++;
6700 if (soid
.is_snap()) {
6701 assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
6702 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
6704 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6709 // disconnect all watchers
6710 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
6711 oi
.watchers
.begin();
6712 p
!= oi
.watchers
.end();
6714 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
6715 ctx
->watch_disconnects
.push_back(
6716 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
6718 oi
.watchers
.clear();
6721 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
6722 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
6723 ctx
->delta_stats
.num_whiteouts
++;
6725 osd
->logger
->inc(l_osd_tier_whiteout
);
6730 ctx
->delta_stats
.num_objects
--;
6732 ctx
->delta_stats
.num_object_clones
--;
6733 if (oi
.is_whiteout()) {
6734 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
6735 ctx
->delta_stats
.num_whiteouts
--;
6736 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
6738 if (oi
.is_cache_pinned()) {
6739 ctx
->delta_stats
.num_objects_pinned
--;
6741 if ((legacy
|| snapset
.is_legacy()) && soid
.is_head()) {
6742 snapset
.head_exists
= false;
6748 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, ceph_osd_op
& op
)
6750 SnapSet
& snapset
= ctx
->new_snapset
;
6751 ObjectState
& obs
= ctx
->new_obs
;
6752 object_info_t
& oi
= obs
.oi
;
6753 const hobject_t
& soid
= oi
.soid
;
6754 PGTransaction
* t
= ctx
->op_t
.get();
6755 snapid_t snapid
= (uint64_t)op
.snap
.snapid
;
6756 hobject_t missing_oid
;
6758 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
6760 ObjectContextRef rollback_to
;
6761 int ret
= find_object_context(
6762 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
6763 soid
.get_namespace()),
6764 &rollback_to
, false, false, &missing_oid
);
6765 if (ret
== -EAGAIN
) {
6766 /* clone must be missing */
6767 assert(is_missing_object(missing_oid
));
6768 dout(20) << "_rollback_to attempted to roll back to a missing object "
6769 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
6770 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
6774 ObjectContextRef promote_obc
;
6775 cache_result_t tier_mode_result
;
6776 if (obs
.exists
&& obs
.oi
.has_manifest()) {
6778 maybe_handle_manifest_detail(
6784 maybe_handle_cache_detail(
6794 switch (tier_mode_result
) {
6795 case cache_result_t::NOOP
:
6797 case cache_result_t::BLOCKED_PROMOTE
:
6798 assert(promote_obc
);
6799 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
6801 case cache_result_t::BLOCKED_FULL
:
6802 block_write_on_full_cache(soid
, ctx
->op
);
6805 assert(0 == "must promote was set, other values are not valid");
6810 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
6811 // there's no snapshot here, or there's no object.
6812 // if there's no snapshot, we delete the object; otherwise, do nothing.
6813 dout(20) << "_rollback_to deleting head on " << soid
.oid
6814 << " because got ENOENT|whiteout on find_object_context" << dendl
;
6815 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
6816 // Cannot delete an object with watchers
6819 _delete_oid(ctx
, false, false);
6823 // ummm....huh? It *can't* return anything else at time of writing.
6824 assert(0 == "unexpected error code in _rollback_to");
6825 } else { //we got our context, let's use it to do the rollback!
6826 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
6827 if (is_degraded_or_backfilling_object(rollback_to_sobject
)) {
6828 dout(20) << "_rollback_to attempted to roll back to a degraded object "
6829 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
6830 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
6832 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
6833 // rolling back to the head; we just need to clone it.
6836 /* 1) Delete current head
6837 * 2) Clone correct snapshot into head
6838 * 3) Calculate clone_overlaps by following overlaps
6839 * forward from rollback snapshot */
6840 dout(10) << "_rollback_to deleting " << soid
.oid
6841 << " and rolling back to old snap" << dendl
;
6846 t
->clone(soid
, rollback_to_sobject
);
6847 snapset
.head_exists
= true;
6848 t
->add_obc(rollback_to
);
6850 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
6851 snapset
.clone_overlap
.lower_bound(snapid
);
6852 interval_set
<uint64_t> overlaps
= iter
->second
;
6853 assert(iter
!= snapset
.clone_overlap
.end());
6855 iter
!= snapset
.clone_overlap
.end();
6857 overlaps
.intersection_of(iter
->second
);
6859 if (obs
.oi
.size
> 0) {
6860 interval_set
<uint64_t> modified
;
6861 modified
.insert(0, obs
.oi
.size
);
6862 overlaps
.intersection_of(modified
);
6863 modified
.subtract(overlaps
);
6864 ctx
->modified_ranges
.union_of(modified
);
6867 // Adjust the cached objectcontext
6868 maybe_create_new_object(ctx
, true);
6869 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
6870 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
6871 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
6872 if (rollback_to
->obs
.oi
.is_data_digest())
6873 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
6875 obs
.oi
.clear_data_digest();
6876 if (rollback_to
->obs
.oi
.is_omap_digest())
6877 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
6879 obs
.oi
.clear_omap_digest();
6881 if (rollback_to
->obs
.oi
.is_omap()) {
6882 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
6883 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6885 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
6886 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6889 snapset
.head_exists
= true;
6895 void PrimaryLogPG::_make_clone(
6898 ObjectContextRef obc
,
6899 const hobject_t
& head
, const hobject_t
& coid
,
6903 ::encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
6905 t
->clone(coid
, head
);
6906 setattr_maybe_cache(obc
, ctx
, t
, OI_ATTR
, bv
);
6907 rmattr_maybe_cache(obc
, ctx
, t
, SS_ATTR
);
6910 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
6912 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
6913 SnapContext
& snapc
= ctx
->snapc
;
6916 assert(soid
.snap
== CEPH_NOSNAP
);
6917 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
6918 << " snapc=" << snapc
<< dendl
;
6920 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
6921 if (ctx
->new_obs
.exists
) {
6922 // we will mark the object dirty
6923 if (ctx
->undirty
&& was_dirty
) {
6924 dout(20) << " clearing DIRTY flag" << dendl
;
6925 assert(ctx
->new_obs
.oi
.is_dirty());
6926 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
6927 --ctx
->delta_stats
.num_objects_dirty
;
6928 osd
->logger
->inc(l_osd_tier_clean
);
6929 } else if (!was_dirty
&& !ctx
->undirty
) {
6930 dout(20) << " setting DIRTY flag" << dendl
;
6931 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
6932 ++ctx
->delta_stats
.num_objects_dirty
;
6933 osd
->logger
->inc(l_osd_tier_dirty
);
6937 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
6938 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
6939 --ctx
->delta_stats
.num_objects_dirty
;
6943 if ((ctx
->new_obs
.exists
&&
6944 ctx
->new_obs
.oi
.is_omap()) &&
6945 (!ctx
->obc
->obs
.exists
||
6946 !ctx
->obc
->obs
.oi
.is_omap())) {
6947 ++ctx
->delta_stats
.num_objects_omap
;
6949 if ((!ctx
->new_obs
.exists
||
6950 !ctx
->new_obs
.oi
.is_omap()) &&
6951 (ctx
->obc
->obs
.exists
&&
6952 ctx
->obc
->obs
.oi
.is_omap())) {
6953 --ctx
->delta_stats
.num_objects_omap
;
6957 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
6958 snapc
.seq
= ctx
->new_snapset
.seq
;
6959 snapc
.snaps
= ctx
->new_snapset
.snaps
;
6960 filter_snapc(snapc
.snaps
);
6961 dout(10) << " using newer snapc " << snapc
<< dendl
;
6964 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
6965 snapc
.snaps
.size() && // there are snaps
6966 !ctx
->cache_evict
&&
6967 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
6969 hobject_t coid
= soid
;
6970 coid
.snap
= snapc
.seq
;
6973 for (l
=1; l
<snapc
.snaps
.size() && snapc
.snaps
[l
] > ctx
->new_snapset
.seq
; l
++) ;
6975 vector
<snapid_t
> snaps(l
);
6976 for (unsigned i
=0; i
<l
; i
++)
6977 snaps
[i
] = snapc
.snaps
[i
];
6980 object_info_t
static_snap_oi(coid
);
6981 object_info_t
*snap_oi
;
6983 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
6984 ctx
->clone_obc
->destructor_callback
= new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
6985 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
6986 ctx
->clone_obc
->obs
.exists
= true;
6987 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
6988 ctx
->clone_obc
->ssc
->ref
++;
6989 if (pool
.info
.require_rollback())
6990 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
6991 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
6992 bool got
= ctx
->lock_manager
.get_write_greedy(
6997 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
6999 snap_oi
= &static_snap_oi
;
7001 snap_oi
->version
= ctx
->at_version
;
7002 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
7003 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
7005 bool legacy
= ctx
->new_snapset
.is_legacy() ||
7006 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
7008 snap_oi
->legacy_snaps
= snaps
;
7011 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
7013 ctx
->delta_stats
.num_objects
++;
7014 if (snap_oi
->is_dirty()) {
7015 ctx
->delta_stats
.num_objects_dirty
++;
7016 osd
->logger
->inc(l_osd_tier_dirty
);
7018 if (snap_oi
->is_omap())
7019 ctx
->delta_stats
.num_objects_omap
++;
7020 if (snap_oi
->is_cache_pinned())
7021 ctx
->delta_stats
.num_objects_pinned
++;
7022 ctx
->delta_stats
.num_object_clones
++;
7023 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
7024 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
7026 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
7029 // clone_overlap should contain an entry for each clone
7030 // (an empty interval_set if there is no overlap)
7031 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
7032 if (ctx
->obs
->oi
.size
)
7033 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
7036 dout(10) << " cloning v " << ctx
->obs
->oi
.version
7037 << " to " << coid
<< " v " << ctx
->at_version
7038 << " snaps=" << snaps
7039 << " snapset=" << ctx
->new_snapset
<< dendl
;
7040 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
7041 ctx
->obs
->oi
.version
,
7042 ctx
->obs
->oi
.user_version
,
7043 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
7044 ::encode(snaps
, ctx
->log
.back().snaps
);
7046 ctx
->at_version
.version
++;
7049 // update most recent clone_overlap and usage stats
7050 if (ctx
->new_snapset
.clones
.size() > 0) {
7051 /* we need to check whether the most recent clone exists, if it's been evicted,
7052 * it's not included in the stats */
7053 hobject_t last_clone_oid
= soid
;
7054 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
7055 if (is_present_clone(last_clone_oid
)) {
7056 interval_set
<uint64_t> &newest_overlap
= ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
7057 ctx
->modified_ranges
.intersection_of(newest_overlap
);
7058 // modified_ranges is still in use by the clone
7059 add_interval_usage(ctx
->modified_ranges
, ctx
->delta_stats
);
7060 newest_overlap
.subtract(ctx
->modified_ranges
);
7064 // update snapset with latest snap context
7065 ctx
->new_snapset
.seq
= snapc
.seq
;
7066 ctx
->new_snapset
.snaps
= snapc
.snaps
;
7067 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7068 // pessimistic assumption that this is a net-new legacy SnapSet
7069 ctx
->delta_stats
.num_legacy_snapsets
++;
7070 ctx
->new_snapset
.head_exists
= ctx
->new_obs
.exists
;
7071 } else if (ctx
->new_snapset
.is_legacy()) {
7072 ctx
->new_snapset
.head_exists
= ctx
->new_obs
.exists
;
7074 dout(20) << "make_writeable " << soid
7075 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
7079 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
7080 interval_set
<uint64_t>& modified
, uint64_t offset
,
7081 uint64_t length
, bool write_full
)
7083 interval_set
<uint64_t> ch
;
7086 ch
.insert(0, oi
.size
);
7088 ch
.insert(offset
, length
);
7089 modified
.union_of(ch
);
7090 if (write_full
|| offset
+ length
> oi
.size
) {
7091 uint64_t new_size
= offset
+ length
;
7092 delta_stats
.num_bytes
-= oi
.size
;
7093 delta_stats
.num_bytes
+= new_size
;
7096 delta_stats
.num_wr
++;
7097 delta_stats
.num_wr_kb
+= SHIFT_ROUND_UP(length
, 10);
7100 void PrimaryLogPG::add_interval_usage(interval_set
<uint64_t>& s
, object_stat_sum_t
& delta_stats
)
7102 for (interval_set
<uint64_t>::const_iterator p
= s
.begin(); p
!= s
.end(); ++p
) {
7103 delta_stats
.num_bytes
+= p
.get_len();
7107 void PrimaryLogPG::complete_disconnect_watches(
7108 ObjectContextRef obc
,
7109 const list
<watch_disconnect_t
> &to_disconnect
)
7111 for (list
<watch_disconnect_t
>::const_iterator i
=
7112 to_disconnect
.begin();
7113 i
!= to_disconnect
.end();
7115 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
7116 auto watchers_entry
= obc
->watchers
.find(watcher
);
7117 if (watchers_entry
!= obc
->watchers
.end()) {
7118 WatchRef watch
= watchers_entry
->second
;
7119 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
7120 obc
->watchers
.erase(watcher
);
7121 watch
->remove(i
->send_disconnect
);
7123 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7124 << watcher
<< dendl
;
7129 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
7131 entity_name_t entity
= ctx
->reqid
.name
;
7132 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
7134 // disconnects first
7135 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
7139 boost::intrusive_ptr
<Session
> session((Session
*)conn
->get_priv());
7142 session
->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7144 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
7145 i
!= ctx
->watch_connects
.end();
7147 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
7148 dout(15) << "do_osd_op_effects applying watch connect on session "
7149 << session
.get() << " watcher " << watcher
<< dendl
;
7151 if (ctx
->obc
->watchers
.count(watcher
)) {
7152 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7154 watch
= ctx
->obc
->watchers
[watcher
];
7156 dout(15) << "do_osd_op_effects new watcher " << watcher
7158 watch
= Watch::makeWatchRef(
7159 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
7160 i
->first
.cookie
, entity
, conn
->get_peer_addr());
7161 ctx
->obc
->watchers
.insert(
7166 watch
->connect(conn
, i
->second
);
7169 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
7170 p
!= ctx
->notifies
.end();
7172 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
7173 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
7175 Notify::makeNotifyRef(
7177 ctx
->reqid
.name
.num(),
7182 ctx
->obc
->obs
.oi
.user_version
,
7184 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
7185 ctx
->obc
->watchers
.begin();
7186 i
!= ctx
->obc
->watchers
.end();
7188 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
7189 i
->second
->start_notify(notif
);
7194 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
7195 p
!= ctx
->notify_acks
.end();
7197 if (p
->watch_cookie
)
7198 dout(10) << "notify_ack " << make_pair(p
->watch_cookie
.get(), p
->notify_id
) << dendl
;
7200 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
7201 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
7202 ctx
->obc
->watchers
.begin();
7203 i
!= ctx
->obc
->watchers
.end();
7205 if (i
->first
.second
!= entity
) continue;
7206 if (p
->watch_cookie
&&
7207 p
->watch_cookie
.get() != i
->first
.first
) continue;
7208 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
7209 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
7214 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
7217 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
7218 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
7219 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
7220 dout(20) << __func__
<< " " << hoid
<< dendl
;
7224 hobject_t
PrimaryLogPG::get_temp_recovery_object(
7225 const hobject_t
& target
,
7229 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
7231 << "_" << info
.history
.same_interval_since
7232 << "_" << target
.snap
;
7233 // pgid + version + interval + snapid is unique, and short
7234 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
7235 dout(20) << __func__
<< " " << hoid
<< dendl
;
7239 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
7241 assert(!ctx
->ops
.empty());
7243 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7245 // valid snap context?
7246 if (!ctx
->snapc
.is_valid()) {
7247 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
7251 // prepare the actual mutation
7252 int result
= do_osd_ops(ctx
, ctx
->ops
);
7254 if (ctx
->op
->may_write() &&
7255 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
7256 // need to save the error code in the pg log, to detect dup ops,
7257 // but do nothing else
7258 ctx
->update_log_only
= true;
7263 // read-op? write-op noop? done?
7264 if (ctx
->op_t
->empty() && !ctx
->modify
) {
7265 unstable_stats
.add(ctx
->delta_stats
);
7266 if (ctx
->op
->may_write() &&
7267 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
7268 ctx
->update_log_only
= true;
7274 if ((ctx
->delta_stats
.num_bytes
> 0 ||
7275 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
7276 (pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ||
7277 get_osdmap()->test_flag(CEPH_OSDMAP_FULL
))) {
7278 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
7279 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
7280 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
7281 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
7283 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
7284 // they tried, they failed.
7285 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
7286 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ? -EDQUOT
: -ENOSPC
;
7289 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
7294 // clone, if necessary
7295 if (soid
.snap
== CEPH_NOSNAP
)
7296 make_writeable(ctx
);
7299 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
7300 pg_log_entry_t::DELETE
);
7305 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, bool maintain_ssc
)
7307 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7308 dout(20) << __func__
<< " " << soid
<< " " << ctx
7309 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
7311 utime_t now
= ceph_clock_now();
7316 if (soid
.snap
== CEPH_NOSNAP
&& maintain_ssc
) {
7317 ::encode(ctx
->new_snapset
, bss
);
7318 assert(ctx
->new_obs
.exists
== ctx
->new_snapset
.head_exists
||
7319 !ctx
->new_snapset
.is_legacy());
7321 if (ctx
->new_obs
.exists
) {
7322 if (!ctx
->obs
->exists
) {
7323 if (ctx
->snapset_obc
&& ctx
->snapset_obc
->obs
.exists
) {
7324 hobject_t snapoid
= soid
.get_snapdir();
7325 dout(10) << " removing unneeded snapdir " << snapoid
<< dendl
;
7326 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::DELETE
, snapoid
,
7328 ctx
->snapset_obc
->obs
.oi
.version
,
7329 0, osd_reqid_t(), ctx
->mtime
, 0));
7330 ctx
->op_t
->remove(snapoid
);
7332 ctx
->at_version
.version
++;
7334 ctx
->snapset_obc
->obs
.exists
= false;
7337 } else if (!ctx
->new_snapset
.clones
.empty() &&
7338 !ctx
->cache_evict
&&
7339 !ctx
->new_snapset
.head_exists
&&
7340 (!ctx
->snapset_obc
|| !ctx
->snapset_obc
->obs
.exists
)) {
7341 // save snapset on _snap
7342 hobject_t
snapoid(soid
.oid
, soid
.get_key(), CEPH_SNAPDIR
, soid
.get_hash(),
7343 info
.pgid
.pool(), soid
.get_namespace());
7344 dout(10) << " final snapset " << ctx
->new_snapset
7345 << " in " << snapoid
<< dendl
;
7346 assert(get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
7347 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, snapoid
,
7350 0, osd_reqid_t(), ctx
->mtime
, 0));
7352 if (!ctx
->snapset_obc
)
7353 ctx
->snapset_obc
= get_object_context(snapoid
, true);
7355 if (ctx
->lock_type
== ObjectContext::RWState::RWWRITE
) {
7356 got
= ctx
->lock_manager
.get_write_greedy(
7361 assert(ctx
->lock_type
== ObjectContext::RWState::RWEXCL
);
7362 got
= ctx
->lock_manager
.get_lock_type(
7363 ObjectContext::RWState::RWEXCL
,
7369 dout(20) << " got greedy write on snapset_obc " << *ctx
->snapset_obc
<< dendl
;
7370 ctx
->snapset_obc
->obs
.exists
= true;
7371 ctx
->snapset_obc
->obs
.oi
.version
= ctx
->at_version
;
7372 ctx
->snapset_obc
->obs
.oi
.last_reqid
= ctx
->reqid
;
7373 ctx
->snapset_obc
->obs
.oi
.mtime
= ctx
->mtime
;
7374 ctx
->snapset_obc
->obs
.oi
.local_mtime
= now
;
7376 map
<string
, bufferlist
> attrs
;
7377 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
7378 ::encode(ctx
->snapset_obc
->obs
.oi
, bv
,
7379 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7380 ctx
->op_t
->create(snapoid
);
7381 attrs
[OI_ATTR
].claim(bv
);
7382 attrs
[SS_ATTR
].claim(bss
);
7383 setattrs_maybe_cache(ctx
->snapset_obc
, ctx
, ctx
->op_t
.get(), attrs
);
7384 ctx
->at_version
.version
++;
7388 // finish and log the op.
7389 if (ctx
->user_modify
) {
7390 // update the user_version for any modify ops, except for the watch op
7391 ctx
->user_at_version
= MAX(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
7392 /* In order for new clients and old clients to interoperate properly
7393 * when exchanging versions, we need to lower bound the user_version
7394 * (which our new clients pay proper attention to)
7395 * by the at_version (which is all the old clients can ever see). */
7396 if (ctx
->at_version
.version
> ctx
->user_at_version
)
7397 ctx
->user_at_version
= ctx
->at_version
.version
;
7398 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
7400 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
7402 if (ctx
->new_obs
.exists
) {
7403 // on the head object
7404 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
7405 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
7406 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
7407 if (ctx
->mtime
!= utime_t()) {
7408 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
7409 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
7410 ctx
->new_obs
.oi
.local_mtime
= now
;
7412 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
7415 map
<string
, bufferlist
> attrs
;
7416 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
7417 ::encode(ctx
->new_obs
.oi
, bv
,
7418 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7419 attrs
[OI_ATTR
].claim(bv
);
7421 if (soid
.snap
== CEPH_NOSNAP
) {
7422 dout(10) << " final snapset " << ctx
->new_snapset
7423 << " in " << soid
<< dendl
;
7424 attrs
[SS_ATTR
].claim(bss
);
7426 dout(10) << " no snapset (this is a clone)" << dendl
;
7428 ctx
->op_t
->setattrs(soid
, attrs
);
7430 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
7433 bool legacy_snapset
= ctx
->new_snapset
.is_legacy() ||
7434 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
7437 ctx
->log
.push_back(pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
7438 ctx
->obs
->oi
.version
,
7439 ctx
->user_at_version
, ctx
->reqid
,
7441 if (soid
.snap
< CEPH_NOSNAP
) {
7442 switch (log_op_type
) {
7443 case pg_log_entry_t::MODIFY
:
7444 case pg_log_entry_t::PROMOTE
:
7445 case pg_log_entry_t::CLEAN
:
7446 if (legacy_snapset
) {
7447 dout(20) << __func__
<< " encoding legacy_snaps "
7448 << ctx
->new_obs
.oi
.legacy_snaps
7450 ::encode(ctx
->new_obs
.oi
.legacy_snaps
, ctx
->log
.back().snaps
);
7452 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
7454 ::encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
7462 if (!ctx
->extra_reqids
.empty()) {
7463 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< dendl
;
7464 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
7467 // apply new object state.
7468 ctx
->obc
->obs
= ctx
->new_obs
;
7470 if (soid
.is_head() && !ctx
->obc
->obs
.exists
&&
7471 (!maintain_ssc
|| ctx
->cache_evict
)) {
7472 ctx
->obc
->ssc
->exists
= false;
7473 ctx
->obc
->ssc
->snapset
= SnapSet();
7475 ctx
->obc
->ssc
->exists
= true;
7476 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
7480 void PrimaryLogPG::apply_stats(
7481 const hobject_t
&soid
,
7482 const object_stat_sum_t
&delta_stats
) {
7484 info
.stats
.stats
.add(delta_stats
);
7486 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
7487 i
!= backfill_targets
.end();
7490 pg_info_t
& pinfo
= peer_info
[bt
];
7491 if (soid
<= pinfo
.last_backfill
)
7492 pinfo
.stats
.stats
.add(delta_stats
);
7493 else if (soid
<= last_backfill_started
)
7494 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
7497 if (is_primary() && scrubber
.active
) {
7498 if (soid
< scrubber
.start
) {
7499 dout(20) << __func__
<< " " << soid
<< " < [" << scrubber
.start
7500 << "," << scrubber
.end
<< ")" << dendl
;
7501 scrub_cstat
.add(delta_stats
);
7503 dout(20) << __func__
<< " " << soid
<< " >= [" << scrubber
.start
7504 << "," << scrubber
.end
<< ")" << dendl
;
7509 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
7511 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
7512 assert(ctx
->async_reads_complete());
7514 for (vector
<OSDOp
>::iterator p
= ctx
->ops
.begin();
7515 p
!= ctx
->ops
.end() && result
>= 0; ++p
) {
7516 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
7520 ctx
->bytes_read
+= p
->outdata
.length();
7522 ctx
->reply
->claim_op_out_data(ctx
->ops
);
7523 ctx
->reply
->get_header().data_off
= ctx
->data_off
;
7525 MOSDOpReply
*reply
= ctx
->reply
;
7526 ctx
->reply
= nullptr;
7529 if (!ctx
->ignore_log_op_stats
) {
7531 publish_stats_to_osd();
7534 // on read, return the current object version
7536 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
7538 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
7540 } else if (result
== -ENOENT
) {
7541 // on ENOENT, set a floor for what the next user version will be.
7542 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
7545 reply
->set_result(result
);
7546 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
7547 osd
->send_message_osd_client(reply
, m
->get_connection());
7551 // ========================================================================
7554 struct C_Copyfrom
: public Context
{
7557 epoch_t last_peering_reset
;
7559 PrimaryLogPG::CopyOpRef cop
;
7560 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
7561 const PrimaryLogPG::CopyOpRef
& c
)
7562 : pg(p
), oid(o
), last_peering_reset(lpr
),
7565 void finish(int r
) override
{
7566 if (r
== -ECANCELED
)
7569 if (last_peering_reset
== pg
->get_last_peering_reset()) {
7570 pg
->process_copy_chunk(oid
, tid
, r
);
7576 struct C_CopyFrom_AsyncReadCb
: public Context
{
7578 object_copy_data_t reply_obj
;
7581 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
7582 osd_op(osd_op
), features(features
), len(0) {}
7583 void finish(int r
) override
{
7585 assert(len
<= reply_obj
.data
.length());
7587 bl
.substr_of(reply_obj
.data
, 0, len
);
7588 reply_obj
.data
.swap(bl
);
7589 ::encode(reply_obj
, osd_op
->outdata
, features
);
7593 int PrimaryLogPG::fill_in_copy_get(
7595 bufferlist::iterator
& bp
,
7597 ObjectContextRef
&obc
)
7599 object_info_t
& oi
= obc
->obs
.oi
;
7600 hobject_t
& soid
= oi
.soid
;
7602 object_copy_cursor_t cursor
;
7605 ::decode(cursor
, bp
);
7606 ::decode(out_max
, bp
);
7608 catch (buffer::error
& e
) {
7613 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
7614 uint64_t features
= op
->get_features();
7616 bool async_read_started
= false;
7617 object_copy_data_t _reply_obj
;
7618 C_CopyFrom_AsyncReadCb
*cb
= NULL
;
7619 if (pool
.info
.require_rollback()) {
7620 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
7622 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
7624 reply_obj
.size
= oi
.size
;
7625 reply_obj
.mtime
= oi
.mtime
;
7627 if (soid
.snap
< CEPH_NOSNAP
) {
7628 if (obc
->ssc
->snapset
.is_legacy()) {
7629 reply_obj
.snaps
= oi
.legacy_snaps
;
7631 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
7632 assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
7633 reply_obj
.snaps
= p
->second
;
7636 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
7638 if (oi
.is_data_digest()) {
7639 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
7640 reply_obj
.data_digest
= oi
.data_digest
;
7642 if (oi
.is_omap_digest()) {
7643 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
7644 reply_obj
.omap_digest
= oi
.omap_digest
;
7646 reply_obj
.truncate_seq
= oi
.truncate_seq
;
7647 reply_obj
.truncate_size
= oi
.truncate_size
;
7650 map
<string
,bufferlist
>& out_attrs
= reply_obj
.attrs
;
7651 if (!cursor
.attr_complete
) {
7652 result
= getattrs_maybe_cache(
7662 cursor
.attr_complete
= true;
7663 dout(20) << " got attrs" << dendl
;
7666 int64_t left
= out_max
- osd_op
.outdata
.length();
7669 bufferlist
& bl
= reply_obj
.data
;
7670 if (left
> 0 && !cursor
.data_complete
) {
7671 if (cursor
.data_offset
< oi
.size
) {
7672 uint64_t max_read
= MIN(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
7674 async_read_started
= true;
7675 ctx
->pending_async_reads
.push_back(
7677 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
7678 make_pair(&bl
, cb
)));
7682 result
= pgbackend
->objects_read_sync(
7683 oi
.soid
, cursor
.data_offset
, left
, osd_op
.op
.flags
, &bl
);
7687 assert(result
<= left
);
7689 cursor
.data_offset
+= result
;
7691 if (cursor
.data_offset
== oi
.size
) {
7692 cursor
.data_complete
= true;
7693 dout(20) << " got data" << dendl
;
7695 assert(cursor
.data_offset
<= oi
.size
);
7699 uint32_t omap_keys
= 0;
7700 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
7701 cursor
.omap_complete
= true;
7703 if (left
> 0 && !cursor
.omap_complete
) {
7704 assert(cursor
.data_complete
);
7705 if (cursor
.omap_offset
.empty()) {
7706 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
7707 &reply_obj
.omap_header
);
7709 bufferlist omap_data
;
7710 ObjectMap::ObjectMapIterator iter
=
7711 osd
->store
->get_omap_iterator(coll
, ghobject_t(oi
.soid
));
7713 iter
->upper_bound(cursor
.omap_offset
);
7714 for (; iter
->valid(); iter
->next(false)) {
7716 ::encode(iter
->key(), omap_data
);
7717 ::encode(iter
->value(), omap_data
);
7718 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
7723 ::encode(omap_keys
, reply_obj
.omap_data
);
7724 reply_obj
.omap_data
.claim_append(omap_data
);
7726 if (iter
->valid()) {
7727 cursor
.omap_offset
= iter
->key();
7729 cursor
.omap_complete
= true;
7730 dout(20) << " got omap" << dendl
;
7735 if (cursor
.is_complete()) {
7736 // include reqids only in the final step. this is a bit fragile
7738 pg_log
.get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10, &reply_obj
.reqids
);
7739 dout(20) << " got reqids" << dendl
;
7742 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
7743 << " " << out_attrs
.size() << " attrs"
7744 << " " << bl
.length() << " bytes"
7745 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
7746 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
7747 << omap_keys
<< " keys"
7748 << " " << reply_obj
.reqids
.size() << " reqids"
7750 reply_obj
.cursor
= cursor
;
7751 if (!async_read_started
) {
7752 ::encode(reply_obj
, osd_op
.outdata
, features
);
7754 if (cb
&& !async_read_started
) {
7761 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
7764 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7765 // be careful not to modify anything else that will upset a racing
7767 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
7768 uint64_t features
= m
->get_features();
7769 object_copy_data_t reply_obj
;
7771 pg_log
.get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
);
7772 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
7773 ::encode(reply_obj
, osd_op
.outdata
, features
);
7774 osd_op
.rval
= -ENOENT
;
7775 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, false);
7776 reply
->claim_op_out_data(m
->ops
);
7777 reply
->set_result(-ENOENT
);
7778 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
7779 osd
->send_message_osd_client(reply
, m
->get_connection());
7782 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
7783 hobject_t src
, object_locator_t oloc
,
7784 version_t version
, unsigned flags
,
7785 bool mirror_snapset
,
7786 unsigned src_obj_fadvise_flags
,
7787 unsigned dest_obj_fadvise_flags
)
7789 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
7790 dout(10) << __func__
<< " " << dest
7791 << " from " << src
<< " " << oloc
<< " v" << version
7792 << " flags " << flags
7793 << (mirror_snapset
? " mirror_snapset" : "")
7796 assert(!mirror_snapset
|| (src
.snap
== CEPH_NOSNAP
||
7797 src
.snap
== CEPH_SNAPDIR
));
7799 // cancel a previous in-progress copy?
7800 if (copy_ops
.count(dest
)) {
7801 // FIXME: if the src etc match, we could avoid restarting from the
7803 CopyOpRef cop
= copy_ops
[dest
];
7804 cancel_copy(cop
, false);
7807 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
7808 mirror_snapset
, src_obj_fadvise_flags
,
7809 dest_obj_fadvise_flags
));
7810 copy_ops
[dest
] = cop
;
7813 _copy_some(obc
, cop
);
7816 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
7818 dout(10) << __func__
<< " " << obc
<< " " << cop
<< dendl
;
7821 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
7822 flags
|= CEPH_OSD_FLAG_FLUSH
;
7823 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
7824 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
7825 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
7826 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
7827 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
7828 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
7829 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
7830 flags
|= CEPH_OSD_FLAG_RWORDERED
;
7832 C_GatherBuilder
gather(cct
);
7834 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
7836 assert(cop
->src
.snap
== CEPH_NOSNAP
);
7838 op
.list_snaps(&cop
->results
.snapset
, NULL
);
7839 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
7841 flags
, gather
.new_sub(), NULL
);
7842 cop
->objecter_tid2
= tid
;
7846 if (cop
->results
.user_version
) {
7847 op
.assert_version(cop
->results
.user_version
);
7849 // we should learn the version after the first chunk, if we didn't know
7851 assert(cop
->cursor
.is_initial());
7853 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
7854 &cop
->results
.object_size
, &cop
->results
.mtime
,
7855 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
7856 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
7857 &cop
->results
.flags
,
7858 &cop
->results
.source_data_digest
,
7859 &cop
->results
.source_omap_digest
,
7860 &cop
->results
.reqids
,
7861 &cop
->results
.truncate_seq
,
7862 &cop
->results
.truncate_size
,
7864 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
7866 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
7867 get_last_peering_reset(), cop
);
7868 gather
.set_finisher(new C_OnFinisher(fin
,
7869 &osd
->objecter_finisher
));
7871 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
7872 cop
->src
.snap
, NULL
,
7875 // discover the object version if we don't know it yet
7876 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
7878 cop
->objecter_tid
= tid
;
7882 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
7884 dout(10) << __func__
<< " " << oid
<< " tid " << tid
7885 << " " << cpp_strerror(r
) << dendl
;
7886 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
7887 if (p
== copy_ops
.end()) {
7888 dout(10) << __func__
<< " no copy_op found" << dendl
;
7891 CopyOpRef cop
= p
->second
;
7892 if (tid
!= cop
->objecter_tid
) {
7893 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
7894 << " tid " << cop
->objecter_tid
<< dendl
;
7898 if (cop
->omap_data
.length() || cop
->omap_header
.length())
7899 cop
->results
.has_omap
= true;
7901 if (r
>= 0 && !pool
.info
.supports_omap() &&
7902 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
7905 cop
->objecter_tid
= 0;
7906 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
7907 ObjectContextRef
& cobc
= cop
->obc
;
7912 assert(cop
->rval
>= 0);
7914 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
7915 // verify snap hasn't been deleted
7916 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
7917 while (p
!= cop
->results
.snaps
.end()) {
7918 if (pool
.info
.is_removed_snap(*p
)) {
7919 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
7921 for (vector
<snapid_t
>::iterator q
= p
+ 1;
7922 q
!= cop
->results
.snaps
.end();
7925 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
7930 if (cop
->results
.snaps
.empty()) {
7931 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
7937 assert(cop
->rval
>= 0);
7939 if (!cop
->temp_cursor
.data_complete
) {
7940 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
7942 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
7943 if (cop
->omap_header
.length()) {
7944 cop
->results
.omap_digest
=
7945 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
7947 if (cop
->omap_data
.length()) {
7949 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
7950 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
7954 if (!cop
->temp_cursor
.attr_complete
) {
7955 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
7956 p
!= cop
->attrs
.end();
7958 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
7963 if (!cop
->cursor
.is_complete()) {
7964 // write out what we have so far
7965 if (cop
->temp_cursor
.is_initial()) {
7966 assert(!cop
->results
.started_temp_obj
);
7967 cop
->results
.started_temp_obj
= true;
7968 cop
->results
.temp_oid
= generate_temp_object(oid
);
7969 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
7971 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
7972 OpContextUPtr ctx
= simple_opc_create(tempobc
);
7973 if (cop
->temp_cursor
.is_initial()) {
7974 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
7976 _write_copy_chunk(cop
, ctx
->op_t
.get());
7977 simple_opc_submit(std::move(ctx
));
7978 dout(10) << __func__
<< " fetching more" << dendl
;
7979 _copy_some(cobc
, cop
);
7984 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
7985 dout(20) << __func__
<< std::hex
7986 << " got digest: rx data 0x" << cop
->results
.data_digest
7987 << " omap 0x" << cop
->results
.omap_digest
7988 << ", source: data 0x" << cop
->results
.source_data_digest
7989 << " omap 0x" << cop
->results
.source_omap_digest
7991 << " flags " << cop
->results
.flags
7994 if (cop
->results
.is_data_digest() &&
7995 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
7996 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
7997 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
7999 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
8000 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
8001 << " data digest 0x" << cop
->results
.data_digest
8002 << " != source 0x" << cop
->results
.source_data_digest
8007 if (cop
->results
.is_omap_digest() &&
8008 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
8009 derr
<< __func__
<< std::hex
8010 << " omap digest 0x" << cop
->results
.omap_digest
8011 << " != source 0x" << cop
->results
.source_omap_digest
8012 << std::dec
<< dendl
;
8013 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
8014 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
8015 << " omap digest 0x" << cop
->results
.omap_digest
8016 << " != source 0x" << cop
->results
.source_omap_digest
8021 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
8022 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
8027 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
8028 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
8029 ObjectState
& obs
= cop
->obc
->obs
;
8030 if (cop
->temp_cursor
.is_initial()) {
8031 dout(20) << "fill_in_final_tx: writing "
8032 << "directly to final object" << dendl
;
8033 // write directly to final object
8034 cop
->results
.temp_oid
= obs
.oi
.soid
;
8035 _write_copy_chunk(cop
, t
);
8037 // finish writing to temp object, then move into place
8038 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
8039 _write_copy_chunk(cop
, t
);
8040 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
8042 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
8045 dout(20) << __func__
<< " success; committing" << dendl
;
8048 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
8049 CopyCallbackResults
results(r
, &cop
->results
);
8050 cop
->cb
->complete(results
);
8052 copy_ops
.erase(cobc
->obs
.oi
.soid
);
8055 if (r
< 0 && cop
->results
.started_temp_obj
) {
8056 dout(10) << __func__
<< " deleting partial temp object "
8057 << cop
->results
.temp_oid
<< dendl
;
8058 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
8059 OpContextUPtr ctx
= simple_opc_create(tempobc
);
8060 ctx
->op_t
->remove(cop
->results
.temp_oid
);
8061 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
8062 simple_opc_submit(std::move(ctx
));
8065 // cancel and requeue proxy ops on this object
8067 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
8068 it
!= proxyread_ops
.end();) {
8069 if (it
->second
->soid
== cobc
->obs
.oi
.soid
) {
8070 cancel_proxy_read((it
++)->second
);
8075 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
8076 it
!= proxywrite_ops
.end();) {
8077 if (it
->second
->soid
== cobc
->obs
.oi
.soid
) {
8078 cancel_proxy_write((it
++)->second
);
8083 kick_proxy_ops_blocked(cobc
->obs
.oi
.soid
);
8086 kick_object_context_blocked(cobc
);
8089 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop
, PGTransaction
*t
)
8091 dout(20) << __func__
<< " " << cop
8092 << " " << cop
->attrs
.size() << " attrs"
8093 << " " << cop
->data
.length() << " bytes"
8094 << " " << cop
->omap_header
.length() << " omap header bytes"
8095 << " " << cop
->omap_data
.length() << " omap data bytes"
8097 if (!cop
->temp_cursor
.attr_complete
) {
8098 t
->create(cop
->results
.temp_oid
);
8100 if (!cop
->temp_cursor
.data_complete
) {
8101 assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
8102 cop
->cursor
.data_offset
);
8103 if (pool
.info
.requires_aligned_append() &&
8104 !cop
->cursor
.data_complete
) {
8106 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8107 * to pick it up on the next pass.
8109 assert(cop
->temp_cursor
.data_offset
%
8110 pool
.info
.required_alignment() == 0);
8111 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
8113 cop
->data
.length() % pool
.info
.required_alignment();
8115 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
8117 cop
->cursor
.data_offset
-= to_trim
;
8118 assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
8119 cop
->cursor
.data_offset
);
8122 if (cop
->data
.length()) {
8124 cop
->results
.temp_oid
,
8125 cop
->temp_cursor
.data_offset
,
8128 cop
->dest_obj_fadvise_flags
);
8132 if (pool
.info
.supports_omap()) {
8133 if (!cop
->temp_cursor
.omap_complete
) {
8134 if (cop
->omap_header
.length()) {
8136 cop
->results
.temp_oid
,
8138 cop
->omap_header
.clear();
8140 if (cop
->omap_data
.length()) {
8141 map
<string
,bufferlist
> omap
;
8142 bufferlist::iterator p
= cop
->omap_data
.begin();
8144 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
8145 cop
->omap_data
.clear();
8149 assert(cop
->omap_header
.length() == 0);
8150 assert(cop
->omap_data
.length() == 0);
8152 cop
->temp_cursor
= cop
->cursor
;
8155 void PrimaryLogPG::finish_copyfrom(OpContext
*ctx
)
8157 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
8158 ObjectState
& obs
= ctx
->new_obs
;
8159 CopyFromCallback
*cb
= static_cast<CopyFromCallback
*>(ctx
->copy_cb
);
8162 dout(20) << __func__
<< ": exists, removing" << dendl
;
8163 ctx
->op_t
->remove(obs
.oi
.soid
);
8165 ctx
->delta_stats
.num_objects
++;
8168 if (cb
->is_temp_obj_used()) {
8169 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
8171 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
8173 // CopyFromCallback fills this in for us
8174 obs
.oi
.user_version
= ctx
->user_at_version
;
8176 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
8177 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
8179 obs
.oi
.truncate_seq
= cb
->results
->truncate_seq
;
8180 obs
.oi
.truncate_size
= cb
->results
->truncate_size
;
8182 ctx
->extra_reqids
= cb
->results
->reqids
;
8184 // cache: clear whiteout?
8185 if (obs
.oi
.is_whiteout()) {
8186 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
8187 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
8188 --ctx
->delta_stats
.num_whiteouts
;
8191 if (cb
->results
->has_omap
) {
8192 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8193 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8195 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8196 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
8199 interval_set
<uint64_t> ch
;
8200 if (obs
.oi
.size
> 0)
8201 ch
.insert(0, obs
.oi
.size
);
8202 ctx
->modified_ranges
.union_of(ch
);
8204 if (cb
->get_data_size() != obs
.oi
.size
) {
8205 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8206 obs
.oi
.size
= cb
->get_data_size();
8207 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
8209 ctx
->delta_stats
.num_wr
++;
8210 ctx
->delta_stats
.num_wr_kb
+= SHIFT_ROUND_UP(obs
.oi
.size
, 10);
8212 osd
->logger
->inc(l_osd_copyfrom
);
8215 void PrimaryLogPG::finish_promote(int r
, CopyResults
*results
,
8216 ObjectContextRef obc
)
8218 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
8219 dout(10) << __func__
<< " " << soid
<< " r=" << r
8220 << " uv" << results
->user_version
<< dendl
;
8222 if (r
== -ECANCELED
) {
8226 if (r
!= -ENOENT
&& soid
.is_snap()) {
8227 if (results
->snaps
.empty()) {
8228 // we must have read "snap" content from the head object in
8229 // the base pool. use snap_seq to construct what snaps should
8230 // be for this clone (what is was before we evicted the clean
8231 // clone from this pool, and what it will be when we flush and
8232 // the clone eventually happens in the base pool).
8233 SnapSet
& snapset
= obc
->ssc
->snapset
;
8234 vector
<snapid_t
>::iterator p
= snapset
.snaps
.begin();
8235 while (p
!= snapset
.snaps
.end() && *p
> soid
.snap
)
8237 while (p
!= snapset
.snaps
.end() && *p
> results
->snap_seq
) {
8238 results
->snaps
.push_back(*p
);
8243 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
8244 filter_snapc(results
->snaps
);
8246 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
8247 if (results
->snaps
.empty()) {
8248 dout(20) << __func__
8249 << " snaps are empty, clone is invalid,"
8250 << " setting r to ENOENT" << dendl
;
8255 if (r
< 0 && results
->started_temp_obj
) {
8256 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
8257 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
8259 OpContextUPtr ctx
= simple_opc_create(tempobc
);
8260 ctx
->op_t
->remove(results
->temp_oid
);
8261 simple_opc_submit(std::move(ctx
));
8262 results
->started_temp_obj
= false;
8265 if (r
== -ENOENT
&& soid
.is_snap()) {
8266 dout(10) << __func__
8267 << ": enoent while trying to promote clone, " << soid
8268 << " must have been trimmed, removing from snapset"
8270 hobject_t
head(soid
.get_head());
8271 ObjectContextRef obc
= get_object_context(head
, false);
8274 OpContextUPtr tctx
= simple_opc_create(obc
);
8275 tctx
->at_version
= get_next_version();
8276 filter_snapc(tctx
->new_snapset
.snaps
);
8277 vector
<snapid_t
> new_clones
;
8278 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
8279 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
8280 i
!= tctx
->new_snapset
.clones
.end();
8282 if (*i
!= soid
.snap
) {
8283 new_clones
.push_back(*i
);
8284 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
8285 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
8286 new_clone_snaps
[*i
] = p
->second
;
8290 tctx
->new_snapset
.clones
.swap(new_clones
);
8291 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
8292 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
8293 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
8295 // take RWWRITE lock for duration of our local write. ignore starvation.
8296 if (!tctx
->lock_manager
.take_write_lock(
8299 assert(0 == "problem!");
8301 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
8303 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
8305 simple_opc_submit(std::move(tctx
));
8309 bool whiteout
= false;
8311 assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
8312 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
8316 if (r
< 0 && !whiteout
) {
8317 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
8318 // pass error to everyone blocked on this object
8319 // FIXME: this is pretty sloppy, but at this point we got
8320 // something unexpected and don't have many other options.
8321 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
8322 waiting_for_blocked_object
.find(soid
);
8323 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
8324 while (!blocked_iter
->second
.empty()) {
8325 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
8326 blocked_iter
->second
.pop_front();
8328 waiting_for_blocked_object
.erase(blocked_iter
);
8333 osd
->promote_finish(results
->object_size
);
8335 OpContextUPtr tctx
= simple_opc_create(obc
);
8336 tctx
->at_version
= get_next_version();
8338 ++tctx
->delta_stats
.num_objects
;
8339 if (soid
.snap
< CEPH_NOSNAP
)
8340 ++tctx
->delta_stats
.num_object_clones
;
8341 tctx
->new_obs
.exists
= true;
8343 tctx
->extra_reqids
= results
->reqids
;
8345 bool legacy_snapset
= tctx
->new_snapset
.is_legacy() ||
8346 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
8349 // create a whiteout
8350 tctx
->op_t
->create(soid
);
8351 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
8352 ++tctx
->delta_stats
.num_whiteouts
;
8353 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
8354 osd
->logger
->inc(l_osd_tier_whiteout
);
8356 if (results
->has_omap
) {
8357 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
8358 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8359 ++tctx
->delta_stats
.num_objects_omap
;
8362 results
->fill_in_final_tx(tctx
->op_t
.get());
8363 if (results
->started_temp_obj
) {
8364 tctx
->discard_temp_oid
= results
->temp_oid
;
8366 tctx
->new_obs
.oi
.size
= results
->object_size
;
8367 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
8368 // Don't care src object whether have data or omap digest
8369 if (results
->object_size
)
8370 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
8371 if (results
->has_omap
)
8372 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
8373 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
8374 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
8376 if (soid
.snap
!= CEPH_NOSNAP
) {
8377 if (legacy_snapset
) {
8378 tctx
->new_obs
.oi
.legacy_snaps
= results
->snaps
;
8379 assert(!tctx
->new_obs
.oi
.legacy_snaps
.empty());
8381 // it's already in the snapset
8382 assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
8384 assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
8385 assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
8386 results
->object_size
);
8387 assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
8389 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
8391 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
8395 if (results
->mirror_snapset
) {
8396 assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
8397 tctx
->new_snapset
.from_snap_set(
8399 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
8401 tctx
->new_snapset
.head_exists
= true;
8402 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
8404 // take RWWRITE lock for duration of our local write. ignore starvation.
8405 if (!tctx
->lock_manager
.take_write_lock(
8408 assert(0 == "problem!");
8410 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
8412 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
8414 simple_opc_submit(std::move(tctx
));
8416 osd
->logger
->inc(l_osd_tier_promote
);
8419 agent_state
->is_idle())
8420 agent_choose_mode();
8423 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
)
8425 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
8426 << " from " << cop
->src
<< " " << cop
->oloc
8427 << " v" << cop
->results
.user_version
<< dendl
;
8429 // cancel objecter op, if we can
8430 if (cop
->objecter_tid
) {
8431 osd
->objecter
->op_cancel(cop
->objecter_tid
, -ECANCELED
);
8432 cop
->objecter_tid
= 0;
8433 if (cop
->objecter_tid2
) {
8434 osd
->objecter
->op_cancel(cop
->objecter_tid2
, -ECANCELED
);
8435 cop
->objecter_tid2
= 0;
8439 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
8440 cop
->obc
->stop_block();
8442 kick_object_context_blocked(cop
->obc
);
8443 cop
->results
.should_requeue
= requeue
;
8444 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
8445 cop
->cb
->complete(result
);
8447 // There may still be an objecter callback referencing this copy op.
8448 // That callback will not need the obc since it's been canceled, and
8449 // we need the obc reference to go away prior to flush.
8450 cop
->obc
= ObjectContextRef();
8453 void PrimaryLogPG::cancel_copy_ops(bool requeue
)
8455 dout(10) << __func__
<< dendl
;
8456 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
8457 while (p
!= copy_ops
.end()) {
8458 // requeue this op? can I queue up all of them?
8459 cancel_copy((p
++)->second
, requeue
);
8464 // ========================================================================
8467 // Flush a dirty object in the cache tier by writing it back to the
8468 // base tier. The sequence looks like:
8470 // * send a copy-from operation to the base tier to copy the current
8471 // version of the object
8472 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8473 // * on completion, we check if the object has been modified. if so,
8474 // just reply with -EAGAIN.
8475 // * try to take a write lock so we can clear the dirty flag. if this
8476 // fails, wait and retry
8477 // * start a repop that clears the bit.
8479 // If we have to wait, we will retry by coming back through the
8480 // start_flush method. We check if a flush is already in progress
8481 // and, if so, try to finish it by rechecking the version and trying
8482 // to clear the dirty bit.
8484 // In order for the cache-flush (a write op) to not block the copy-get
8485 // from reading the object, the client *must* set the SKIPRWLOCKS
8488 // NOTE: normally writes are strictly ordered for the client, but
8489 // flushes are special in that they can be reordered with respect to
8490 // other writes. In particular, we can't have a flush request block
8491 // an update to the cache pool object!
8493 struct C_Flush
: public Context
{
8496 epoch_t last_peering_reset
;
8499 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
8500 : pg(p
), oid(o
), last_peering_reset(lpr
),
8501 tid(0), start(ceph_clock_now())
8503 void finish(int r
) override
{
8504 if (r
== -ECANCELED
)
8507 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8508 pg
->finish_flush(oid
, tid
, r
);
8509 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
8515 int PrimaryLogPG::start_flush(
8516 OpRequestRef op
, ObjectContextRef obc
,
8517 bool blocking
, hobject_t
*pmissing
,
8518 boost::optional
<std::function
<void()>> &&on_flush
)
8520 const object_info_t
& oi
= obc
->obs
.oi
;
8521 const hobject_t
& soid
= oi
.soid
;
8522 dout(10) << __func__
<< " " << soid
8523 << " v" << oi
.version
8524 << " uv" << oi
.user_version
8525 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
8528 // get a filtered snapset, need to remove removed snaps
8529 SnapSet snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
8531 // verify there are no (older) check for dirty clones
8533 dout(20) << " snapset " << snapset
<< dendl
;
8534 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
8535 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
8537 if (p
!= snapset
.clones
.rend()) {
8538 hobject_t next
= soid
;
8540 assert(next
.snap
< soid
.snap
);
8541 if (pg_log
.get_missing().is_missing(next
)) {
8542 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
8547 ObjectContextRef older_obc
= get_object_context(next
, false);
8549 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
8551 if (older_obc
->obs
.oi
.is_dirty()) {
8552 dout(10) << __func__
<< " next oldest clone is dirty: "
8553 << older_obc
->obs
.oi
<< dendl
;
8557 dout(20) << __func__
<< " next oldest clone " << next
8558 << " is not present; implicitly clean" << dendl
;
8561 dout(20) << __func__
<< " no older clones" << dendl
;
8568 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
8569 if (p
!= flush_ops
.end()) {
8570 FlushOpRef fop
= p
->second
;
8571 if (fop
->op
== op
) {
8572 // we couldn't take the write lock on a cache-try-flush before;
8573 // now we are trying again for the lock.
8574 return try_flush_mark_clean(fop
);
8576 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
8577 (fop
->blocking
|| !blocking
)) {
8578 // nonblocking can join anything
8579 // blocking can only join a blocking flush
8580 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
8582 fop
->dup_ops
.push_back(op
);
8583 return -EAGAIN
; // clean up this ctx; op will retry later
8586 // cancel current flush since it will fail anyway, or because we
8587 // are blocking and the existing flush is nonblocking.
8588 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
8590 osd
->reply_op_error(fop
->op
, -EBUSY
);
8591 while (!fop
->dup_ops
.empty()) {
8592 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
8593 fop
->dup_ops
.pop_front();
8595 cancel_flush(fop
, false);
8599 * In general, we need to send a delete and a copyfrom.
8600 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8601 * where 4 is marked as clean. To flush 10, we have to:
8602 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8603 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8605 * There is a complicating case. Supposed there had been a clone 7
8606 * for snaps [7, 6] which has been trimmed since they no longer exist.
8607 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8608 * the delete, the snap will be promoted to 5, and the head will become
8609 * a snapdir. When the copy-from goes through, we'll end up with
8610 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8612 * Another complication is the case where there is an interval change
8613 * after doing the delete and the flush but before marking the object
8614 * clean. We'll happily delete head and then recreate it at the same
8615 * sequence number, which works out ok.
8618 SnapContext snapc
, dsnapc
;
8619 if (snapset
.seq
!= 0) {
8620 if (soid
.snap
== CEPH_NOSNAP
) {
8621 snapc
.seq
= snapset
.seq
;
8622 snapc
.snaps
= snapset
.snaps
;
8624 snapid_t min_included_snap
;
8625 if (snapset
.is_legacy()) {
8626 min_included_snap
= oi
.legacy_snaps
.back();
8628 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
8629 assert(p
!= snapset
.clone_snaps
.end());
8630 min_included_snap
= p
->second
.back();
8632 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
8635 snapid_t prev_snapc
= 0;
8636 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
8637 citer
!= snapset
.clones
.rend();
8639 if (*citer
< soid
.snap
) {
8640 prev_snapc
= *citer
;
8645 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
8648 object_locator_t
base_oloc(soid
);
8649 base_oloc
.pool
= pool
.info
.tier_of
;
8651 if (dsnapc
.seq
< snapc
.seq
) {
8654 osd
->objecter
->mutate(
8659 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
8660 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
8661 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
8662 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
8665 FlushOpRef
fop(std::make_shared
<FlushOp
>());
8667 fop
->flushed_version
= oi
.user_version
;
8668 fop
->blocking
= blocking
;
8669 fop
->on_flush
= std::move(on_flush
);
8673 if (oi
.is_whiteout()) {
8674 fop
->removal
= true;
8677 object_locator_t
oloc(soid
);
8678 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
8679 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
8680 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
8681 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
8682 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
8683 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
8685 //mean the base tier don't cache data after this
8686 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
8687 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
8689 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
8691 ceph_tid_t tid
= osd
->objecter
->mutate(
8692 soid
.oid
, base_oloc
, o
, snapc
,
8693 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
8694 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
8695 new C_OnFinisher(fin
,
8696 &osd
->objecter_finisher
));
8697 /* we're under the pg lock and fin->finish() is grabbing that */
8699 fop
->objecter_tid
= tid
;
8701 flush_ops
[soid
] = fop
;
8702 info
.stats
.stats
.sum
.num_flush
++;
8703 info
.stats
.stats
.sum
.num_flush_kb
+= SHIFT_ROUND_UP(oi
.size
, 10);
8704 return -EINPROGRESS
;
8707 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
8709 dout(10) << __func__
<< " " << oid
<< " tid " << tid
8710 << " " << cpp_strerror(r
) << dendl
;
8711 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
8712 if (p
== flush_ops
.end()) {
8713 dout(10) << __func__
<< " no flush_op found" << dendl
;
8716 FlushOpRef fop
= p
->second
;
8717 if (tid
!= fop
->objecter_tid
) {
8718 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
8719 << " tid " << fop
->objecter_tid
<< dendl
;
8722 ObjectContextRef obc
= fop
->obc
;
8723 fop
->objecter_tid
= 0;
8725 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
8727 osd
->reply_op_error(fop
->op
, -EBUSY
);
8728 if (fop
->blocking
) {
8730 kick_object_context_blocked(obc
);
8733 if (!fop
->dup_ops
.empty()) {
8734 dout(20) << __func__
<< " requeueing dups" << dendl
;
8735 requeue_ops(fop
->dup_ops
);
8737 if (fop
->on_flush
) {
8738 (*(fop
->on_flush
))();
8739 fop
->on_flush
= boost::none
;
8741 flush_ops
.erase(oid
);
8745 r
= try_flush_mark_clean(fop
);
8746 if (r
== -EBUSY
&& fop
->op
) {
8747 osd
->reply_op_error(fop
->op
, r
);
8751 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
8753 ObjectContextRef obc
= fop
->obc
;
8754 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
8756 if (fop
->blocking
) {
8758 kick_object_context_blocked(obc
);
8761 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
8763 if (obc
->obs
.exists
)
8764 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
8765 << " != current " << obc
->obs
.oi
.user_version
8768 dout(10) << __func__
<< " object no longer exists" << dendl
;
8770 if (!fop
->dup_ops
.empty()) {
8771 dout(20) << __func__
<< " requeueing dups" << dendl
;
8772 requeue_ops(fop
->dup_ops
);
8774 if (fop
->on_flush
) {
8775 (*(fop
->on_flush
))();
8776 fop
->on_flush
= boost::none
;
8778 flush_ops
.erase(oid
);
8780 osd
->logger
->inc(l_osd_tier_flush_fail
);
8782 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
8786 if (!fop
->blocking
&&
8787 scrubber
.write_blocked_by_scrub(oid
)) {
8789 dout(10) << __func__
<< " blocked by scrub" << dendl
;
8790 requeue_op(fop
->op
);
8791 requeue_ops(fop
->dup_ops
);
8792 return -EAGAIN
; // will retry
8794 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
8795 cancel_flush(fop
, false);
8800 // successfully flushed, can we evict this object?
8801 if (!fop
->op
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
8802 agent_maybe_evict(obc
, true)) {
8803 osd
->logger
->inc(l_osd_tier_clean
);
8804 if (fop
->on_flush
) {
8805 (*(fop
->on_flush
))();
8806 fop
->on_flush
= boost::none
;
8808 flush_ops
.erase(oid
);
8812 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
8813 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
8815 // successfully flushed; can we clear the dirty bit?
8816 // try to take the lock manually, since we don't
8818 if (ctx
->lock_manager
.get_lock_type(
8819 ObjectContext::RWState::RWWRITE
,
8823 dout(20) << __func__
<< " took write lock" << dendl
;
8824 } else if (fop
->op
) {
8825 dout(10) << __func__
<< " waiting on write lock" << dendl
;
8826 close_op_ctx(ctx
.release());
8827 requeue_op(fop
->op
);
8828 requeue_ops(fop
->dup_ops
);
8829 return -EAGAIN
; // will retry
8831 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
8832 close_op_ctx(ctx
.release());
8833 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
8834 cancel_flush(fop
, false);
8838 if (fop
->on_flush
) {
8839 ctx
->register_on_finish(*(fop
->on_flush
));
8840 fop
->on_flush
= boost::none
;
8843 ctx
->at_version
= get_next_version();
8845 ctx
->new_obs
= obc
->obs
;
8846 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8847 --ctx
->delta_stats
.num_objects_dirty
;
8849 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
8851 osd
->logger
->inc(l_osd_tier_clean
);
8853 if (!fop
->dup_ops
.empty() || fop
->op
) {
8854 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
8855 list
<OpRequestRef
> ls
;
8857 ls
.push_back(fop
->op
);
8858 ls
.splice(ls
.end(), fop
->dup_ops
);
8862 simple_opc_submit(std::move(ctx
));
8864 flush_ops
.erase(oid
);
8867 osd
->logger
->inc(l_osd_tier_flush
);
8869 osd
->logger
->inc(l_osd_tier_try_flush
);
8871 return -EINPROGRESS
;
8874 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
)
8876 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
8877 << fop
->objecter_tid
<< dendl
;
8878 if (fop
->objecter_tid
) {
8879 osd
->objecter
->op_cancel(fop
->objecter_tid
, -ECANCELED
);
8880 fop
->objecter_tid
= 0;
8882 if (fop
->blocking
) {
8883 fop
->obc
->stop_block();
8884 kick_object_context_blocked(fop
->obc
);
8888 requeue_op(fop
->op
);
8889 requeue_ops(fop
->dup_ops
);
8891 if (fop
->on_flush
) {
8892 (*(fop
->on_flush
))();
8893 fop
->on_flush
= boost::none
;
8895 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
8898 void PrimaryLogPG::cancel_flush_ops(bool requeue
)
8900 dout(10) << __func__
<< dendl
;
8901 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
8902 while (p
!= flush_ops
.end()) {
8903 cancel_flush((p
++)->second
, requeue
);
8907 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
8909 if (!pool
.info
.allow_incomplete_clones())
8911 if (is_missing_object(coid
))
8913 ObjectContextRef obc
= get_object_context(coid
, false);
8914 return obc
&& obc
->obs
.exists
;
8917 // ========================================================================
8920 class C_OSD_RepopApplied
: public Context
{
8922 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
8924 C_OSD_RepopApplied(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
8925 : pg(pg
), repop(repop
) {}
8926 void finish(int) override
{
8927 pg
->repop_all_applied(repop
.get());
8932 void PrimaryLogPG::repop_all_applied(RepGather
*repop
)
8934 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all applied "
8936 assert(!repop
->applies_with_commit
);
8937 repop
->all_applied
= true;
8938 if (!repop
->rep_aborted
) {
8943 class C_OSD_RepopCommit
: public Context
{
8945 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
8947 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
8948 : pg(pg
), repop(repop
) {}
8949 void finish(int) override
{
8950 pg
->repop_all_committed(repop
.get());
8954 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
8956 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
8958 repop
->all_committed
= true;
8959 if (repop
->applies_with_commit
) {
8960 assert(!repop
->all_applied
);
8961 repop
->all_applied
= true;
8964 if (!repop
->rep_aborted
) {
8965 if (repop
->v
!= eversion_t()) {
8966 last_update_ondisk
= repop
->v
;
8967 last_complete_ondisk
= repop
->pg_local_last_complete
;
8973 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
8975 dout(10) << "op_applied version " << applied_version
<< dendl
;
8976 if (applied_version
== eversion_t())
8978 assert(applied_version
> last_update_applied
);
8979 assert(applied_version
<= info
.last_update
);
8980 last_update_applied
= applied_version
;
8982 if (scrubber
.active
) {
8983 if (last_update_applied
== scrubber
.subset_last_update
) {
8984 if (ops_blocked_by_scrub()) {
8985 requeue_scrub(true);
8987 requeue_scrub(false);
8992 assert(scrubber
.start
== scrubber
.end
);
8995 if (scrubber
.active_rep_scrub
) {
8996 if (last_update_applied
== static_cast<const MOSDRepScrub
*>(
8997 scrubber
.active_rep_scrub
->get_req())->scrub_to
) {
9000 PGQueueable(scrubber
.active_rep_scrub
, get_osdmap()->get_epoch()));
9001 scrubber
.active_rep_scrub
= OpRequestRef();
9007 void PrimaryLogPG::eval_repop(RepGather
*repop
)
9009 const MOSDOp
*m
= NULL
;
9011 m
= static_cast<const MOSDOp
*>(repop
->op
->get_req());
9014 dout(10) << "eval_repop " << *repop
9015 << (repop
->rep_done
? " DONE" : "")
9018 dout(10) << "eval_repop " << *repop
<< " (no op)"
9019 << (repop
->rep_done
? " DONE" : "")
9022 if (repop
->rep_done
)
9026 if (repop
->all_committed
) {
9027 dout(10) << " commit: " << *repop
<< dendl
;
9028 for (auto p
= repop
->on_committed
.begin();
9029 p
!= repop
->on_committed
.end();
9030 repop
->on_committed
.erase(p
++)) {
9033 // send dup commits, in order
9034 if (waiting_for_ondisk
.count(repop
->v
)) {
9035 assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
9036 for (list
<pair
<OpRequestRef
, version_t
> >::iterator i
=
9037 waiting_for_ondisk
[repop
->v
].begin();
9038 i
!= waiting_for_ondisk
[repop
->v
].end();
9040 osd
->reply_op_error(i
->first
, repop
->r
, repop
->v
,
9043 waiting_for_ondisk
.erase(repop
->v
);
9048 if (repop
->all_applied
) {
9049 if (repop
->applies_with_commit
) {
9050 assert(repop
->on_applied
.empty());
9052 dout(10) << " applied: " << *repop
<< " " << dendl
;
9053 for (auto p
= repop
->on_applied
.begin();
9054 p
!= repop
->on_applied
.end();
9055 repop
->on_applied
.erase(p
++)) {
9061 if (repop
->all_applied
&& repop
->all_committed
) {
9062 repop
->rep_done
= true;
9064 publish_stats_to_osd();
9065 calc_min_last_complete_ondisk();
9067 dout(10) << " removing " << *repop
<< dendl
;
9068 assert(!repop_queue
.empty());
9069 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
9070 if (repop_queue
.front() != repop
) {
9071 if (!repop
->applies_with_commit
) {
9072 dout(0) << " removing " << *repop
<< dendl
;
9073 dout(0) << " q front is " << *repop_queue
.front() << dendl
;
9074 assert(repop_queue
.front() == repop
);
9077 RepGather
*to_remove
= nullptr;
9078 while (!repop_queue
.empty() &&
9079 (to_remove
= repop_queue
.front())->rep_done
) {
9080 repop_queue
.pop_front();
9081 for (auto p
= to_remove
->on_success
.begin();
9082 p
!= to_remove
->on_success
.end();
9083 to_remove
->on_success
.erase(p
++)) {
9086 remove_repop(to_remove
);
9092 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
9095 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
9096 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
9100 repop
->v
= ctx
->at_version
;
9101 if (ctx
->at_version
> eversion_t()) {
9102 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
9103 i
!= actingbackfill
.end();
9105 if (*i
== get_primary()) continue;
9106 pg_info_t
&pinfo
= peer_info
[*i
];
9107 // keep peer_info up to date
9108 if (pinfo
.last_complete
== pinfo
.last_update
)
9109 pinfo
.last_complete
= ctx
->at_version
;
9110 pinfo
.last_update
= ctx
->at_version
;
9114 ctx
->obc
->ondisk_write_lock();
9116 bool unlock_snapset_obc
= false;
9117 ctx
->op_t
->add_obc(ctx
->obc
);
9118 if (ctx
->clone_obc
) {
9119 ctx
->clone_obc
->ondisk_write_lock();
9120 ctx
->op_t
->add_obc(ctx
->clone_obc
);
9122 if (ctx
->snapset_obc
&& ctx
->snapset_obc
->obs
.oi
.soid
!=
9123 ctx
->obc
->obs
.oi
.soid
) {
9124 ctx
->snapset_obc
->ondisk_write_lock();
9125 unlock_snapset_obc
= true;
9126 ctx
->op_t
->add_obc(ctx
->snapset_obc
);
9129 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
9130 Context
*on_all_applied
= new C_OSD_RepopApplied(this, repop
);
9131 Context
*onapplied_sync
= new C_OSD_OndiskWriteUnlock(
9134 unlock_snapset_obc
? ctx
->snapset_obc
: ObjectContextRef());
9135 if (!(ctx
->log
.empty())) {
9136 assert(ctx
->at_version
>= projected_last_update
);
9137 projected_last_update
= ctx
->at_version
;
9139 for (auto &&entry
: ctx
->log
) {
9140 projected_log
.add(entry
);
9142 pgbackend
->submit_transaction(
9146 std::move(ctx
->op_t
),
9148 min_last_complete_ondisk
,
9150 ctx
->updated_hset_history
,
9159 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
9160 OpContext
*ctx
, ObjectContextRef obc
,
9164 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
9166 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
9168 RepGather
*repop
= new RepGather(
9169 ctx
, rep_tid
, info
.last_complete
, false);
9171 repop
->start
= ceph_clock_now();
9173 repop_queue
.push_back(&repop
->queue_item
);
9176 osd
->logger
->inc(l_osd_op_wip
);
9178 dout(10) << __func__
<< ": " << *repop
<< dendl
;
9182 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
9185 ObcLockManager
&&manager
,
9187 boost::optional
<std::function
<void(void)> > &&on_complete
)
9189 RepGather
*repop
= new RepGather(
9192 std::move(on_complete
),
9199 repop
->start
= ceph_clock_now();
9201 repop_queue
.push_back(&repop
->queue_item
);
9203 osd
->logger
->inc(l_osd_op_wip
);
9205 dout(10) << __func__
<< ": " << *repop
<< dendl
;
9206 return boost::intrusive_ptr
<RepGather
>(repop
);
9209 void PrimaryLogPG::remove_repop(RepGather
*repop
)
9211 dout(20) << __func__
<< " " << *repop
<< dendl
;
9213 for (auto p
= repop
->on_finish
.begin();
9214 p
!= repop
->on_finish
.end();
9215 repop
->on_finish
.erase(p
++)) {
9219 release_object_locks(
9220 repop
->lock_manager
);
9223 osd
->logger
->dec(l_osd_op_wip
);
9226 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
9228 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
9230 ceph_tid_t rep_tid
= osd
->get_tid();
9231 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
9232 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, ops
, obc
, this));
9233 ctx
->op_t
.reset(new PGTransaction());
9234 ctx
->mtime
= ceph_clock_now();
9238 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
9240 RepGather
*repop
= new_repop(ctx
.get(), ctx
->obc
, ctx
->reqid
.tid
);
9241 dout(20) << __func__
<< " " << repop
<< dendl
;
9242 issue_repop(repop
, ctx
.get());
9249 void PrimaryLogPG::submit_log_entries(
9250 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
9251 ObcLockManager
&&manager
,
9252 boost::optional
<std::function
<void(void)> > &&_on_complete
,
9256 dout(10) << __func__
<< " " << entries
<< dendl
;
9257 assert(is_primary());
9260 if (!entries
.empty()) {
9261 assert(entries
.rbegin()->version
>= projected_last_update
);
9262 version
= projected_last_update
= entries
.rbegin()->version
;
9265 boost::intrusive_ptr
<RepGather
> repop
;
9266 boost::optional
<std::function
<void(void)> > on_complete
;
9267 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9273 std::move(_on_complete
));
9275 on_complete
= std::move(_on_complete
);
9278 pgbackend
->call_write_ordered(
9279 [this, entries
, repop
, on_complete
]() {
9280 ObjectStore::Transaction t
;
9281 eversion_t old_last_update
= info
.last_update
;
9282 merge_new_log_entries(entries
, t
);
9285 set
<pg_shard_t
> waiting_on
;
9286 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
9287 i
!= actingbackfill
.end();
9289 pg_shard_t
peer(*i
);
9290 if (peer
== pg_whoami
) continue;
9291 assert(peer_missing
.count(peer
));
9292 assert(peer_info
.count(peer
));
9293 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9295 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
9297 spg_t(info
.pgid
.pgid
, i
->shard
),
9299 get_osdmap()->get_epoch(),
9302 osd
->send_message_osd_cluster(
9303 peer
.osd
, m
, get_osdmap()->get_epoch());
9304 waiting_on
.insert(peer
);
9306 MOSDPGLog
*m
= new MOSDPGLog(
9307 peer
.shard
, pg_whoami
.shard
,
9308 info
.last_update
.epoch
,
9310 m
->log
.log
= entries
;
9311 m
->log
.tail
= old_last_update
;
9312 m
->log
.head
= info
.last_update
;
9313 osd
->send_message_osd_cluster(
9314 peer
.osd
, m
, get_osdmap()->get_epoch());
9317 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9318 ceph_tid_t rep_tid
= repop
->rep_tid
;
9319 waiting_on
.insert(pg_whoami
);
9320 log_entry_update_waiting_on
.insert(
9323 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
9325 struct OnComplete
: public Context
{
9333 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
9334 void finish(int) override
{
9336 if (!pg
->pg_has_reset_since(epoch
)) {
9337 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
9338 assert(it
!= pg
->log_entry_update_waiting_on
.end());
9339 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
9340 assert(it2
!= it
->second
.waiting_on
.end());
9341 it
->second
.waiting_on
.erase(it2
);
9342 if (it
->second
.waiting_on
.empty()) {
9343 pg
->repop_all_committed(it
->second
.repop
.get());
9344 pg
->log_entry_update_waiting_on
.erase(it
);
9350 t
.register_on_commit(
9351 new OnComplete
{this, rep_tid
, get_osdmap()->get_epoch()});
9354 struct OnComplete
: public Context
{
9356 std::function
<void(void)> on_complete
;
9360 const std::function
<void(void)> &on_complete
,
9363 on_complete(std::move(on_complete
)),
9365 void finish(int) override
{
9367 if (!pg
->pg_has_reset_since(epoch
))
9372 t
.register_on_complete(
9374 this, *on_complete
, get_osdmap()->get_epoch()
9378 t
.register_on_applied(
9379 new C_OSD_OnApplied
{this, get_osdmap()->get_epoch(), info
.last_update
});
9380 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
9385 void PrimaryLogPG::cancel_log_updates()
9387 // get rid of all the LogUpdateCtx so their references to repops are
9389 log_entry_update_waiting_on
.clear();
9392 // -------------------------------------------------------
9394 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> &pg_watchers
)
9396 pair
<hobject_t
, ObjectContextRef
> i
;
9397 while (object_contexts
.get_next(i
.first
, &i
)) {
9398 ObjectContextRef
obc(i
.second
);
9399 get_obc_watchers(obc
, pg_watchers
);
9403 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
9405 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
9406 obc
->watchers
.begin();
9407 j
!= obc
->watchers
.end();
9409 obj_watch_item_t owi
;
9411 owi
.obj
= obc
->obs
.oi
.soid
;
9412 owi
.wi
.addr
= j
->second
->get_peer_addr();
9413 owi
.wi
.name
= j
->second
->get_entity();
9414 owi
.wi
.cookie
= j
->second
->get_cookie();
9415 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
9417 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
9418 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
9420 pg_watchers
.push_back(owi
);
9424 void PrimaryLogPG::check_blacklisted_watchers()
9426 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl
;
9427 pair
<hobject_t
, ObjectContextRef
> i
;
9428 while (object_contexts
.get_next(i
.first
, &i
))
9429 check_blacklisted_obc_watchers(i
.second
);
9432 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc
)
9434 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
9435 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
9436 obc
->watchers
.begin();
9437 k
!= obc
->watchers
.end();
9439 //Advance iterator now so handle_watch_timeout() can erase element
9440 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
9441 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
9442 entity_addr_t ea
= j
->second
->get_peer_addr();
9443 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
9444 if (get_osdmap()->is_blacklisted(ea
)) {
9445 dout(10) << "watch: Found blacklisted watcher for " << ea
<< dendl
;
9446 assert(j
->second
->get_pg() == this);
9447 j
->second
->unregister_cb();
9448 handle_watch_timeout(j
->second
);
9453 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
9455 assert(is_active());
9456 assert((recovering
.count(obc
->obs
.oi
.soid
) ||
9457 !is_missing_object(obc
->obs
.oi
.soid
)) ||
9458 (pg_log
.get_log().objects
.count(obc
->obs
.oi
.soid
) && // or this is a revert... see recover_primary()
9459 pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
)->second
->op
==
9460 pg_log_entry_t::LOST_REVERT
&&
9461 pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
)->second
->reverting_to
==
9462 obc
->obs
.oi
.version
));
9464 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
9465 assert(obc
->watchers
.empty());
9466 // populate unconnected_watchers
9467 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
9468 obc
->obs
.oi
.watchers
.begin();
9469 p
!= obc
->obs
.oi
.watchers
.end();
9471 utime_t expire
= info
.stats
.last_became_active
;
9472 expire
+= p
->second
.timeout_seconds
;
9473 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
9475 Watch::makeWatchRef(
9476 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
9477 p
->first
.second
, p
->second
.addr
));
9478 watch
->disconnect();
9479 obc
->watchers
.insert(
9481 make_pair(p
->first
.first
, p
->first
.second
),
9484 // Look for watchers from blacklisted clients and drop
9485 check_blacklisted_obc_watchers(obc
);
9488 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
9490 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
9491 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
9494 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
9497 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
9498 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
9499 watch
->get_delayed_cb()
9501 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9507 if (scrubber
.write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
9508 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9511 scrubber
.add_callback(
9512 watch
->get_delayed_cb() // This callback!
9517 OpContextUPtr ctx
= simple_opc_create(obc
);
9518 ctx
->at_version
= get_next_version();
9520 object_info_t
& oi
= ctx
->new_obs
.oi
;
9521 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
9522 watch
->get_entity()));
9524 list
<watch_disconnect_t
> watch_disconnects
= {
9525 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
9527 ctx
->register_on_success(
9528 [this, obc
, watch_disconnects
]() {
9529 complete_disconnect_watches(obc
, watch_disconnects
);
9533 PGTransaction
*t
= ctx
->op_t
.get();
9534 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
9538 osd_reqid_t(), ctx
->mtime
, 0));
9540 oi
.prior_version
= obc
->obs
.oi
.version
;
9541 oi
.version
= ctx
->at_version
;
9543 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
9544 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
9546 // apply new object state.
9547 ctx
->obc
->obs
= ctx
->new_obs
;
9549 // no ctx->delta_stats
9550 simple_opc_submit(std::move(ctx
));
9553 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
9554 SnapSetContext
*ssc
)
9556 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
9557 assert(obc
->destructor_callback
== NULL
);
9558 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
9560 obc
->obs
.exists
= false;
9563 register_snapset_context(ssc
);
9564 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
9566 populate_obc_watchers(obc
);
9570 ObjectContextRef
PrimaryLogPG::get_object_context(
9571 const hobject_t
& soid
,
9573 const map
<string
, bufferlist
> *attrs
)
9576 attrs
|| !pg_log
.get_missing().is_missing(soid
) ||
9577 // or this is a revert... see recover_primary()
9578 (pg_log
.get_log().objects
.count(soid
) &&
9579 pg_log
.get_log().objects
.find(soid
)->second
->op
==
9580 pg_log_entry_t::LOST_REVERT
));
9581 ObjectContextRef obc
= object_contexts
.lookup(soid
);
9582 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
9584 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
9585 dout(10) << __func__
<< ": found obc in cache: " << obc
9588 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
9592 assert(attrs
->count(OI_ATTR
));
9593 bv
= attrs
->find(OI_ATTR
)->second
;
9595 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
9598 dout(10) << __func__
<< ": no obc for soid "
9599 << soid
<< " and !can_create"
9601 return ObjectContextRef(); // -ENOENT!
9604 dout(10) << __func__
<< ": no obc for soid "
9605 << soid
<< " but can_create"
9608 object_info_t
oi(soid
);
9609 SnapSetContext
*ssc
= get_snapset_context(
9610 soid
, true, 0, false);
9612 obc
= create_object_context(oi
, ssc
);
9613 dout(10) << __func__
<< ": " << obc
<< " " << soid
9614 << " " << obc
->rwstate
9615 << " oi: " << obc
->obs
.oi
9616 << " ssc: " << obc
->ssc
9617 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
9624 bufferlist::iterator bliter
= bv
.begin();
9625 ::decode(oi
, bliter
);
9627 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
9628 return ObjectContextRef(); // -ENOENT!
9631 assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
9633 obc
= object_contexts
.lookup_or_create(oi
.soid
);
9634 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
9636 obc
->obs
.exists
= true;
9638 obc
->ssc
= get_snapset_context(
9640 soid
.has_snapset() ? attrs
: 0);
9643 populate_obc_watchers(obc
);
9645 if (pool
.info
.require_rollback()) {
9647 obc
->attr_cache
= *attrs
;
9649 int r
= pgbackend
->objects_get_attrs(
9656 dout(10) << __func__
<< ": creating obc from disk: " << obc
9660 // XXX: Caller doesn't expect this
9661 if (obc
->ssc
== NULL
) {
9662 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
9663 return ObjectContextRef(); // -ENOENT!
9666 dout(10) << __func__
<< ": " << obc
<< " " << soid
9667 << " " << obc
->rwstate
9668 << " oi: " << obc
->obs
.oi
9669 << " exists: " << (int)obc
->obs
.exists
9670 << " ssc: " << obc
->ssc
9671 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
9675 void PrimaryLogPG::context_registry_on_change()
9677 pair
<hobject_t
, ObjectContextRef
> i
;
9678 while (object_contexts
.get_next(i
.first
, &i
)) {
9679 ObjectContextRef
obc(i
.second
);
9681 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
9682 obc
->watchers
.begin();
9683 j
!= obc
->watchers
.end();
9684 obc
->watchers
.erase(j
++)) {
9685 j
->second
->discard();
9693 * If we return an error, and set *pmissing, then promoting that
9696 * If we return -EAGAIN, we will always set *pmissing to the missing
9697 * object to wait for.
9699 * If we return an error but do not set *pmissing, then we know the
9700 * object does not exist.
9702 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
9703 ObjectContextRef
*pobc
,
9705 bool map_snapid_to_clone
,
9706 hobject_t
*pmissing
)
9709 assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
9711 if (oid
.snap
== CEPH_NOSNAP
) {
9712 ObjectContextRef obc
= get_object_context(oid
, can_create
);
9718 dout(10) << "find_object_context " << oid
9720 << " oi=" << obc
->obs
.oi
9727 hobject_t head
= oid
.get_head();
9729 // want the snapdir?
9730 if (oid
.snap
== CEPH_SNAPDIR
) {
9731 // return head or snapdir, whichever exists.
9732 ObjectContextRef headobc
= get_object_context(head
, can_create
);
9733 ObjectContextRef obc
= headobc
;
9734 if (!obc
|| !obc
->obs
.exists
)
9735 obc
= get_object_context(oid
, can_create
);
9736 if (!obc
|| !obc
->obs
.exists
) {
9737 // if we have neither, we would want to promote the head.
9741 *pobc
= headobc
; // may be null
9744 dout(10) << "find_object_context " << oid
9746 << " oi=" << obc
->obs
.oi
9750 // always populate ssc for SNAPDIR...
9752 obc
->ssc
= get_snapset_context(
9758 if (!map_snapid_to_clone
&& pool
.info
.is_removed_snap(oid
.snap
)) {
9759 dout(10) << __func__
<< " snap " << oid
.snap
<< " is removed" << dendl
;
9763 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
9764 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
9765 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
9767 *pmissing
= head
; // start by getting the head
9769 put_snapset_context(ssc
);
9773 if (map_snapid_to_clone
) {
9774 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9775 << " snapset " << ssc
->snapset
9776 << " map_snapid_to_clone=true" << dendl
;
9777 if (oid
.snap
> ssc
->snapset
.seq
) {
9778 // already must be readable
9779 ObjectContextRef obc
= get_object_context(head
, false);
9780 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9781 << " snapset " << ssc
->snapset
9782 << " maps to head" << dendl
;
9784 put_snapset_context(ssc
);
9785 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
9787 vector
<snapid_t
>::const_iterator citer
= std::find(
9788 ssc
->snapset
.clones
.begin(),
9789 ssc
->snapset
.clones
.end(),
9791 if (citer
== ssc
->snapset
.clones
.end()) {
9792 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9793 << " snapset " << ssc
->snapset
9794 << " maps to nothing" << dendl
;
9795 put_snapset_context(ssc
);
9799 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9800 << " snapset " << ssc
->snapset
9801 << " maps to " << oid
<< dendl
;
9803 if (pg_log
.get_missing().is_missing(oid
)) {
9804 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9805 << " snapset " << ssc
->snapset
9806 << " " << oid
<< " is missing" << dendl
;
9809 put_snapset_context(ssc
);
9813 ObjectContextRef obc
= get_object_context(oid
, false);
9814 if (!obc
|| !obc
->obs
.exists
) {
9815 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9816 << " snapset " << ssc
->snapset
9817 << " " << oid
<< " is not present" << dendl
;
9820 put_snapset_context(ssc
);
9823 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9824 << " snapset " << ssc
->snapset
9825 << " " << oid
<< " HIT" << dendl
;
9827 put_snapset_context(ssc
);
9830 ceph_abort(); //unreachable
9833 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9834 << " snapset " << ssc
->snapset
<< dendl
;
9837 if (oid
.snap
> ssc
->snapset
.seq
) {
9838 if (ssc
->snapset
.head_exists
) {
9839 ObjectContextRef obc
= get_object_context(head
, false);
9840 dout(10) << "find_object_context " << head
9841 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
9842 << " -- HIT " << obc
->obs
9847 assert(ssc
== obc
->ssc
);
9848 put_snapset_context(ssc
);
9853 dout(10) << "find_object_context " << head
9854 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
9855 << " but head dne -- DNE"
9857 put_snapset_context(ssc
);
9861 // which clone would it be?
9863 while (k
< ssc
->snapset
.clones
.size() &&
9864 ssc
->snapset
.clones
[k
] < oid
.snap
)
9866 if (k
== ssc
->snapset
.clones
.size()) {
9867 dout(10) << "find_object_context no clones with last >= oid.snap "
9868 << oid
.snap
<< " -- DNE" << dendl
;
9869 put_snapset_context(ssc
);
9872 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
9873 info
.pgid
.pool(), oid
.get_namespace());
9875 if (pg_log
.get_missing().is_missing(soid
)) {
9876 dout(20) << "find_object_context " << soid
<< " missing, try again later"
9880 put_snapset_context(ssc
);
9884 ObjectContextRef obc
= get_object_context(soid
, false);
9885 if (!obc
|| !obc
->obs
.exists
) {
9886 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
9889 put_snapset_context(ssc
);
9896 assert(obc
->ssc
== ssc
);
9897 put_snapset_context(ssc
);
9902 dout(20) << "find_object_context " << soid
9903 << " snapset " << obc
->ssc
->snapset
9904 << " legacy_snaps " << obc
->obs
.oi
.legacy_snaps
9906 snapid_t first
, last
;
9907 if (obc
->ssc
->snapset
.is_legacy()) {
9908 first
= obc
->obs
.oi
.legacy_snaps
.back();
9909 last
= obc
->obs
.oi
.legacy_snaps
.front();
9911 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
9912 assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
9913 first
= p
->second
.back();
9914 last
= p
->second
.front();
9916 if (first
<= oid
.snap
) {
9917 dout(20) << "find_object_context " << soid
<< " [" << first
<< "," << last
9918 << "] contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
9922 dout(20) << "find_object_context " << soid
<< " [" << first
<< "," << last
9923 << "] does not contain " << oid
.snap
<< " -- DNE" << dendl
;
9928 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
9931 put_snapset_context(obc
->ssc
);
9934 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
9936 object_info_t
& oi
= obc
->obs
.oi
;
9938 dout(10) << "add_object_context_to_pg_stat " << oi
.soid
<< dendl
;
9939 object_stat_sum_t stat
;
9941 stat
.num_bytes
+= oi
.size
;
9943 if (oi
.soid
.snap
!= CEPH_SNAPDIR
)
9946 stat
.num_objects_dirty
++;
9947 if (oi
.is_whiteout())
9948 stat
.num_whiteouts
++;
9950 stat
.num_objects_omap
++;
9951 if (oi
.is_cache_pinned())
9952 stat
.num_objects_pinned
++;
9954 if (oi
.soid
.snap
&& oi
.soid
.snap
!= CEPH_NOSNAP
&& oi
.soid
.snap
!= CEPH_SNAPDIR
) {
9955 stat
.num_object_clones
++;
9958 obc
->ssc
= get_snapset_context(oi
.soid
, false);
9961 // subtract off clone overlap
9962 if (obc
->ssc
->snapset
.clone_overlap
.count(oi
.soid
.snap
)) {
9963 interval_set
<uint64_t>& o
= obc
->ssc
->snapset
.clone_overlap
[oi
.soid
.snap
];
9964 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
9967 stat
.num_bytes
-= r
.get_len();
9973 pgstat
->stats
.sum
.add(stat
);
9976 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
9978 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
9979 if (obc
->is_blocked()) {
9980 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
9984 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
9985 if (p
!= waiting_for_blocked_object
.end()) {
9986 list
<OpRequestRef
>& ls
= p
->second
;
9987 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
9989 waiting_for_blocked_object
.erase(p
);
9992 map
<hobject_t
, ObjectContextRef
>::iterator i
=
9993 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
9994 if (i
!= objects_blocked_on_snap_promotion
.end()) {
9995 assert(i
->second
== obc
);
9996 objects_blocked_on_snap_promotion
.erase(i
);
9999 if (obc
->requeue_scrub_on_unblock
) {
10000 obc
->requeue_scrub_on_unblock
= false;
10005 SnapSetContext
*PrimaryLogPG::get_snapset_context(
10006 const hobject_t
& oid
,
10008 const map
<string
, bufferlist
> *attrs
,
10011 Mutex::Locker
l(snapset_contexts_lock
);
10012 SnapSetContext
*ssc
;
10013 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
10014 oid
.get_snapdir());
10015 if (p
!= snapset_contexts
.end()) {
10016 if (can_create
|| p
->second
->exists
) {
10025 if (!(oid
.is_head() && !oid_existed
))
10026 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
10029 if (!(oid
.is_snapdir() && !oid_existed
))
10030 r
= pgbackend
->objects_get_attr(oid
.get_snapdir(), SS_ATTR
, &bv
);
10031 if (r
< 0 && !can_create
)
10035 assert(attrs
->count(SS_ATTR
));
10036 bv
= attrs
->find(SS_ATTR
)->second
;
10038 ssc
= new SnapSetContext(oid
.get_snapdir());
10039 _register_snapset_context(ssc
);
10041 bufferlist::iterator bvp
= bv
.begin();
10043 ssc
->snapset
.decode(bvp
);
10044 } catch (buffer::error
& e
) {
10045 dout(0) << __func__
<< " Can't decode snapset: " << e
<< dendl
;
10048 ssc
->exists
= true;
10050 ssc
->exists
= false;
10058 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
10060 Mutex::Locker
l(snapset_contexts_lock
);
10062 if (ssc
->ref
== 0) {
10063 if (ssc
->registered
)
10064 snapset_contexts
.erase(ssc
->oid
);
10069 /** pull - request object from a peer
10074 * NONE - didn't pull anything
10075 * YES - pulled what the caller wanted
10076 * OTHER - needed to pull something else first (_head or _snapdir)
10078 enum { PULL_NONE
, PULL_OTHER
, PULL_YES
};
10080 int PrimaryLogPG::recover_missing(
10081 const hobject_t
&soid
, eversion_t v
,
10083 PGBackend::RecoveryHandle
*h
)
10085 if (missing_loc
.is_unfound(soid
)) {
10086 dout(7) << "pull " << soid
10088 << " but it is unfound" << dendl
;
10092 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
10093 ObjectContextRef obc
;
10094 ObjectContextRef head_obc
;
10095 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
10096 // do we have the head and/or snapdir?
10097 hobject_t head
= soid
.get_head();
10098 if (pg_log
.get_missing().is_missing(head
)) {
10099 if (recovering
.count(head
)) {
10100 dout(10) << " missing but already recovering head " << head
<< dendl
;
10103 int r
= recover_missing(
10104 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
10106 if (r
!= PULL_NONE
)
10111 head
= soid
.get_snapdir();
10112 if (pg_log
.get_missing().is_missing(head
)) {
10113 if (recovering
.count(head
)) {
10114 dout(10) << " missing but already recovering snapdir " << head
<< dendl
;
10117 int r
= recover_missing(
10118 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
10120 if (r
!= PULL_NONE
)
10126 // we must have one or the other
10127 head_obc
= get_object_context(
10132 head_obc
= get_object_context(
10133 soid
.get_snapdir(),
10138 start_recovery_op(soid
);
10139 assert(!recovering
.count(soid
));
10140 recovering
.insert(make_pair(soid
, obc
));
10141 int r
= pgbackend
->recover_object(
10147 // This is only a pull which shouldn't return an error
10152 void PrimaryLogPG::send_remove_op(
10153 const hobject_t
& oid
, eversion_t v
, pg_shard_t peer
)
10155 ceph_tid_t tid
= osd
->get_tid();
10156 osd_reqid_t
rid(osd
->get_cluster_msgr_name(), 0, tid
);
10158 dout(10) << "send_remove_op " << oid
<< " from osd." << peer
10159 << " tid " << tid
<< dendl
;
10161 MOSDSubOp
*subop
= new MOSDSubOp(
10162 rid
, pg_whoami
, spg_t(info
.pgid
.pgid
, peer
.shard
),
10163 oid
, CEPH_OSD_FLAG_ACK
,
10164 get_osdmap()->get_epoch(), tid
, v
);
10165 subop
->ops
= vector
<OSDOp
>(1);
10166 subop
->ops
[0].op
.op
= CEPH_OSD_OP_DELETE
;
10168 osd
->send_message_osd_cluster(peer
.osd
, subop
, get_osdmap()->get_epoch());
10172 void PrimaryLogPG::finish_degraded_object(const hobject_t
& oid
)
10174 dout(10) << "finish_degraded_object " << oid
<< dendl
;
10175 ObjectContextRef
obc(object_contexts
.lookup(oid
));
10176 if (callbacks_for_degraded_object
.count(oid
)) {
10177 list
<Context
*> contexts
;
10178 contexts
.swap(callbacks_for_degraded_object
[oid
]);
10179 callbacks_for_degraded_object
.erase(oid
);
10180 for (list
<Context
*>::iterator i
= contexts
.begin();
10181 i
!= contexts
.end();
10186 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
10188 if (i
!= objects_blocked_on_degraded_snap
.end() &&
10189 i
->second
== oid
.snap
)
10190 objects_blocked_on_degraded_snap
.erase(i
);
10193 void PrimaryLogPG::_committed_pushed_object(
10194 epoch_t epoch
, eversion_t last_complete
)
10197 if (!pg_has_reset_since(epoch
)) {
10198 dout(10) << "_committed_pushed_object last_complete " << last_complete
<< " now ondisk" << dendl
;
10199 last_complete_ondisk
= last_complete
;
10201 if (last_complete_ondisk
== info
.last_update
) {
10202 if (!is_primary()) {
10203 // Either we are a replica or backfill target.
10204 // we are fully up to date. tell the primary!
10205 osd
->send_message_osd_cluster(
10208 get_osdmap()->get_epoch(),
10209 spg_t(info
.pgid
.pgid
, get_primary().shard
),
10210 last_complete_ondisk
),
10211 get_osdmap()->get_epoch());
10213 calc_min_last_complete_ondisk();
10218 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl
;
10224 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
10227 dout(10) << "_applied_recovered_object " << *obc
<< dendl
;
10229 assert(active_pushes
>= 1);
10232 // requeue an active chunky scrub waiting on recovery ops
10233 if (!deleting
&& active_pushes
== 0
10234 && scrubber
.is_chunky_scrub_active()) {
10235 if (ops_blocked_by_scrub()) {
10236 requeue_scrub(true);
10238 requeue_scrub(false);
10245 void PrimaryLogPG::_applied_recovered_object_replica()
10248 dout(10) << "_applied_recovered_object_replica" << dendl
;
10250 assert(active_pushes
>= 1);
10253 // requeue an active chunky scrub waiting on recovery ops
10254 if (!deleting
&& active_pushes
== 0 &&
10255 scrubber
.active_rep_scrub
&& static_cast<const MOSDRepScrub
*>(
10256 scrubber
.active_rep_scrub
->get_req())->chunky
) {
10259 PGQueueable(scrubber
.active_rep_scrub
, get_osdmap()->get_epoch()));
10260 scrubber
.active_rep_scrub
= OpRequestRef();
10266 void PrimaryLogPG::recover_got(hobject_t oid
, eversion_t v
)
10268 dout(10) << "got missing " << oid
<< " v " << v
<< dendl
;
10269 pg_log
.recover_got(oid
, v
, info
);
10270 if (pg_log
.get_log().complete_to
!= pg_log
.get_log().log
.end()) {
10271 dout(10) << "last_complete now " << info
.last_complete
10272 << " log.complete_to " << pg_log
.get_log().complete_to
->version
10275 dout(10) << "last_complete now " << info
.last_complete
10276 << " log.complete_to at end" << dendl
;
10277 //below is not true in the repair case.
10278 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10279 assert(info
.last_complete
== info
.last_update
);
10283 void PrimaryLogPG::primary_failed(const hobject_t
&soid
)
10285 list
<pg_shard_t
> fl
= { pg_whoami
};
10286 failed_push(fl
, soid
);
10289 void PrimaryLogPG::failed_push(const list
<pg_shard_t
> &from
, const hobject_t
&soid
)
10291 dout(20) << __func__
<< ": " << soid
<< dendl
;
10292 assert(recovering
.count(soid
));
10293 auto obc
= recovering
[soid
];
10295 list
<OpRequestRef
> blocked_ops
;
10296 obc
->drop_recovery_read(&blocked_ops
);
10297 requeue_ops(blocked_ops
);
10299 recovering
.erase(soid
);
10300 for (auto&& i
: from
)
10301 missing_loc
.remove_location(soid
, i
);
10302 dout(0) << __func__
<< " " << soid
<< " from shard " << from
10303 << ", reps on " << missing_loc
.get_locations(soid
)
10304 << " unfound? " << missing_loc
.is_unfound(soid
) << dendl
;
10305 finish_recovery_op(soid
); // close out this attempt,
10308 void PrimaryLogPG::sub_op_remove(OpRequestRef op
)
10310 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
10311 assert(m
->get_type() == MSG_OSD_SUBOP
);
10312 dout(7) << "sub_op_remove " << m
->poid
<< dendl
;
10314 op
->mark_started();
10316 ObjectStore::Transaction t
;
10317 remove_snap_mapped_object(t
, m
->poid
);
10318 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
10322 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
10325 pg_missing_item pmi
;
10326 bool is_missing
= pg_log
.get_missing().is_missing(oid
, &pmi
);
10327 assert(is_missing
);
10329 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
10331 assert(!actingbackfill
.empty());
10332 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
10333 i
!= actingbackfill
.end();
10335 if (*i
== get_primary()) continue;
10336 pg_shard_t peer
= *i
;
10337 if (!peer_missing
[peer
].is_missing(oid
)) {
10340 eversion_t h
= peer_missing
[peer
].get_items().at(oid
).have
;
10341 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
10346 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
10350 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
10352 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
10354 assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
10355 ObjectStore::Transaction t
;
10356 append_log_entries_update_missing(m
->entries
, t
);
10358 Context
*complete
= new FunctionContext(
10360 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
10363 if (!pg_has_reset_since(msg
->get_epoch())) {
10364 MOSDPGUpdateLogMissingReply
*reply
=
10365 new MOSDPGUpdateLogMissingReply(
10366 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
10371 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
10372 msg
->get_connection()->send_message(reply
);
10377 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
10378 t
.register_on_commit(complete
);
10380 /* Hack to work around the fact that ReplicatedBackend sends
10381 * ack+commit if commit happens first
10383 * This behavior is no longer necessary, but we preserve it so old
10384 * primaries can keep their repops in order */
10385 if (pool
.info
.ec_pool()) {
10386 t
.register_on_complete(complete
);
10388 t
.register_on_commit(complete
);
10391 t
.register_on_applied(
10392 new C_OSD_OnApplied
{this, get_osdmap()->get_epoch(), info
.last_update
});
10393 int tr
= osd
->store
->queue_transaction(
10400 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
10402 const MOSDPGUpdateLogMissingReply
*m
=
10403 static_cast<const MOSDPGUpdateLogMissingReply
*>(
10405 dout(20) << __func__
<< " got reply from "
10406 << m
->get_from() << dendl
;
10408 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
10409 if (it
!= log_entry_update_waiting_on
.end()) {
10410 if (it
->second
.waiting_on
.count(m
->get_from())) {
10411 it
->second
.waiting_on
.erase(m
->get_from());
10414 << info
.pgid
<< " got reply "
10415 << *m
<< " from shard we are not waiting for "
10419 if (it
->second
.waiting_on
.empty()) {
10420 repop_all_committed(it
->second
.repop
.get());
10421 log_entry_update_waiting_on
.erase(it
);
10425 << info
.pgid
<< " got reply "
10426 << *m
<< " on unknown tid " << m
->get_tid();
10430 /* Mark all unfound objects as lost.
10432 void PrimaryLogPG::mark_all_unfound_lost(
10437 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
10438 list
<hobject_t
> oids
;
10440 dout(30) << __func__
<< ": log before:\n";
10441 pg_log
.get_log().print(*_dout
);
10444 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
10446 utime_t mtime
= ceph_clock_now();
10447 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
10448 missing_loc
.get_needs_recovery().begin();
10449 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
10450 missing_loc
.get_needs_recovery().end();
10452 ObcLockManager manager
;
10453 eversion_t v
= get_next_version();
10454 v
.epoch
= get_osdmap()->get_epoch();
10455 uint64_t num_unfound
= missing_loc
.num_unfound();
10456 while (m
!= mend
) {
10457 const hobject_t
&oid(m
->first
);
10458 if (!missing_loc
.is_unfound(oid
)) {
10459 // We only care about unfound objects
10464 ObjectContextRef obc
;
10468 case pg_log_entry_t::LOST_MARK
:
10469 assert(0 == "actually, not implemented yet!");
10472 case pg_log_entry_t::LOST_REVERT
:
10473 prev
= pick_newest_available(oid
);
10474 if (prev
> eversion_t()) {
10477 pg_log_entry_t::LOST_REVERT
, oid
, v
,
10478 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
10479 e
.reverting_to
= prev
;
10480 e
.mark_unrollbackable();
10481 log_entries
.push_back(e
);
10482 dout(10) << e
<< dendl
;
10484 // we are now missing the new version; recovery code will sort it out.
10490 case pg_log_entry_t::LOST_DELETE
:
10492 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
10493 0, osd_reqid_t(), mtime
, 0);
10494 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
10495 if (pool
.info
.require_rollback()) {
10496 e
.mod_desc
.try_rmobject(v
.version
);
10498 e
.mark_unrollbackable();
10500 } // otherwise, just do what we used to do
10501 dout(10) << e
<< dendl
;
10502 log_entries
.push_back(e
);
10503 oids
.push_back(oid
);
10515 info
.stats
.stats_invalid
= true;
10517 submit_log_entries(
10519 std::move(manager
),
10520 boost::optional
<std::function
<void(void)> >(
10521 [this, oids
, con
, num_unfound
, tid
]() {
10522 for (auto oid
: oids
)
10523 missing_loc
.recovered(oid
);
10524 for (auto& p
: waiting_for_unreadable_object
) {
10525 release_backoffs(p
.first
);
10527 requeue_object_waiters(waiting_for_unreadable_object
);
10531 ss
<< "pg has " << num_unfound
10532 << " objects unfound and apparently lost marking";
10533 string rs
= ss
.str();
10534 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
10535 osd
->clog
->info() << rs
;
10537 MCommandReply
*reply
= new MCommandReply(0, rs
);
10538 reply
->set_tid(tid
);
10539 con
->send_message(reply
);
10545 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
10547 assert(repop_queue
.empty());
10551 * pg status change notification
10554 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
10556 list
<OpRequestRef
> rq
;
10558 // apply all repops
10559 while (!repop_queue
.empty()) {
10560 RepGather
*repop
= repop_queue
.front();
10561 repop_queue
.pop_front();
10562 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
10563 repop
->rep_aborted
= true;
10564 repop
->on_applied
.clear();
10565 repop
->on_committed
.clear();
10566 repop
->on_success
.clear();
10570 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
10571 rq
.push_back(repop
->op
);
10572 repop
->op
= OpRequestRef();
10575 // also requeue any dups, interleaved into position
10576 map
<eversion_t
, list
<pair
<OpRequestRef
, version_t
> > >::iterator p
=
10577 waiting_for_ondisk
.find(repop
->v
);
10578 if (p
!= waiting_for_ondisk
.end()) {
10579 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
10580 for (list
<pair
<OpRequestRef
, version_t
> >::iterator i
=
10582 i
!= p
->second
.end();
10584 rq
.push_back(i
->first
);
10586 waiting_for_ondisk
.erase(p
);
10590 remove_repop(repop
);
10593 assert(repop_queue
.empty());
10597 if (!waiting_for_ondisk
.empty()) {
10598 for (map
<eversion_t
, list
<pair
<OpRequestRef
, version_t
> > >::iterator i
=
10599 waiting_for_ondisk
.begin();
10600 i
!= waiting_for_ondisk
.end();
10602 for (list
<pair
<OpRequestRef
, version_t
> >::iterator j
=
10604 j
!= i
->second
.end();
10606 derr
<< __func__
<< ": op " << *(j
->first
->get_req()) << " waiting on "
10607 << i
->first
<< dendl
;
10610 assert(waiting_for_ondisk
.empty());
10614 waiting_for_ondisk
.clear();
10617 void PrimaryLogPG::on_flushed()
10619 assert(flushes_in_progress
> 0);
10620 flushes_in_progress
--;
10621 if (flushes_in_progress
== 0) {
10622 requeue_ops(waiting_for_peered
);
10624 if (!is_peered() || !is_primary()) {
10625 pair
<hobject_t
, ObjectContextRef
> i
;
10626 while (object_contexts
.get_next(i
.first
, &i
)) {
10627 derr
<< "on_flushed: object " << i
.first
<< " obc still alive" << dendl
;
10629 assert(object_contexts
.empty());
10631 pgbackend
->on_flushed();
10634 void PrimaryLogPG::on_removal(ObjectStore::Transaction
*t
)
10636 dout(10) << "on_removal" << dendl
;
10638 // adjust info to backfill
10639 info
.set_last_backfill(hobject_t());
10640 pg_log
.reset_backfill();
10645 PGLogEntryHandler rollbacker
{this, t
};
10646 pg_log
.roll_forward(&rollbacker
);
10648 write_if_dirty(*t
);
10654 void PrimaryLogPG::on_shutdown()
10656 dout(10) << "on_shutdown" << dendl
;
10658 // remove from queues
10659 osd
->pg_stat_queue_dequeue(this);
10660 osd
->peering_wq
.dequeue(this);
10662 // handles queue races
10665 if (recovery_queued
) {
10666 recovery_queued
= false;
10667 osd
->clear_queued_recovery(this);
10670 clear_scrub_reserved();
10671 scrub_clear_state();
10673 unreg_next_scrub();
10674 cancel_copy_ops(false);
10675 cancel_flush_ops(false);
10676 cancel_proxy_ops(false);
10677 apply_and_flush_repops(false);
10678 cancel_log_updates();
10679 // we must remove PGRefs, so do this this prior to release_backoffs() callers
10681 // clean up snap trim references
10682 snap_trimmer_machine
.process_event(Reset());
10684 pgbackend
->on_change();
10686 context_registry_on_change();
10687 object_contexts
.clear();
10689 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
10690 osd
->local_reserver
.cancel_reservation(info
.pgid
);
10692 clear_primary_state();
10696 void PrimaryLogPG::on_activate()
10699 if (needs_recovery()) {
10700 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
10701 queue_peering_event(
10703 std::make_shared
<CephPeeringEvt
>(
10704 get_osdmap()->get_epoch(),
10705 get_osdmap()->get_epoch(),
10707 } else if (needs_backfill()) {
10708 dout(10) << "activate queueing backfill" << dendl
;
10709 queue_peering_event(
10711 std::make_shared
<CephPeeringEvt
>(
10712 get_osdmap()->get_epoch(),
10713 get_osdmap()->get_epoch(),
10714 RequestBackfill())));
10716 dout(10) << "activate all replicas clean, no recovery" << dendl
;
10717 eio_errors_to_process
= false;
10718 queue_peering_event(
10720 std::make_shared
<CephPeeringEvt
>(
10721 get_osdmap()->get_epoch(),
10722 get_osdmap()->get_epoch(),
10723 AllReplicasRecovered())));
10726 publish_stats_to_osd();
10728 if (!backfill_targets
.empty()) {
10729 last_backfill_started
= earliest_backfill();
10730 new_backfill
= true;
10731 assert(!last_backfill_started
.is_max());
10732 dout(5) << "on activate: bft=" << backfill_targets
10733 << " from " << last_backfill_started
<< dendl
;
10734 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
10735 i
!= backfill_targets
.end();
10737 dout(5) << "target shard " << *i
10738 << " from " << peer_info
[*i
].last_backfill
10747 void PrimaryLogPG::_on_new_interval()
10751 void PrimaryLogPG::on_change(ObjectStore::Transaction
*t
)
10753 dout(10) << "on_change" << dendl
;
10755 if (hit_set
&& hit_set
->insert_count() == 0) {
10756 dout(20) << " discarding empty hit_set" << dendl
;
10760 if (recovery_queued
) {
10761 recovery_queued
= false;
10762 osd
->clear_queued_recovery(this);
10765 // requeue everything in the reverse order they should be
10767 requeue_ops(waiting_for_peered
);
10768 requeue_ops(waiting_for_active
);
10770 clear_scrub_reserved();
10772 cancel_copy_ops(is_primary());
10773 cancel_flush_ops(is_primary());
10774 cancel_proxy_ops(is_primary());
10776 // requeue object waiters
10777 for (auto& p
: waiting_for_unreadable_object
) {
10778 release_backoffs(p
.first
);
10780 if (is_primary()) {
10781 requeue_object_waiters(waiting_for_unreadable_object
);
10783 waiting_for_unreadable_object
.clear();
10785 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
10786 p
!= waiting_for_degraded_object
.end();
10787 waiting_for_degraded_object
.erase(p
++)) {
10788 release_backoffs(p
->first
);
10790 requeue_ops(p
->second
);
10793 finish_degraded_object(p
->first
);
10796 // requeues waiting_for_scrub
10797 scrub_clear_state();
10799 for (auto p
= waiting_for_blocked_object
.begin();
10800 p
!= waiting_for_blocked_object
.end();
10801 waiting_for_blocked_object
.erase(p
++)) {
10803 requeue_ops(p
->second
);
10807 for (auto i
= callbacks_for_degraded_object
.begin();
10808 i
!= callbacks_for_degraded_object
.end();
10810 finish_degraded_object((i
++)->first
);
10812 assert(callbacks_for_degraded_object
.empty());
10814 if (is_primary()) {
10815 requeue_ops(waiting_for_cache_not_full
);
10817 waiting_for_cache_not_full
.clear();
10819 objects_blocked_on_cache_full
.clear();
10821 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
10822 in_progress_async_reads
.begin();
10823 i
!= in_progress_async_reads
.end();
10824 in_progress_async_reads
.erase(i
++)) {
10825 close_op_ctx(i
->second
);
10827 requeue_op(i
->first
);
10830 // this will requeue ops we were working on but didn't finish, and
10832 apply_and_flush_repops(is_primary());
10833 cancel_log_updates();
10835 // do this *after* apply_and_flush_repops so that we catch any newly
10836 // registered watches.
10837 context_registry_on_change();
10839 pgbackend
->on_change_cleanup(t
);
10840 scrubber
.cleanup_store(t
);
10841 pgbackend
->on_change();
10843 // clear snap_trimmer state
10844 snap_trimmer_machine
.process_event(Reset());
10846 debug_op_order
.clear();
10847 unstable_stats
.clear();
10849 // we don't want to cache object_contexts through the interval change
10850 // NOTE: we actually assert that all currently live references are dead
10851 // by the time the flush for the next interval completes.
10852 object_contexts
.clear();
10854 // should have been cleared above by finishing all of the degraded objects
10855 assert(objects_blocked_on_degraded_snap
.empty());
10858 void PrimaryLogPG::on_role_change()
10860 dout(10) << "on_role_change" << dendl
;
10861 if (get_role() != 0 && hit_set
) {
10862 dout(10) << " clearing hit set" << dendl
;
10867 void PrimaryLogPG::on_pool_change()
10869 dout(10) << __func__
<< dendl
;
10870 // requeue cache full waiters just in case the cache_mode is
10871 // changing away from writeback mode. note that if we are not
10872 // active the normal requeuing machinery is sufficient (and properly
10875 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10876 !waiting_for_cache_not_full
.empty()) {
10877 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
10879 requeue_ops(waiting_for_cache_not_full
);
10880 objects_blocked_on_cache_full
.clear();
10886 // clear state. called on recovery completion AND cancellation.
10887 void PrimaryLogPG::_clear_recovery_state()
10889 missing_loc
.clear();
10890 #ifdef DEBUG_RECOVERY_OIDS
10891 recovering_oids
.clear();
10893 last_backfill_started
= hobject_t();
10894 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
10895 while (i
!= backfills_in_flight
.end()) {
10896 assert(recovering
.count(*i
));
10897 backfills_in_flight
.erase(i
++);
10900 list
<OpRequestRef
> blocked_ops
;
10901 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
10902 i
!= recovering
.end();
10903 recovering
.erase(i
++)) {
10905 i
->second
->drop_recovery_read(&blocked_ops
);
10906 requeue_ops(blocked_ops
);
10909 assert(backfills_in_flight
.empty());
10910 pending_backfill_updates
.clear();
10911 assert(recovering
.empty());
10912 pgbackend
->clear_recovery_state();
10915 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
10917 dout(20) << __func__
<< ": " << soid
<< dendl
;
10918 assert(recovering
.count(soid
));
10919 ObjectContextRef obc
= recovering
[soid
];
10921 list
<OpRequestRef
> blocked_ops
;
10922 obc
->drop_recovery_read(&blocked_ops
);
10923 requeue_ops(blocked_ops
);
10925 recovering
.erase(soid
);
10926 finish_recovery_op(soid
);
10927 release_backoffs(soid
);
10928 if (waiting_for_degraded_object
.count(soid
)) {
10929 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
10930 requeue_ops(waiting_for_degraded_object
[soid
]);
10931 waiting_for_degraded_object
.erase(soid
);
10933 if (waiting_for_unreadable_object
.count(soid
)) {
10934 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
10935 requeue_ops(waiting_for_unreadable_object
[soid
]);
10936 waiting_for_unreadable_object
.erase(soid
);
10938 if (is_missing_object(soid
))
10939 pg_log
.set_last_requested(0); // get recover_primary to start over
10940 finish_degraded_object(soid
);
10943 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
10946 * check that any peers we are planning to (or currently) pulling
10947 * objects from are dealt with.
10949 missing_loc
.check_recovery_sources(osdmap
);
10950 pgbackend
->check_recovery_sources(osdmap
);
10952 for (set
<pg_shard_t
>::iterator i
= peer_log_requested
.begin();
10953 i
!= peer_log_requested
.end();
10955 if (!osdmap
->is_up(i
->osd
)) {
10956 dout(10) << "peer_log_requested removing " << *i
<< dendl
;
10957 peer_log_requested
.erase(i
++);
10963 for (set
<pg_shard_t
>::iterator i
= peer_missing_requested
.begin();
10964 i
!= peer_missing_requested
.end();
10966 if (!osdmap
->is_up(i
->osd
)) {
10967 dout(10) << "peer_missing_requested removing " << *i
<< dendl
;
10968 peer_missing_requested
.erase(i
++);
10975 void PG::MissingLoc::check_recovery_sources(const OSDMapRef
& osdmap
)
10977 set
<pg_shard_t
> now_down
;
10978 for (set
<pg_shard_t
>::iterator p
= missing_loc_sources
.begin();
10979 p
!= missing_loc_sources
.end();
10981 if (osdmap
->is_up(p
->osd
)) {
10985 ldout(pg
->cct
, 10) << "check_recovery_sources source osd." << *p
<< " now down" << dendl
;
10986 now_down
.insert(*p
);
10987 missing_loc_sources
.erase(p
++);
10990 if (now_down
.empty()) {
10991 ldout(pg
->cct
, 10) << "check_recovery_sources no source osds (" << missing_loc_sources
<< ") went down" << dendl
;
10993 ldout(pg
->cct
, 10) << "check_recovery_sources sources osds " << now_down
<< " now down, remaining sources are "
10994 << missing_loc_sources
<< dendl
;
10996 // filter missing_loc
10997 map
<hobject_t
, set
<pg_shard_t
>>::iterator p
= missing_loc
.begin();
10998 while (p
!= missing_loc
.end()) {
10999 set
<pg_shard_t
>::iterator q
= p
->second
.begin();
11000 while (q
!= p
->second
.end())
11001 if (now_down
.count(*q
)) {
11002 p
->second
.erase(q
++);
11006 if (p
->second
.empty())
11007 missing_loc
.erase(p
++);
11015 bool PrimaryLogPG::start_recovery_ops(
11017 ThreadPool::TPHandle
&handle
,
11018 uint64_t *ops_started
)
11020 uint64_t& started
= *ops_started
;
11022 bool work_in_progress
= false;
11023 assert(is_primary());
11025 if (!state_test(PG_STATE_RECOVERING
) &&
11026 !state_test(PG_STATE_BACKFILL
)) {
11027 /* TODO: I think this case is broken and will make do_recovery()
11028 * unhappy since we're returning false */
11029 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
11033 const pg_missing_t
&missing
= pg_log
.get_missing();
11035 unsigned int num_missing
= missing
.num_missing();
11036 uint64_t num_unfound
= get_num_unfound();
11038 if (num_missing
== 0) {
11039 info
.last_complete
= info
.last_update
;
11042 if (num_missing
== num_unfound
) {
11043 // All of the missing objects we have are unfound.
11044 // Recover the replicas.
11045 started
= recover_replicas(max
, handle
);
11048 // We still have missing objects that we should grab from replicas.
11049 started
+= recover_primary(max
, handle
);
11051 if (!started
&& num_unfound
!= get_num_unfound()) {
11052 // second chance to recovery replicas
11053 started
= recover_replicas(max
, handle
);
11057 work_in_progress
= true;
11059 bool deferred_backfill
= false;
11060 if (recovering
.empty() &&
11061 state_test(PG_STATE_BACKFILL
) &&
11062 !backfill_targets
.empty() && started
< max
&&
11063 missing
.num_missing() == 0 &&
11064 waiting_on_backfill
.empty()) {
11065 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
11066 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
11067 deferred_backfill
= true;
11068 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
11070 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
11071 deferred_backfill
= true;
11072 } else if (!backfill_reserved
) {
11073 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
11074 if (!backfill_reserving
) {
11075 dout(10) << "queueing RequestBackfill" << dendl
;
11076 backfill_reserving
= true;
11077 queue_peering_event(
11079 std::make_shared
<CephPeeringEvt
>(
11080 get_osdmap()->get_epoch(),
11081 get_osdmap()->get_epoch(),
11082 RequestBackfill())));
11084 deferred_backfill
= true;
11086 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
11090 dout(10) << " started " << started
<< dendl
;
11091 osd
->logger
->inc(l_osd_rop
, started
);
11093 if (!recovering
.empty() ||
11094 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
11095 return work_in_progress
;
11097 assert(recovering
.empty());
11098 assert(recovery_ops_active
== 0);
11100 dout(10) << __func__
<< " needs_recovery: "
11101 << missing_loc
.get_needs_recovery()
11103 dout(10) << __func__
<< " missing_loc: "
11104 << missing_loc
.get_missing_locs()
11106 int unfound
= get_num_unfound();
11108 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
11109 return work_in_progress
;
11112 if (missing
.num_missing() > 0) {
11113 // this shouldn't happen!
11114 osd
->clog
->error() << info
.pgid
<< " recovery ending with " << missing
.num_missing()
11115 << ": " << missing
.get_items();
11116 return work_in_progress
;
11119 if (needs_recovery()) {
11120 // this shouldn't happen!
11121 // We already checked num_missing() so we must have missing replicas
11122 osd
->clog
->error() << info
.pgid
<< " recovery ending with missing replicas";
11123 return work_in_progress
;
11126 if (state_test(PG_STATE_RECOVERING
)) {
11127 state_clear(PG_STATE_RECOVERING
);
11128 if (needs_backfill()) {
11129 dout(10) << "recovery done, queuing backfill" << dendl
;
11130 queue_peering_event(
11132 std::make_shared
<CephPeeringEvt
>(
11133 get_osdmap()->get_epoch(),
11134 get_osdmap()->get_epoch(),
11135 RequestBackfill())));
11137 dout(10) << "recovery done, no backfill" << dendl
;
11138 eio_errors_to_process
= false;
11139 queue_peering_event(
11141 std::make_shared
<CephPeeringEvt
>(
11142 get_osdmap()->get_epoch(),
11143 get_osdmap()->get_epoch(),
11144 AllReplicasRecovered())));
11146 } else { // backfilling
11147 state_clear(PG_STATE_BACKFILL
);
11148 dout(10) << "recovery done, backfill done" << dendl
;
11149 eio_errors_to_process
= false;
11150 queue_peering_event(
11152 std::make_shared
<CephPeeringEvt
>(
11153 get_osdmap()->get_epoch(),
11154 get_osdmap()->get_epoch(),
11162 * do one recovery op.
11163 * return true if done, false if nothing left to do.
11165 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
11167 assert(is_primary());
11169 const pg_missing_t
&missing
= pg_log
.get_missing();
11171 dout(10) << "recover_primary recovering " << recovering
.size()
11172 << " in pg" << dendl
;
11173 dout(10) << "recover_primary " << missing
<< dendl
;
11174 dout(25) << "recover_primary " << missing
.get_items() << dendl
;
11177 pg_log_entry_t
*latest
= 0;
11178 unsigned started
= 0;
11181 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11182 map
<version_t
, hobject_t
>::const_iterator p
=
11183 missing
.get_rmissing().lower_bound(pg_log
.get_log().last_requested
);
11184 while (p
!= missing
.get_rmissing().end()) {
11185 handle
.reset_tp_timeout();
11187 version_t v
= p
->first
;
11189 if (pg_log
.get_log().objects
.count(p
->second
)) {
11190 latest
= pg_log
.get_log().objects
.find(p
->second
)->second
;
11191 assert(latest
->is_update());
11192 soid
= latest
->soid
;
11197 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
11200 hobject_t head
= soid
.get_head();
11202 eversion_t need
= item
.need
;
11204 dout(10) << "recover_primary "
11205 << soid
<< " " << item
.need
11206 << (missing
.is_missing(soid
) ? " (missing)":"")
11207 << (missing
.is_missing(head
) ? " (missing head)":"")
11208 << (recovering
.count(soid
) ? " (recovering)":"")
11209 << (recovering
.count(head
) ? " (recovering head)":"")
11213 switch (latest
->op
) {
11214 case pg_log_entry_t::CLONE
:
11216 * Handling for this special case removed for now, until we
11217 * can correctly construct an accurate SnapSet from the old
11222 case pg_log_entry_t::LOST_REVERT
:
11224 if (item
.have
== latest
->reverting_to
) {
11225 ObjectContextRef obc
= get_object_context(soid
, true);
11227 if (obc
->obs
.oi
.version
== latest
->version
) {
11228 // I'm already reverting
11229 dout(10) << " already reverting " << soid
<< dendl
;
11231 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
11232 obc
->ondisk_write_lock();
11233 obc
->obs
.oi
.version
= latest
->version
;
11235 ObjectStore::Transaction t
;
11237 obc
->obs
.oi
.encode(
11239 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11240 assert(!pool
.info
.require_rollback());
11241 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
11243 recover_got(soid
, latest
->version
);
11244 missing_loc
.add_location(soid
, pg_whoami
);
11248 osd
->store
->queue_transaction(osr
.get(), std::move(t
),
11249 new C_OSD_AppliedRecoveredObject(this, obc
),
11250 new C_OSD_CommittedPushedObject(
11252 get_osdmap()->get_epoch(),
11253 info
.last_complete
),
11254 new C_OSD_OndiskWriteUnlock(obc
));
11259 * Pull the old version of the object. Update missing_loc here to have the location
11260 * of the version we want.
11262 * This doesn't use the usual missing_loc paths, but that's okay:
11263 * - if we have it locally, we hit the case above, and go from there.
11264 * - if we don't, we always pass through this case during recovery and set up the location
11266 * - this way we don't need to mangle the missing code to be general about needing an old
11269 eversion_t alternate_need
= latest
->reverting_to
;
11270 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
11272 for (map
<pg_shard_t
, pg_missing_t
>::iterator p
= peer_missing
.begin();
11273 p
!= peer_missing
.end();
11275 if (p
->second
.is_missing(soid
, need
) &&
11276 p
->second
.get_items().at(soid
).have
== alternate_need
) {
11277 missing_loc
.add_location(soid
, p
->first
);
11279 dout(10) << " will pull " << alternate_need
<< " or " << need
11280 << " from one of " << missing_loc
.get_locations(soid
)
11288 if (!recovering
.count(soid
)) {
11289 if (recovering
.count(head
)) {
11292 int r
= recover_missing(
11293 soid
, need
, get_recovery_op_priority(), h
);
11306 if (started
>= max
)
11311 // only advance last_requested if we haven't skipped anything
11313 pg_log
.set_last_requested(v
);
11316 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11320 bool PrimaryLogPG::primary_error(
11321 const hobject_t
& soid
, eversion_t v
)
11323 pg_log
.missing_add(soid
, v
, eversion_t());
11324 pg_log
.set_last_requested(0);
11325 missing_loc
.remove_location(soid
, pg_whoami
);
11327 assert(!actingbackfill
.empty());
11328 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
11329 i
!= actingbackfill
.end();
11331 if (*i
== get_primary()) continue;
11332 pg_shard_t peer
= *i
;
11333 if (!peer_missing
[peer
].is_missing(soid
, v
)) {
11334 missing_loc
.add_location(soid
, peer
);
11335 dout(10) << info
.pgid
<< " unexpectedly missing " << soid
<< " v" << v
11336 << ", there should be a copy on shard " << peer
<< dendl
;
11341 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
<< ", unfound";
11343 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
11344 << ", will try copies on " << missing_loc
.get_locations(soid
);
11348 int PrimaryLogPG::prep_object_replica_pushes(
11349 const hobject_t
& soid
, eversion_t v
,
11350 PGBackend::RecoveryHandle
*h
)
11352 assert(is_primary());
11353 dout(10) << __func__
<< ": on " << soid
<< dendl
;
11355 // NOTE: we know we will get a valid oloc off of disk here.
11356 ObjectContextRef obc
= get_object_context(soid
, false);
11358 primary_error(soid
, v
);
11362 if (!obc
->get_recovery_read()) {
11363 dout(20) << "recovery delayed on " << soid
11364 << "; could not get rw_manager lock" << dendl
;
11367 dout(20) << "recovery got recovery read lock on " << soid
11371 start_recovery_op(soid
);
11372 assert(!recovering
.count(soid
));
11373 recovering
.insert(make_pair(soid
, obc
));
11375 /* We need this in case there is an in progress write on the object. In fact,
11376 * the only possible write is an update to the xattr due to a lost_revert --
11377 * a client write would be blocked since the object is degraded.
11378 * In almost all cases, therefore, this lock should be uncontended.
11380 obc
->ondisk_read_lock();
11381 int r
= pgbackend
->recover_object(
11384 ObjectContextRef(),
11385 obc
, // has snapset context
11387 obc
->ondisk_read_unlock();
11389 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
11390 primary_failed(soid
);
11391 primary_error(soid
, v
);
11397 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
)
11399 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
11400 uint64_t started
= 0;
11402 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11404 // this is FAR from an optimal recovery order. pretty lame, really.
11405 assert(!actingbackfill
.empty());
11406 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
11407 i
!= actingbackfill
.end();
11409 if (*i
== get_primary()) continue;
11410 pg_shard_t peer
= *i
;
11411 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
11412 assert(pm
!= peer_missing
.end());
11413 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
11414 assert(pi
!= peer_info
.end());
11415 size_t m_sz
= pm
->second
.num_missing();
11417 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
11418 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
11421 const pg_missing_t
&m(pm
->second
);
11422 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
11423 p
!= m
.get_rmissing().end() && started
< max
;
11425 handle
.reset_tp_timeout();
11426 const hobject_t
soid(p
->second
);
11428 if (missing_loc
.is_unfound(soid
)) {
11429 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
11433 if (soid
> pi
->second
.last_backfill
) {
11434 if (!recovering
.count(soid
)) {
11435 derr
<< __func__
<< ": object " << soid
<< " last_backfill " << pi
->second
.last_backfill
<< dendl
;
11436 derr
<< __func__
<< ": object added to missing set for backfill, but "
11437 << "is not in recovering, error!" << dendl
;
11443 if (recovering
.count(soid
)) {
11444 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
11448 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_head())) {
11449 dout(10) << __func__
<< ": " << soid
.get_head()
11450 << " still missing on primary" << dendl
;
11454 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_snapdir())) {
11455 dout(10) << __func__
<< ": " << soid
.get_snapdir()
11456 << " still missing on primary" << dendl
;
11460 if (pg_log
.get_missing().is_missing(soid
)) {
11461 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
11465 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
11466 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
11467 started
+= prep_object_replica_pushes(soid
, r
->second
.need
,
11472 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11476 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
11478 hobject_t e
= hobject_t::get_max();
11479 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
11480 i
!= backfill_targets
.end();
11482 pg_shard_t peer
= *i
;
11483 map
<pg_shard_t
, BackfillInterval
>::const_iterator iter
=
11484 peer_backfill_info
.find(peer
);
11485 assert(iter
!= peer_backfill_info
.end());
11486 if (iter
->second
.begin
< e
)
11487 e
= iter
->second
.begin
;
11492 bool PrimaryLogPG::all_peer_done() const
11494 // Primary hasn't got any more objects
11495 assert(backfill_info
.empty());
11497 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
11498 i
!= backfill_targets
.end();
11500 pg_shard_t bt
= *i
;
11501 map
<pg_shard_t
, BackfillInterval
>::const_iterator piter
=
11502 peer_backfill_info
.find(bt
);
11503 assert(piter
!= peer_backfill_info
.end());
11504 const BackfillInterval
& pbi
= piter
->second
;
11505 // See if peer has more to process
11506 if (!pbi
.extends_to_end() || !pbi
.empty())
11517 * backfilled: fully pushed to replica or present in replica's missing set (both
11518 * our copy and theirs).
11520 * All objects on a backfill_target in
11521 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11522 * objects have been actually deleted and all logically-valid objects are replicated.
11523 * There may be PG objects in this interval yet to be backfilled.
11525 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11526 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11528 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11529 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11530 * interval remain on the backfill target.
11532 * For a backfill target, all objects <= peer_info[target].last_backfill
11533 * have been backfilled to target
11535 * There *MAY* be missing/outdated objects between last_backfill_started and
11536 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11537 * io created objects since the last scan. For this reason, we call
11538 * update_range() again before continuing backfill.
11540 uint64_t PrimaryLogPG::recover_backfill(
11542 ThreadPool::TPHandle
&handle
, bool *work_started
)
11544 dout(10) << "recover_backfill (" << max
<< ")"
11545 << " bft=" << backfill_targets
11546 << " last_backfill_started " << last_backfill_started
11547 << (new_backfill
? " new_backfill":"")
11549 assert(!backfill_targets
.empty());
11551 // Initialize from prior backfill state
11552 if (new_backfill
) {
11553 // on_activate() was called prior to getting here
11554 assert(last_backfill_started
== earliest_backfill());
11555 new_backfill
= false;
11557 // initialize BackfillIntervals
11558 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11559 i
!= backfill_targets
.end();
11561 peer_backfill_info
[*i
].reset(peer_info
[*i
].last_backfill
);
11563 backfill_info
.reset(last_backfill_started
);
11565 backfills_in_flight
.clear();
11566 pending_backfill_updates
.clear();
11569 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11570 i
!= backfill_targets
.end();
11572 dout(10) << "peer osd." << *i
11573 << " info " << peer_info
[*i
]
11574 << " interval " << peer_backfill_info
[*i
].begin
11575 << "-" << peer_backfill_info
[*i
].end
11576 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
11580 // update our local interval to cope with recent changes
11581 backfill_info
.begin
= last_backfill_started
;
11582 update_range(&backfill_info
, handle
);
11585 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
11586 set
<hobject_t
> add_to_stat
;
11588 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11589 i
!= backfill_targets
.end();
11591 peer_backfill_info
[*i
].trim_to(
11592 std::max(peer_info
[*i
].last_backfill
, last_backfill_started
));
11594 backfill_info
.trim_to(last_backfill_started
);
11596 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11597 while (ops
< max
) {
11598 if (backfill_info
.begin
<= earliest_peer_backfill() &&
11599 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
11600 hobject_t next
= backfill_info
.end
;
11601 backfill_info
.reset(next
);
11602 backfill_info
.end
= hobject_t::get_max();
11603 update_range(&backfill_info
, handle
);
11604 backfill_info
.trim();
11607 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
11609 bool sent_scan
= false;
11610 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11611 i
!= backfill_targets
.end();
11613 pg_shard_t bt
= *i
;
11614 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11616 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
11617 if (pbi
.begin
<= backfill_info
.begin
&&
11618 !pbi
.extends_to_end() && pbi
.empty()) {
11619 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
11620 epoch_t e
= get_osdmap()->get_epoch();
11621 MOSDPGScan
*m
= new MOSDPGScan(
11622 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, last_peering_reset
,
11623 spg_t(info
.pgid
.pgid
, bt
.shard
),
11624 pbi
.end
, hobject_t());
11625 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap()->get_epoch());
11626 assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
11627 waiting_on_backfill
.insert(bt
);
11632 // Count simultaneous scans as a single op and let those complete
11635 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11639 if (backfill_info
.empty() && all_peer_done()) {
11640 dout(10) << " reached end for both local and all peers" << dendl
;
11644 // Get object within set of peers to operate on and
11645 // the set of targets for which that object applies.
11646 hobject_t check
= earliest_peer_backfill();
11648 if (check
< backfill_info
.begin
) {
11650 set
<pg_shard_t
> check_targets
;
11651 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11652 i
!= backfill_targets
.end();
11654 pg_shard_t bt
= *i
;
11655 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11656 if (pbi
.begin
== check
)
11657 check_targets
.insert(bt
);
11659 assert(!check_targets
.empty());
11661 dout(20) << " BACKFILL removing " << check
11662 << " from peers " << check_targets
<< dendl
;
11663 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
11664 i
!= check_targets
.end();
11666 pg_shard_t bt
= *i
;
11667 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11668 assert(pbi
.begin
== check
);
11670 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
11674 /* This requires a bit of explanation. We compare head against
11675 * last_backfill to determine whether to send an operation
11676 * to the replica. A single write operation can touch up to three
11677 * objects: head, the snapdir, and a new clone which sorts closer to
11678 * head than any existing clone. If last_backfill points at a clone,
11679 * the transaction won't be sent and all 3 must lie on the right side
11680 * of the line (i.e., we'll backfill them later). If last_backfill
11681 * points at snapdir, it sorts greater than head, so we send the
11682 * transaction which is correct because all three must lie to the left
11685 * If it points at head, we have a bit of an issue. If head actually
11686 * exists, no problem, because any transaction which touches snapdir
11687 * must end up creating it (and deleting head), so sending the
11688 * operation won't pose a problem -- we'll end up having to scan it,
11689 * but it'll end up being the right version so we won't bother to
11690 * rebackfill it. However, if head doesn't exist, any write on head
11691 * will remove snapdir. For a replicated pool, this isn't a problem,
11692 * ENOENT on remove isn't an issue and it's in backfill future anyway.
11693 * It only poses a problem for EC pools, because we never just delete
11694 * an object, we rename it into a rollback object. That operation
11695 * will end up crashing the osd with ENOENT. Tolerating the failure
11696 * wouldn't work either, even if snapdir exists, we'd be creating a
11697 * rollback object past the last_backfill line which wouldn't get
11698 * cleaned up (no rollback objects past the last_backfill line is an
11699 * existing important invariant). Thus, let's avoid the whole issue
11700 * by just not updating last_backfill_started here if head doesn't
11701 * exist and snapdir does. We aren't using up a recovery count here,
11702 * so we're going to recover snapdir immediately anyway. We'll only
11703 * fail "backward" if we fail to get the rw lock and that just means
11704 * we'll re-process this section of the hash space again.
11706 * I'm choosing this hack here because the really "correct" answer is
11707 * going to be to unify snapdir and head into a single object (a
11708 * snapdir is really just a confusing way to talk about head existing
11709 * as a whiteout), but doing that is going to be a somewhat larger
11712 * @see http://tracker.ceph.com/issues/17668
11714 if (!(check
.is_head() &&
11715 backfill_info
.begin
.is_snapdir() &&
11716 check
== backfill_info
.begin
.get_head()))
11717 last_backfill_started
= check
;
11719 // Don't increment ops here because deletions
11720 // are cheap and not replied to unlike real recovery_ops,
11721 // and we can't increment ops without requeueing ourself
11724 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
11726 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
11727 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11728 i
!= backfill_targets
.end();
11730 pg_shard_t bt
= *i
;
11731 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11732 // Find all check peers that have the wrong version
11733 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
11734 if (pbi
.objects
.begin()->second
!= obj_v
) {
11735 need_ver_targs
.push_back(bt
);
11737 keep_ver_targs
.push_back(bt
);
11740 pg_info_t
& pinfo
= peer_info
[bt
];
11742 // Only include peers that we've caught up to their backfill line
11743 // otherwise, they only appear to be missing this object
11744 // because their pbi.begin > backfill_info.begin.
11745 if (backfill_info
.begin
> pinfo
.last_backfill
)
11746 missing_targs
.push_back(bt
);
11748 skip_targs
.push_back(bt
);
11752 if (!keep_ver_targs
.empty()) {
11753 // These peers have version obj_v
11754 dout(20) << " BACKFILL keeping " << check
11755 << " with ver " << obj_v
11756 << " on peers " << keep_ver_targs
<< dendl
;
11757 //assert(!waiting_for_degraded_object.count(check));
11759 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
11760 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
11762 if (obc
->get_recovery_read()) {
11763 if (!need_ver_targs
.empty()) {
11764 dout(20) << " BACKFILL replacing " << check
11765 << " with ver " << obj_v
11766 << " to peers " << need_ver_targs
<< dendl
;
11768 if (!missing_targs
.empty()) {
11769 dout(20) << " BACKFILL pushing " << backfill_info
.begin
11770 << " with ver " << obj_v
11771 << " to peers " << missing_targs
<< dendl
;
11773 vector
<pg_shard_t
> all_push
= need_ver_targs
;
11774 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
11776 handle
.reset_tp_timeout();
11777 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
11779 *work_started
= true;
11780 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
11785 *work_started
= true;
11786 dout(20) << "backfill blocking on " << backfill_info
.begin
11787 << "; could not get rw_manager lock" << dendl
;
11791 dout(20) << "need_ver_targs=" << need_ver_targs
11792 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
11793 dout(20) << "backfill_targets=" << backfill_targets
11794 << " missing_targs=" << missing_targs
11795 << " skip_targs=" << skip_targs
<< dendl
;
11797 last_backfill_started
= backfill_info
.begin
;
11798 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
11799 backfill_info
.pop_front();
11800 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
11801 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
11802 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
11803 i
!= check_targets
.end();
11805 pg_shard_t bt
= *i
;
11806 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11812 hobject_t backfill_pos
=
11813 std::min(backfill_info
.begin
, earliest_peer_backfill());
11815 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
11816 i
!= add_to_stat
.end();
11818 ObjectContextRef obc
= get_object_context(*i
, false);
11821 add_object_context_to_pg_stat(obc
, &stat
);
11822 pending_backfill_updates
[*i
] = stat
;
11824 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS
)) {
11825 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
11826 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
11827 handle
.reset_tp_timeout();
11828 const hobject_t
& oid
= to_remove
[i
].get
<0>();
11829 eversion_t v
= to_remove
[i
].get
<1>();
11830 pg_shard_t peer
= to_remove
[i
].get
<2>();
11831 MOSDPGBackfillRemove
*m
;
11832 auto it
= reqs
.find(peer
);
11833 if (it
!= reqs
.end()) {
11836 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
11837 spg_t(info
.pgid
.pgid
, peer
.shard
),
11838 get_osdmap()->get_epoch());
11840 m
->ls
.push_back(make_pair(oid
, v
));
11842 if (oid
<= last_backfill_started
)
11843 pending_backfill_updates
[oid
]; // add empty stat!
11845 for (auto p
: reqs
) {
11846 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
11847 get_osdmap()->get_epoch());
11850 // for jewel targets
11851 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
11852 handle
.reset_tp_timeout();
11854 // ordered before any subsequent updates
11855 send_remove_op(to_remove
[i
].get
<0>(), to_remove
[i
].get
<1>(),
11856 to_remove
[i
].get
<2>());
11858 if (to_remove
[i
].get
<0>() <= last_backfill_started
)
11859 pending_backfill_updates
[to_remove
[i
].get
<0>()]; // add empty stat!
11863 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11865 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
11866 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
11867 i
!= backfills_in_flight
.end();
11869 dout(20) << *i
<< " is still in flight" << dendl
;
11872 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
11873 backfill_pos
: *(backfills_in_flight
.begin());
11874 hobject_t new_last_backfill
= earliest_backfill();
11875 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
11876 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
11877 pending_backfill_updates
.begin();
11878 i
!= pending_backfill_updates
.end() &&
11879 i
->first
< next_backfill_to_complete
;
11880 pending_backfill_updates
.erase(i
++)) {
11881 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
11882 assert(i
->first
> new_last_backfill
);
11883 for (set
<pg_shard_t
>::iterator j
= backfill_targets
.begin();
11884 j
!= backfill_targets
.end();
11886 pg_shard_t bt
= *j
;
11887 pg_info_t
& pinfo
= peer_info
[bt
];
11888 //Add stats to all peers that were missing object
11889 if (i
->first
> pinfo
.last_backfill
)
11890 pinfo
.stats
.add(i
->second
);
11892 new_last_backfill
= i
->first
;
11894 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
11896 assert(!pending_backfill_updates
.empty() ||
11897 new_last_backfill
== last_backfill_started
);
11898 if (pending_backfill_updates
.empty() &&
11899 backfill_pos
.is_max()) {
11900 assert(backfills_in_flight
.empty());
11901 new_last_backfill
= backfill_pos
;
11902 last_backfill_started
= backfill_pos
;
11904 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
11906 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
11907 // all the backfill targets. Otherwise, we will move last_backfill up on
11908 // those targets need it and send OP_BACKFILL_PROGRESS to them.
11909 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11910 i
!= backfill_targets
.end();
11912 pg_shard_t bt
= *i
;
11913 pg_info_t
& pinfo
= peer_info
[bt
];
11915 if (new_last_backfill
> pinfo
.last_backfill
) {
11916 pinfo
.set_last_backfill(new_last_backfill
);
11917 epoch_t e
= get_osdmap()->get_epoch();
11918 MOSDPGBackfill
*m
= NULL
;
11919 if (pinfo
.last_backfill
.is_max()) {
11920 m
= new MOSDPGBackfill(
11921 MOSDPGBackfill::OP_BACKFILL_FINISH
,
11923 last_peering_reset
,
11924 spg_t(info
.pgid
.pgid
, bt
.shard
));
11925 // Use default priority here, must match sub_op priority
11926 /* pinfo.stats might be wrong if we did log-based recovery on the
11927 * backfilled portion in addition to continuing backfill.
11929 pinfo
.stats
= info
.stats
;
11930 start_recovery_op(hobject_t::get_max());
11932 m
= new MOSDPGBackfill(
11933 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
11935 last_peering_reset
,
11936 spg_t(info
.pgid
.pgid
, bt
.shard
));
11937 // Use default priority here, must match sub_op priority
11939 m
->last_backfill
= pinfo
.last_backfill
;
11940 m
->stats
= pinfo
.stats
;
11941 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap()->get_epoch());
11942 dout(10) << " peer " << bt
11943 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
11944 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
11949 *work_started
= true;
// Queue a backfill push of object `oid` at version `v` to every shard in
// `peers`: register the object as in-flight/recovering, then hand the actual
// read-and-send work to the PGBackend via recovery handle `h`.
// NOTE(review): this extraction is missing several original lines
// (e.g. 11958, 11967-11968, 11977-11981, 11983, 11989-11992) — braces, the
// full recover_object() argument list, and the guard around the error path
// are not visible here; consult the complete file before editing.
11953 int PrimaryLogPG::prep_backfill_object_push(
11954 hobject_t oid
, eversion_t v
,
11955 ObjectContextRef obc
,
11956 vector
<pg_shard_t
> peers
,
11957 PGBackend::RecoveryHandle
*h
)
11959 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
11960 assert(!peers
.empty());
// Track the object so concurrent ops know a backfill push is in flight.
11962 backfills_in_flight
.insert(oid
);
// Mark the object missing on every target shard so the backend will push it.
11963 for (unsigned int i
= 0 ; i
< peers
.size(); ++i
) {
11964 map
<pg_shard_t
, pg_missing_t
>::iterator bpm
= peer_missing
.find(peers
[i
]);
11965 assert(bpm
!= peer_missing
.end());
11966 bpm
->second
.add(oid
, eversion_t(), eversion_t());
// Must not already be in a recovery op for this object.
11969 assert(!recovering
.count(oid
));
11971 start_recovery_op(oid
);
11972 recovering
.insert(make_pair(oid
, obc
));
11974 // We need to take the read_lock here in order to flush in-progress writes
11975 obc
->ondisk_read_lock();
11976 int r
= pgbackend
->recover_object(
11979 ObjectContextRef(),
11982 obc
->ondisk_read_unlock();
// Error path (the `if (r < 0)` guard is elided by the extraction — confirm):
// log the failure, record it against the primary, drop the in-flight marker,
// and note the object as missing so it can be found elsewhere.
11984 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
11985 primary_failed(oid
);
11986 primary_error(oid
, v
);
11987 backfills_in_flight
.erase(oid
);
11988 missing_loc
.add_missing(oid
, v
, eversion_t());
11993 void PrimaryLogPG::update_range(
11994 BackfillInterval
*bi
,
11995 ThreadPool::TPHandle
&handle
)
11997 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
11998 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
12000 if (bi
->version
< info
.log_tail
) {
12001 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
12003 if (last_update_applied
>= info
.log_tail
) {
12004 bi
->version
= last_update_applied
;
12007 bi
->version
= info
.last_update
;
12009 scan_range(local_min
, local_max
, bi
, handle
);
12012 if (bi
->version
>= projected_last_update
) {
12013 dout(10) << __func__
<< ": bi is current " << dendl
;
12014 assert(bi
->version
== projected_last_update
);
12015 } else if (bi
->version
>= info
.log_tail
) {
12016 if (pg_log
.get_log().empty() && projected_log
.empty()) {
12017 /* Because we don't move log_tail on split, the log might be
12018 * empty even if log_tail != last_update. However, the only
12019 * way to get here with an empty log is if log_tail is actually
12020 * eversion_t(), because otherwise the entry which changed
12021 * last_update since the last scan would have to be present.
12023 assert(bi
->version
== eversion_t());
12027 dout(10) << __func__
<< ": bi is old, (" << bi
->version
12028 << ") can be updated with log to projected_last_update "
12029 << projected_last_update
<< dendl
;
12031 auto func
= [&](const pg_log_entry_t
&e
) {
12032 dout(10) << __func__
<< ": updating from version " << e
.version
12034 const hobject_t
&soid
= e
.soid
;
12035 if (soid
>= bi
->begin
&&
12037 if (e
.is_update()) {
12038 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
12039 << e
.version
<< dendl
;
12040 bi
->objects
.erase(e
.soid
);
12041 bi
->objects
.insert(
12045 } else if (e
.is_delete()) {
12046 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
12047 bi
->objects
.erase(e
.soid
);
12051 dout(10) << "scanning pg log first" << dendl
;
12052 pg_log
.get_log().scan_log_after(bi
->version
, func
);
12053 dout(10) << "scanning projected log" << dendl
;
12054 projected_log
.scan_log_after(bi
->version
, func
);
12055 bi
->version
= projected_last_update
;
12057 assert(0 == "scan_range should have raised bi->version past log_tail");
12061 void PrimaryLogPG::scan_range(
12062 int min
, int max
, BackfillInterval
*bi
,
12063 ThreadPool::TPHandle
&handle
)
12065 assert(is_locked());
12066 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
12067 bi
->clear_objects();
12069 vector
<hobject_t
> ls
;
12071 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
12073 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
12074 dout(20) << ls
<< dendl
;
12076 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
12077 handle
.reset_tp_timeout();
12078 ObjectContextRef obc
;
12080 obc
= object_contexts
.lookup(*p
);
12082 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
12083 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
12086 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
12088 /* If the object does not exist here, it must have been removed
12089 * between the collection_list_partial and here. This can happen
12090 * for the first item in the range, which is usually last_backfill.
12096 object_info_t
oi(bl
);
12097 bi
->objects
[*p
] = oi
.version
;
12098 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
12106 * verifies that stray objects have been deleted
// Debug-only sanity check (gated by osd_debug_verify_stray_on_activate):
// walk the PG log newest-to-oldest and verify that every object whose most
// recent entry is a delete really is absent from the local object store.
// NOTE(review): braces, the early `return`, and the stat() output argument
// line are elided by this extraction (original lines 12109, 12115-12116,
// 12121, 12123, 12125, 12133, 12138-12146 missing).
12108 void PrimaryLogPG::check_local()
12110 dout(10) << __func__
<< dendl
;
12112 assert(info
.last_update
>= pg_log
.get_tail()); // otherwise we need some help!
12114 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
12117 // just scan the log.
// `did` records soids already examined so only the newest entry per object
// is considered (we iterate the log in reverse).
12118 set
<hobject_t
> did
;
12119 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
12120 p
!= pg_log
.get_log().log
.rend();
12122 if (did
.count(p
->soid
))
12124 did
.insert(p
->soid
);
12126 if (p
->is_delete()) {
12127 dout(10) << " checking " << p
->soid
12128 << " at " << p
->version
<< dendl
;
// stat() the object in the local store; a deleted object must be ENOENT.
12130 int r
= osd
->store
->stat(
12132 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
12134 if (r
!= -ENOENT
) {
12135 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
12136 << "deleted" << dendl
;
12137 assert(0 == "erroneously present object");
12140 // ignore old(+missing) objects
12147 // ===========================
// Build the hobject_t name for the *current* (in-progress) hit-set object:
// "hit_set_<pgid>_current_<stamp>" placed in the pool's configured hit-set
// namespace (osd_hit_set_namespace) with this PG's ps()/pool().
// NOTE(review): the `ostringstream ss;` declaration and the `return hoid;`
// lines (original 12151-12152, 12158-12159) are elided by this extraction.
12150 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
12153 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
12154 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
12155 info
.pgid
.ps(), info
.pgid
.pool(),
12156 cct
->_conf
->osd_hit_set_namespace
);
12157 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Build the hobject_t name for an *archived* hit-set covering an interval:
// "hit_set_<pgid>_archive_<start>_<end>", formatted in GMT or local time.
// Callers elsewhere in this file pass (p->begin, p->end, p->using_gmt), so
// the elided signature tail (original 12162-12165) carries the interval end
// and the gmt flag — confirm against the full file.
// NOTE(review): the `if (using_gmt)` branch heads and the lines appending
// the interval *end* time (original 12167, 12169-12170, 12172-12173) are
// elided; only the `start` formatting survives in this extraction.
12161 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
12166 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
12168 start
.gmtime(ss
) << "_";
12171 start
.localtime(ss
) << "_";
12174 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
12175 info
.pgid
.ps(), info
.pgid
.pool(),
12176 cct
->_conf
->osd_hit_set_namespace
);
12177 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Reset in-memory hit-set tracking state; the start stamp is cleared so a
// fresh set begins timing from scratch.
// NOTE(review): original line 12184 is elided here — presumably it releases
// the in-memory `hit_set` itself; confirm against the full file.
12181 void PrimaryLogPG::hit_set_clear()
12183 dout(20) << __func__
<< dendl
;
12185 hit_set_start_stamp
= utime_t();
// (Re)initialize hit-set tracking based on the pool's settings. If tracking
// is disabled (no count, no period, or type NONE) the primary removes all
// existing hit-set objects and bails; otherwise a fresh in-memory set is
// built and seeded from the PG log.
// NOTE(review): the body of the first `if (!is_active() || ...)` guard, the
// early returns, and the call that creates the new in-memory set (original
// lines 12191-12195, 12200-12201, 12204-12209) are elided by this
// extraction — confirm against the full file.
12188 void PrimaryLogPG::hit_set_setup()
12190 if (!is_active() ||
12196 if (is_active() && is_primary() &&
12197 (!pool
.info
.hit_set_count
||
12198 !pool
.info
.hit_set_period
||
12199 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
12202 // only primary is allowed to remove all the hit set objects
12203 hit_set_remove_all();
12207 // FIXME: discard any previous data for now
12210 // include any writes we know about from the pg log. this doesn't
12211 // capture reads, but it is better than nothing!
12212 hit_set_apply_log();
12215 void PrimaryLogPG::hit_set_remove_all()
12217 // If any archives are degraded we skip this
12218 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12219 p
!= info
.hit_set
.history
.end();
12221 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12223 // Once we hit a degraded object just skip
12224 if (is_degraded_or_backfilling_object(aoid
))
12226 if (scrubber
.write_blocked_by_scrub(aoid
))
12230 if (!info
.hit_set
.history
.empty()) {
12231 list
<pg_hit_set_info_t
>::reverse_iterator p
= info
.hit_set
.history
.rbegin();
12232 assert(p
!= info
.hit_set
.history
.rend());
12233 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12234 assert(!is_degraded_or_backfilling_object(oid
));
12235 ObjectContextRef obc
= get_object_context(oid
, false);
12238 OpContextUPtr ctx
= simple_opc_create(obc
);
12239 ctx
->at_version
= get_next_version();
12240 ctx
->updated_hset_history
= info
.hit_set
;
12241 utime_t now
= ceph_clock_now();
12243 hit_set_trim(ctx
, 0);
12244 simple_opc_submit(std::move(ctx
));
12247 info
.hit_set
= pg_hit_set_history_t();
12249 agent_state
->discard_hit_sets();
12253 void PrimaryLogPG::hit_set_create()
12255 utime_t now
= ceph_clock_now();
12256 // make a copy of the params to modify
12257 HitSet::Params
params(pool
.info
.hit_set_params
);
12259 dout(20) << __func__
<< " " << params
<< dendl
;
12260 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
12261 BloomHitSet::Params
*p
=
12262 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
12264 // convert false positive rate so it holds up across the full period
12265 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
12266 if (p
->get_fpp() <= 0.0)
12267 p
->set_fpp(.01); // fpp cannot be zero!
12269 // if we don't have specified size, estimate target size based on the
12271 if (p
->target_size
== 0 && hit_set
) {
12272 utime_t dur
= now
- hit_set_start_stamp
;
12273 unsigned unique
= hit_set
->approx_unique_insert_count();
12274 dout(20) << __func__
<< " previous set had approx " << unique
12275 << " unique items over " << dur
<< " seconds" << dendl
;
12276 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
12279 if (p
->target_size
<
12280 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
12281 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
12284 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
12285 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
12287 p
->seed
= now
.sec();
12289 dout(10) << __func__
<< " target_size " << p
->target_size
12290 << " fpp " << p
->get_fpp() << dendl
;
12292 hit_set
.reset(new HitSet(params
));
12293 hit_set_start_stamp
= now
;
12297 * apply log entries to set
12299 * this would only happen after peering, to at least capture writes
12300 * during an interval that was potentially lost.
// Seed the in-memory hit_set from the PG log: replay entries with versions
// in (from, to], where `from` is the last update already captured by the
// current hit set and `to` is the PG's last_update. Iterating the log in
// reverse, the first loop skips entries newer than `to`, the second inserts
// each entry's soid until we pass `from`.
// NOTE(review): the guard that returns early when there is nothing to apply
// (around original line 12310), loop increments, and the final return
// (original 12303-12306, 12309, 12311-12313, 12317, 12320-12324) are elided
// by this extraction.
12302 bool PrimaryLogPG::hit_set_apply_log()
12307 eversion_t to
= info
.last_update
;
12308 eversion_t from
= info
.hit_set
.current_last_update
;
12310 dout(20) << __func__
<< " no update" << dendl
;
12314 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
12315 list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
12316 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> to
)
12318 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> from
) {
12319 hit_set
->insert(p
->soid
);
12326 void PrimaryLogPG::hit_set_persist()
12328 dout(10) << __func__
<< dendl
;
12330 unsigned max
= pool
.info
.hit_set_count
;
12332 utime_t now
= ceph_clock_now();
12335 // If any archives are degraded we skip this persist request
12336 // account for the additional entry being added below
12337 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12338 p
!= info
.hit_set
.history
.end();
12340 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12342 // Once we hit a degraded object just skip further trim
12343 if (is_degraded_or_backfilling_object(aoid
))
12345 if (scrubber
.write_blocked_by_scrub(aoid
))
12349 // If backfill is in progress and we could possibly overlap with the
12350 // hit_set_* objects, back off. Since these all have
12351 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12352 // look just at that. This is necessary because our transactions
12353 // may include a modify of the new hit_set *and* a delete of the
12354 // old one, and this may span the backfill boundary.
12355 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
12356 p
!= backfill_targets
.end();
12358 assert(peer_info
.count(*p
));
12359 const pg_info_t
& pi
= peer_info
[*p
];
12360 if (pi
.last_backfill
== hobject_t() ||
12361 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
12362 dout(10) << __func__
<< " backfill target osd." << *p
12363 << " last_backfill has not progressed past pgid ps"
12370 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
12371 new_hset
.begin
= hit_set_start_stamp
;
12372 new_hset
.end
= now
;
12373 oid
= get_hit_set_archive_object(
12376 new_hset
.using_gmt
);
12378 // If the current object is degraded we skip this persist request
12379 if (scrubber
.write_blocked_by_scrub(oid
))
12383 ::encode(*hit_set
, bl
);
12384 dout(20) << __func__
<< " archive " << oid
<< dendl
;
12387 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
12388 uint32_t size
= agent_state
->hit_set_map
.size();
12389 if (size
>= pool
.info
.hit_set_count
) {
12390 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
12392 hit_set_in_memory_trim(size
);
12395 ObjectContextRef obc
= get_object_context(oid
, true);
12396 OpContextUPtr ctx
= simple_opc_create(obc
);
12398 ctx
->at_version
= get_next_version();
12399 ctx
->updated_hset_history
= info
.hit_set
;
12400 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
12402 updated_hit_set_hist
.current_last_update
= info
.last_update
;
12403 new_hset
.version
= ctx
->at_version
;
12405 updated_hit_set_hist
.history
.push_back(new_hset
);
12408 // fabricate an object_info_t and SnapSet
12409 obc
->obs
.oi
.version
= ctx
->at_version
;
12410 obc
->obs
.oi
.mtime
= now
;
12411 obc
->obs
.oi
.size
= bl
.length();
12412 obc
->obs
.exists
= true;
12413 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
12415 ctx
->new_obs
= obc
->obs
;
12417 obc
->ssc
->snapset
.head_exists
= true;
12418 ctx
->new_snapset
= obc
->ssc
->snapset
;
12420 ctx
->delta_stats
.num_objects
++;
12421 ctx
->delta_stats
.num_objects_hit_set_archive
++;
12422 ctx
->delta_stats
.num_bytes
+= bl
.length();
12423 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
12426 ::encode(ctx
->new_snapset
, bss
);
12427 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
12428 ::encode(ctx
->new_obs
.oi
, boi
,
12429 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
12431 ctx
->op_t
->create(oid
);
12433 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
12435 map
<string
, bufferlist
> attrs
;
12436 attrs
[OI_ATTR
].claim(boi
);
12437 attrs
[SS_ATTR
].claim(bss
);
12438 setattrs_maybe_cache(ctx
->obc
, ctx
.get(), ctx
->op_t
.get(), attrs
);
12439 ctx
->log
.push_back(
12441 pg_log_entry_t::MODIFY
,
12451 hit_set_trim(ctx
, max
);
12453 simple_opc_submit(std::move(ctx
));
12456 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
12458 assert(ctx
->updated_hset_history
);
12459 pg_hit_set_history_t
&updated_hit_set_hist
=
12460 *(ctx
->updated_hset_history
);
12461 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
12462 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
12463 assert(p
!= updated_hit_set_hist
.history
.end());
12464 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12466 assert(!is_degraded_or_backfilling_object(oid
));
12468 dout(20) << __func__
<< " removing " << oid
<< dendl
;
12469 ++ctx
->at_version
.version
;
12470 ctx
->log
.push_back(
12471 pg_log_entry_t(pg_log_entry_t::DELETE
,
12480 ctx
->op_t
->remove(oid
);
12481 updated_hit_set_hist
.history
.pop_front();
12483 ObjectContextRef obc
= get_object_context(oid
, false);
12485 --ctx
->delta_stats
.num_objects
;
12486 --ctx
->delta_stats
.num_objects_hit_set_archive
;
12487 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
12488 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
12492 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
12494 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
12495 agent_state
->remove_oldest_hit_set();
12500 // =======================================
12503 void PrimaryLogPG::agent_setup()
12505 assert(is_locked());
12506 if (!is_active() ||
12508 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
12509 pool
.info
.tier_of
< 0 ||
12510 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
12514 if (!agent_state
) {
12515 agent_state
.reset(new TierAgentState
);
12517 // choose random starting position
12518 agent_state
->position
= hobject_t();
12519 agent_state
->position
.pool
= info
.pgid
.pool();
12520 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
12523 agent_state
->start
= agent_state
->position
;
12525 dout(10) << __func__
<< " allocated new state, position "
12526 << agent_state
->position
<< dendl
;
12528 dout(10) << __func__
<< " keeping existing state" << dendl
;
12531 if (info
.stats
.stats_invalid
) {
12532 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
12535 agent_choose_mode();
// Tear down the cache-tier agent state for this PG.
// NOTE(review): original line 12540 is elided by this extraction — it likely
// stops the agent before the state is released; confirm against the full
// file before editing.
12538 void PrimaryLogPG::agent_clear()
12541 agent_state
.reset(NULL
);
12544 // Return false if no objects operated on since start of object hash space
12545 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
12548 if (!agent_state
) {
12549 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
12556 if (agent_state
->is_idle()) {
12557 dout(10) << __func__
<< " idle, stopping" << dendl
;
12562 osd
->logger
->inc(l_osd_agent_wake
);
12564 dout(10) << __func__
12565 << " max " << start_max
12566 << ", flush " << agent_state
->get_flush_mode_name()
12567 << ", evict " << agent_state
->get_evict_mode_name()
12568 << ", pos " << agent_state
->position
12570 assert(is_primary());
12571 assert(is_active());
12573 agent_load_hit_sets();
12575 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
12579 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
12581 // list some objects. this conveniently lists clones (oldest to
12582 // newest) before heads... the same order we want to flush in.
12584 // NOTE: do not flush the Sequencer. we will assume that the
12585 // listing we get back is imprecise.
12586 vector
<hobject_t
> ls
;
12588 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
12591 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
12593 for (vector
<hobject_t
>::iterator p
= ls
.begin();
12596 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
12597 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
12598 osd
->logger
->inc(l_osd_agent_skip
);
12601 if (is_degraded_or_backfilling_object(*p
)) {
12602 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
12603 osd
->logger
->inc(l_osd_agent_skip
);
12606 if (is_missing_object(p
->get_head())) {
12607 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
12608 osd
->logger
->inc(l_osd_agent_skip
);
12611 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
12613 // we didn't flush; we may miss something here.
12614 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
12615 osd
->logger
->inc(l_osd_agent_skip
);
12618 if (!obc
->obs
.exists
) {
12619 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
12620 osd
->logger
->inc(l_osd_agent_skip
);
12623 if (scrubber
.write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
12624 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
12625 osd
->logger
->inc(l_osd_agent_skip
);
12628 if (obc
->is_blocked()) {
12629 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
12630 osd
->logger
->inc(l_osd_agent_skip
);
12633 if (obc
->is_request_pending()) {
12634 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
12635 osd
->logger
->inc(l_osd_agent_skip
);
12639 // be careful flushing omap to an EC pool.
12640 if (!base_pool
->supports_omap() &&
12641 obc
->obs
.oi
.is_omap()) {
12642 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
12643 osd
->logger
->inc(l_osd_agent_skip
);
12647 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
12648 agent_maybe_evict(obc
, false))
12650 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
12651 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
12653 --agent_flush_quota
;
12655 if (started
>= start_max
) {
12656 // If finishing early, set "next" to the next object
12657 if (++p
!= ls
.end())
12663 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
12664 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
12665 agent_state
->hist_age
= 0;
12666 agent_state
->temp_hist
.decay();
12669 // Total objects operated on so far
12670 int total_started
= agent_state
->started
+ started
;
12671 bool need_delay
= false;
12673 dout(20) << __func__
<< " start pos " << agent_state
->position
12674 << " next start pos " << next
12675 << " started " << total_started
<< dendl
;
12677 // See if we've made a full pass over the object hash space
12678 // This might check at most ls_max objects a second time to notice that
12679 // we've checked every objects at least once.
12680 if (agent_state
->position
< agent_state
->start
&&
12681 next
>= agent_state
->start
) {
12682 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
12683 if (total_started
== 0)
12687 agent_state
->start
= next
;
12689 agent_state
->started
= total_started
;
12691 // See if we are starting from beginning
12693 agent_state
->position
= hobject_t();
12695 agent_state
->position
= next
;
12697 // Discard old in memory HitSets
12698 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
12701 assert(agent_state
->delaying
== false);
12706 agent_choose_mode();
12711 void PrimaryLogPG::agent_load_hit_sets()
12713 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
12717 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
12718 dout(10) << __func__
<< dendl
;
12719 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12720 p
!= info
.hit_set
.history
.end(); ++p
) {
12721 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
12722 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
12723 << p
->end
<< dendl
;
12724 if (!pool
.info
.is_replicated()) {
12725 // FIXME: EC not supported here yet
12726 derr
<< __func__
<< " on non-replicated pool" << dendl
;
12730 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12731 if (is_unreadable_object(oid
)) {
12732 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
12736 ObjectContextRef obc
= get_object_context(oid
, false);
12738 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
12744 obc
->ondisk_read_lock();
12745 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
12747 obc
->ondisk_read_unlock();
12749 HitSetRef
hs(new HitSet
);
12750 bufferlist::iterator pbl
= bl
.begin();
12751 ::decode(*hs
, pbl
);
12752 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
12758 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
12760 if (!obc
->obs
.oi
.is_dirty()) {
12761 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
12762 osd
->logger
->inc(l_osd_agent_skip
);
12765 if (obc
->obs
.oi
.is_cache_pinned()) {
12766 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
12767 osd
->logger
->inc(l_osd_agent_skip
);
12771 utime_t now
= ceph_clock_now();
12772 utime_t ob_local_mtime
;
12773 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
12774 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
12776 ob_local_mtime
= obc
->obs
.oi
.mtime
;
12778 bool evict_mode_full
=
12779 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
12780 if (!evict_mode_full
&&
12781 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
12782 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
12783 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
12784 osd
->logger
->inc(l_osd_agent_skip
);
12788 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
12789 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
12790 osd
->logger
->inc(l_osd_agent_skip
);
12794 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
12796 // FIXME: flush anything dirty, regardless of what distribution of
12799 hobject_t oid
= obc
->obs
.oi
.soid
;
12800 osd
->agent_start_op(oid
);
12801 // no need to capture a pg ref, can't outlive fop or ctx
12802 std::function
<void()> on_flush
= [this, oid
]() {
12803 osd
->agent_finish_op(oid
);
12806 int result
= start_flush(
12807 OpRequestRef(), obc
, false, NULL
,
12809 if (result
!= -EINPROGRESS
) {
12811 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
12812 << " with " << result
<< dendl
;
12813 osd
->logger
->inc(l_osd_agent_skip
);
12817 osd
->logger
->inc(l_osd_agent_flush
);
12821 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
12823 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
12824 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
12825 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
12828 if (!obc
->obs
.oi
.watchers
.empty()) {
12829 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
12832 if (obc
->is_blocked()) {
12833 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
12836 if (obc
->obs
.oi
.is_cache_pinned()) {
12837 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
12841 if (soid
.snap
== CEPH_NOSNAP
) {
12842 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
12844 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
12849 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
12850 // is this object old than cache_min_evict_age?
12851 utime_t now
= ceph_clock_now();
12852 utime_t ob_local_mtime
;
12853 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
12854 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
12856 ob_local_mtime
= obc
->obs
.oi
.mtime
;
12858 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
12859 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
12860 osd
->logger
->inc(l_osd_agent_skip
);
12863 // is this object old and/or cold enough?
12865 uint64_t temp_upper
= 0, temp_lower
= 0;
12867 agent_estimate_temp(soid
, &temp
);
12868 agent_state
->temp_hist
.add(temp
);
12869 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
12871 dout(20) << __func__
12872 << " temp " << temp
12873 << " pos " << temp_lower
<< "-" << temp_upper
12874 << ", evict_effort " << agent_state
->evict_effort
12876 dout(30) << "agent_state:\n";
12877 Formatter
*f
= Formatter::create("");
12878 f
->open_object_section("agent_state");
12879 agent_state
->dump(f
);
12880 f
->close_section();
12885 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
12889 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
12890 OpContextUPtr ctx
= simple_opc_create(obc
);
12892 if (!ctx
->lock_manager
.get_lock_type(
12893 ObjectContext::RWState::RWWRITE
,
12897 close_op_ctx(ctx
.release());
12898 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
12902 osd
->agent_start_evict_op();
12903 ctx
->register_on_finish(
12905 osd
->agent_finish_evict_op();
12908 ctx
->at_version
= get_next_version();
12909 assert(ctx
->new_obs
.exists
);
12910 int r
= _delete_oid(ctx
.get(), true, false);
12911 if (obc
->obs
.oi
.is_omap())
12912 ctx
->delta_stats
.num_objects_omap
--;
12913 ctx
->delta_stats
.num_evict
++;
12914 ctx
->delta_stats
.num_evict_kb
+= SHIFT_ROUND_UP(obc
->obs
.oi
.size
, 10);
12915 if (obc
->obs
.oi
.is_dirty())
12916 --ctx
->delta_stats
.num_objects_dirty
;
12918 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
, false);
12919 simple_opc_submit(std::move(ctx
));
12920 osd
->logger
->inc(l_osd_tier_evict
);
12921 osd
->logger
->inc(l_osd_agent_evict
);
12925 void PrimaryLogPG::agent_stop()
12927 dout(20) << __func__
<< dendl
;
12928 if (agent_state
&& !agent_state
->is_idle()) {
12929 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
12930 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
12931 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
12935 void PrimaryLogPG::agent_delay()
12937 dout(20) << __func__
<< dendl
;
12938 if (agent_state
&& !agent_state
->is_idle()) {
12939 assert(agent_state
->delaying
== false);
12940 agent_state
->delaying
= true;
12941 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
12945 void PrimaryLogPG::agent_choose_mode_restart()
12947 dout(20) << __func__
<< dendl
;
12949 if (agent_state
&& agent_state
->delaying
) {
12950 agent_state
->delaying
= false;
12951 agent_choose_mode(true);
12956 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
12958 bool requeued
= false;
12959 // Let delay play out
12960 if (agent_state
->delaying
) {
12961 dout(20) << __func__
<< this << " delaying, ignored" << dendl
;
12965 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
12966 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
12967 unsigned evict_effort
= 0;
12969 if (info
.stats
.stats_invalid
) {
12970 // idle; stats can't be trusted until we scrub.
12971 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
12976 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
12977 assert(divisor
> 0);
12979 // adjust (effective) user objects down based on the number
12980 // of HitSet objects, which should not count toward our total since
12981 // they cannot be flushed.
12982 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
12984 // also exclude omap objects if ec backing pool
12985 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
12987 if (!base_pool
->supports_omap())
12988 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
12990 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
12991 if (num_user_objects
> unflushable
)
12992 num_user_objects
-= unflushable
;
12994 num_user_objects
= 0;
12996 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
12997 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
12998 num_user_bytes
-= unflushable_bytes
;
12999 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
13000 num_user_bytes
+= num_overhead_bytes
;
13002 // also reduce the num_dirty by num_objects_omap
13003 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
13004 if (!base_pool
->supports_omap()) {
13005 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
13006 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
13011 dout(10) << __func__
13013 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
13015 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
13016 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
13017 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
13018 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
13019 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
13020 << " num_dirty: " << num_dirty
13021 << " num_user_objects: " << num_user_objects
13022 << " num_user_bytes: " << num_user_bytes
13023 << " num_overhead_bytes: " << num_overhead_bytes
13024 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
13025 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
13028 // get dirty, full ratios
13029 uint64_t dirty_micro
= 0;
13030 uint64_t full_micro
= 0;
13031 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
13032 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
13034 num_dirty
* avg_size
* 1000000 /
13035 MAX(pool
.info
.target_max_bytes
/ divisor
, 1);
13037 num_user_objects
* avg_size
* 1000000 /
13038 MAX(pool
.info
.target_max_bytes
/ divisor
, 1);
13040 if (pool
.info
.target_max_objects
> 0) {
13041 uint64_t dirty_objects_micro
=
13042 num_dirty
* 1000000 /
13043 MAX(pool
.info
.target_max_objects
/ divisor
, 1);
13044 if (dirty_objects_micro
> dirty_micro
)
13045 dirty_micro
= dirty_objects_micro
;
13046 uint64_t full_objects_micro
=
13047 num_user_objects
* 1000000 /
13048 MAX(pool
.info
.target_max_objects
/ divisor
, 1);
13049 if (full_objects_micro
> full_micro
)
13050 full_micro
= full_objects_micro
;
13052 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
13053 << " full " << ((float)full_micro
/ 1000000.0)
13057 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
13058 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
13059 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
13060 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
13061 flush_target
+= flush_slop
;
13062 flush_high_target
+= flush_slop
;
13064 flush_target
-= MIN(flush_target
, flush_slop
);
13065 flush_high_target
-= MIN(flush_high_target
, flush_slop
);
13068 if (dirty_micro
> flush_high_target
) {
13069 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
13070 } else if (dirty_micro
> flush_target
) {
13071 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
13075 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
13076 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
13077 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
13078 evict_target
+= evict_slop
;
13080 evict_target
-= MIN(evict_target
, evict_slop
);
13082 if (full_micro
> 1000000) {
13083 // evict anything clean
13084 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
13085 evict_effort
= 1000000;
13086 } else if (full_micro
> evict_target
) {
13087 // set effort in [0..1] range based on where we are between
13088 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
13089 uint64_t over
= full_micro
- evict_target
;
13090 uint64_t span
= 1000000 - evict_target
;
13091 evict_effort
= MAX(over
* 1000000 / span
,
13092 (unsigned)(1000000.0 * cct
->_conf
->osd_agent_min_evict_effort
));
13094 // quantize effort to avoid too much reordering in the agent_queue.
13095 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
13097 uint64_t was
= evict_effort
;
13098 evict_effort
-= evict_effort
% inc
;
13099 if (evict_effort
< inc
)
13100 evict_effort
= inc
;
13101 assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
13102 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
13107 bool old_idle
= agent_state
->is_idle();
13108 if (flush_mode
!= agent_state
->flush_mode
) {
13109 dout(5) << __func__
<< " flush_mode "
13110 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
13112 << TierAgentState::get_flush_mode_name(flush_mode
)
13114 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
13115 osd
->agent_inc_high_count();
13116 info
.stats
.stats
.sum
.num_flush_mode_high
= 1;
13117 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
13118 info
.stats
.stats
.sum
.num_flush_mode_low
= 1;
13120 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
13121 osd
->agent_dec_high_count();
13122 info
.stats
.stats
.sum
.num_flush_mode_high
= 0;
13123 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
13124 info
.stats
.stats
.sum
.num_flush_mode_low
= 0;
13126 agent_state
->flush_mode
= flush_mode
;
13128 if (evict_mode
!= agent_state
->evict_mode
) {
13129 dout(5) << __func__
<< " evict_mode "
13130 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
13132 << TierAgentState::get_evict_mode_name(evict_mode
)
13134 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
13138 requeue_ops(waiting_for_active
);
13139 requeue_ops(waiting_for_scrub
);
13140 requeue_ops(waiting_for_cache_not_full
);
13141 objects_blocked_on_cache_full
.clear();
13144 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
13145 info
.stats
.stats
.sum
.num_evict_mode_some
= 1;
13146 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
13147 info
.stats
.stats
.sum
.num_evict_mode_full
= 1;
13149 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
13150 info
.stats
.stats
.sum
.num_evict_mode_some
= 0;
13151 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
13152 info
.stats
.stats
.sum
.num_evict_mode_full
= 0;
13154 agent_state
->evict_mode
= evict_mode
;
13156 uint64_t old_effort
= agent_state
->evict_effort
;
13157 if (evict_effort
!= agent_state
->evict_effort
) {
13158 dout(5) << __func__
<< " evict_effort "
13159 << ((float)agent_state
->evict_effort
/ 1000000.0)
13161 << ((float)evict_effort
/ 1000000.0)
13163 agent_state
->evict_effort
= evict_effort
;
13166 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13167 // (including flush). This is probably fine (they should be
13168 // correlated) but it is not precisely correct.
13169 if (agent_state
->is_idle()) {
13170 if (!restart
&& !old_idle
) {
13171 osd
->agent_disable_pg(this, old_effort
);
13174 if (restart
|| old_idle
) {
13175 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
13176 } else if (old_effort
!= agent_state
->evict_effort
) {
13177 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
13183 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
13188 if (hit_set
->contains(oid
))
13191 int last_n
= pool
.info
.hit_set_search_last_n
;
13192 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
13193 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
13194 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
13195 if (p
->second
->contains(oid
)) {
13196 *temp
+= pool
.info
.get_grade(i
);
13202 // Dup op detection
13204 bool PrimaryLogPG::already_complete(eversion_t v
)
13206 dout(20) << __func__
<< ": " << v
<< dendl
;
13207 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
13210 dout(20) << __func__
<< ": " << **i
<< dendl
;
13211 // skip copy from temp object ops
13212 if ((*i
)->v
== eversion_t()) {
13213 dout(20) << __func__
<< ": " << **i
13214 << " version is empty" << dendl
;
13218 dout(20) << __func__
<< ": " << **i
13219 << " (*i)->v past v" << dendl
;
13222 if (!(*i
)->all_committed
) {
13223 dout(20) << __func__
<< ": " << **i
13224 << " not committed, returning false"
13229 dout(20) << __func__
<< ": returning true" << dendl
;
13233 bool PrimaryLogPG::already_ack(eversion_t v
)
13235 dout(20) << __func__
<< ": " << v
<< dendl
;
13236 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
13239 // skip copy from temp object ops
13240 if ((*i
)->v
== eversion_t()) {
13241 dout(20) << __func__
<< ": " << **i
13242 << " version is empty" << dendl
;
13246 dout(20) << __func__
<< ": " << **i
13247 << " (*i)->v past v" << dendl
;
13250 if (!(*i
)->all_applied
) {
13251 dout(20) << __func__
<< ": " << **i
13252 << " not applied, returning false"
13257 dout(20) << __func__
<< ": returning true" << dendl
;
13262 // ==========================================================================================
13266 bool PrimaryLogPG::_range_available_for_scrub(
13267 const hobject_t
&begin
, const hobject_t
&end
)
13269 pair
<hobject_t
, ObjectContextRef
> next
;
13270 next
.second
= object_contexts
.lookup(begin
);
13271 next
.first
= begin
;
13273 while (more
&& next
.first
< end
) {
13274 if (next
.second
&& next
.second
->is_blocked()) {
13275 next
.second
->requeue_scrub_on_unblock
= true;
13276 dout(10) << __func__
<< ": scrub delayed, "
13277 << next
.first
<< " is blocked"
13281 more
= object_contexts
.get_next(next
.first
, &next
);
13286 static bool doing_clones(const boost::optional
<SnapSet
> &snapset
,
13287 const vector
<snapid_t
>::reverse_iterator
&curclone
) {
13288 return snapset
&& curclone
!= snapset
.get().clones
.rend();
13291 void PrimaryLogPG::log_missing(unsigned missing
,
13292 const boost::optional
<hobject_t
> &head
,
13293 LogChannelRef clog
,
13297 bool allow_incomplete_clones
)
13300 if (allow_incomplete_clones
) {
13301 dout(20) << func
<< " " << mode
<< " " << pgid
<< " " << head
.get()
13302 << " skipped " << missing
<< " clone(s) in cache tier" << dendl
;
13304 clog
->info() << mode
<< " " << pgid
<< " " << head
.get()
13305 << " " << missing
<< " missing clone(s)";
13309 unsigned PrimaryLogPG::process_clones_to(const boost::optional
<hobject_t
> &head
,
13310 const boost::optional
<SnapSet
> &snapset
,
13311 LogChannelRef clog
,
13314 bool allow_incomplete_clones
,
13315 boost::optional
<snapid_t
> target
,
13316 vector
<snapid_t
>::reverse_iterator
*curclone
,
13317 inconsistent_snapset_wrapper
&e
)
13321 unsigned missing
= 0;
13323 // NOTE: clones are in descending order, thus **curclone > target test here
13324 hobject_t
next_clone(head
.get());
13325 while(doing_clones(snapset
, *curclone
) && (!target
|| **curclone
> *target
)) {
13327 // it is okay to be missing one or more clones in a cache tier.
13328 // skip higher-numbered clones in the list.
13329 if (!allow_incomplete_clones
) {
13330 next_clone
.snap
= **curclone
;
13331 clog
->error() << mode
<< " " << pgid
<< " " << head
.get()
13332 << " expected clone " << next_clone
;
13333 ++scrubber
.shallow_errors
;
13334 e
.set_clone_missing(next_clone
.snap
);
13336 // Clones are descending
13343 * Validate consistency of the object info and snap sets.
13345 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13346 * the comparison of the objects is against multiple snapset.clones. There are
13347 * multiple clone lists and in between lists we expect head or snapdir.
13353 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13354 * obj2 head head/snapdir, head ok
13355 * [SnapSet clones 6 4 2 1]
13356 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13357 * obj2 snap 6 obj2 snap 6, match
13358 * obj2 snap 4 obj2 snap 4, match
13359 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13360 * [Snapset clones 3 1]
13361 * obj3 snap 3 obj3 snap 3 match
13362 * obj3 snap 1 obj3 snap 1 match
13363 * obj4 snapdir head/snapdir, snapdir ok
13364 * [Snapset clones 4]
13365 * EOL obj4 snap 4, (expected)
13367 void PrimaryLogPG::scrub_snapshot_metadata(
13368 ScrubMap
&scrubmap
,
13369 const map
<hobject_t
, pair
<uint32_t, uint32_t>> &missing_digest
)
13371 dout(10) << __func__
<< dendl
;
13373 coll_t
c(info
.pgid
);
13374 bool repair
= state_test(PG_STATE_REPAIR
);
13375 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
13376 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
13377 boost::optional
<snapid_t
> all_clones
; // Unspecified snapid_t or boost::none
13379 /// snapsets to repair
13380 map
<hobject_t
,SnapSet
> snapset_to_repair
;
13382 // traverse in reverse order.
13383 boost::optional
<hobject_t
> head
;
13384 boost::optional
<SnapSet
> snapset
; // If initialized so will head (above)
13385 vector
<snapid_t
>::reverse_iterator curclone
; // Defined only if snapset initialized
13386 unsigned missing
= 0;
13387 inconsistent_snapset_wrapper soid_error
, head_error
;
13389 bufferlist last_data
;
13391 for (map
<hobject_t
,ScrubMap::object
>::reverse_iterator
13392 p
= scrubmap
.objects
.rbegin(); p
!= scrubmap
.objects
.rend(); ++p
) {
13393 const hobject_t
& soid
= p
->first
;
13394 soid_error
= inconsistent_snapset_wrapper
{soid
};
13395 object_stat_sum_t stat
;
13396 boost::optional
<object_info_t
> oi
;
13398 if (!soid
.is_snapdir())
13399 stat
.num_objects
++;
13401 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
13402 stat
.num_objects_hit_set_archive
++;
13404 if (soid
.is_snap()) {
13406 stat
.num_object_clones
++;
13410 if (p
->second
.attrs
.count(OI_ATTR
) == 0) {
13412 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13413 << " no '" << OI_ATTR
<< "' attr";
13414 ++scrubber
.shallow_errors
;
13415 soid_error
.set_oi_attr_missing();
13418 bv
.push_back(p
->second
.attrs
[OI_ATTR
]);
13420 oi
= object_info_t(); // Initialize optional<> before decode into it
13421 oi
.get().decode(bv
);
13422 } catch (buffer::error
& e
) {
13424 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13425 << " can't decode '" << OI_ATTR
<< "' attr " << e
.what();
13426 ++scrubber
.shallow_errors
;
13427 soid_error
.set_oi_attr_corrupted();
13428 soid_error
.set_oi_attr_missing(); // Not available too
13433 if (pgbackend
->be_get_ondisk_size(oi
->size
) != p
->second
.size
) {
13434 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13435 << " on disk size (" << p
->second
.size
13436 << ") does not match object info size ("
13437 << oi
->size
<< ") adjusted for ondisk to ("
13438 << pgbackend
->be_get_ondisk_size(oi
->size
)
13440 soid_error
.set_size_mismatch();
13441 ++scrubber
.shallow_errors
;
13444 dout(20) << mode
<< " " << soid
<< " " << oi
.get() << dendl
;
13446 // A clone num_bytes will be added later when we have snapset
13447 if (!soid
.is_snap()) {
13448 stat
.num_bytes
+= oi
->size
;
13450 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
13451 stat
.num_bytes_hit_set_archive
+= oi
->size
;
13453 if (!soid
.is_snapdir()) {
13454 if (oi
->is_dirty())
13455 ++stat
.num_objects_dirty
;
13456 if (oi
->is_whiteout())
13457 ++stat
.num_whiteouts
;
13459 ++stat
.num_objects_omap
;
13460 if (oi
->is_cache_pinned())
13461 ++stat
.num_objects_pinned
;
13464 // pessimistic assumption that this object might contain a
13466 stat
.num_legacy_snapsets
++;
13469 // Check for any problems while processing clones
13470 if (doing_clones(snapset
, curclone
)) {
13471 boost::optional
<snapid_t
> target
;
13472 // Expecting an object with snap for current head
13473 if (soid
.has_snapset() || soid
.get_head() != head
->get_head()) {
13475 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
<< " new object "
13476 << soid
<< " while processing " << head
.get() << dendl
;
13478 target
= all_clones
;
13480 assert(soid
.is_snap());
13481 target
= soid
.snap
;
13484 // Log any clones we were expecting to be there up to target
13485 // This will set missing, but will be a no-op if snap.soid == *curclone.
13486 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
13487 pool
.info
.allow_incomplete_clones(), target
, &curclone
,
13491 // Check doing_clones() again in case we ran process_clones_to()
13492 if (doing_clones(snapset
, curclone
)) {
13493 // A head/snapdir would have processed all clones above
13494 // or all greater than *curclone.
13495 assert(soid
.is_snap() && *curclone
<= soid
.snap
);
13497 // After processing above clone snap should match the expected curclone
13498 expected
= (*curclone
== soid
.snap
);
13500 // If we aren't doing clones any longer, then expecting head/snapdir
13501 expected
= soid
.has_snapset();
13504 // If we couldn't read the head's snapset, just ignore clones
13505 if (head
&& !snapset
) {
13506 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13507 << " clone ignored due to missing snapset";
13509 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13510 << " is an unexpected clone";
13512 ++scrubber
.shallow_errors
;
13513 soid_error
.set_headless();
13514 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
13515 if (head
&& soid
.get_head() == head
->get_head())
13516 head_error
.set_clone(soid
.snap
);
13521 if (soid
.has_snapset()) {
13524 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
, mode
,
13525 pool
.info
.allow_incomplete_clones());
13528 // Save previous head error information
13529 if (head
&& head_error
.errors
)
13530 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
13531 // Set this as a new head object
13534 head_error
= soid_error
;
13536 dout(20) << __func__
<< " " << mode
<< " new head " << head
<< dendl
;
13538 if (p
->second
.attrs
.count(SS_ATTR
) == 0) {
13539 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13540 << " no '" << SS_ATTR
<< "' attr";
13541 ++scrubber
.shallow_errors
;
13542 snapset
= boost::none
;
13543 head_error
.set_ss_attr_missing();
13546 bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
13547 bufferlist::iterator blp
= bl
.begin();
13549 snapset
= SnapSet(); // Initialize optional<> before decoding into it
13550 ::decode(snapset
.get(), blp
);
13551 } catch (buffer::error
& e
) {
13552 snapset
= boost::none
;
13553 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13554 << " can't decode '" << SS_ATTR
<< "' attr " << e
.what();
13555 ++scrubber
.shallow_errors
;
13556 head_error
.set_ss_attr_corrupted();
13561 // what will be next?
13562 curclone
= snapset
->clones
.rbegin();
13564 if (!snapset
->clones
.empty()) {
13565 dout(20) << " snapset " << snapset
.get() << dendl
;
13566 if (snapset
->seq
== 0) {
13567 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13568 << " snaps.seq not set";
13569 ++scrubber
.shallow_errors
;
13570 head_error
.set_snapset_mismatch();
13574 if (soid
.is_head() && !snapset
->head_exists
) {
13575 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13576 << " snapset.head_exists=false, but head exists";
13577 ++scrubber
.shallow_errors
;
13578 head_error
.set_head_mismatch();
13580 if (soid
.is_snapdir() && snapset
->head_exists
) {
13581 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13582 << " snapset.head_exists=true, but snapdir exists";
13583 ++scrubber
.shallow_errors
;
13584 head_error
.set_head_mismatch();
13587 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
13588 if (soid
.is_snapdir()) {
13589 dout(10) << " will move snapset to head from " << soid
<< dendl
;
13590 snapset_to_repair
[soid
.get_head()] = *snapset
;
13591 } else if (snapset
->is_legacy()) {
13592 dout(10) << " will convert legacy snapset on " << soid
<< " " << *snapset
13594 snapset_to_repair
[soid
.get_head()] = *snapset
;
13597 stat
.num_legacy_snapsets
++;
13600 // pessimistic assumption that this object might contain a
13602 stat
.num_legacy_snapsets
++;
13605 assert(soid
.is_snap());
13608 assert(soid
.snap
== *curclone
);
13610 dout(20) << __func__
<< " " << mode
<< " matched clone " << soid
<< dendl
;
13612 if (snapset
->clone_size
.count(soid
.snap
) == 0) {
13613 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13614 << " is missing in clone_size";
13615 ++scrubber
.shallow_errors
;
13616 soid_error
.set_size_mismatch();
13618 if (oi
&& oi
->size
!= snapset
->clone_size
[soid
.snap
]) {
13619 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13620 << " size " << oi
->size
<< " != clone_size "
13621 << snapset
->clone_size
[*curclone
];
13622 ++scrubber
.shallow_errors
;
13623 soid_error
.set_size_mismatch();
13626 if (snapset
->clone_overlap
.count(soid
.snap
) == 0) {
13627 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13628 << " is missing in clone_overlap";
13629 ++scrubber
.shallow_errors
;
13630 soid_error
.set_size_mismatch();
13632 // This checking is based on get_clone_bytes(). The first 2 asserts
13633 // can't happen because we know we have a clone_size and
13634 // a clone_overlap. Now we check that the interval_set won't
13635 // cause the last assert.
13636 uint64_t size
= snapset
->clone_size
.find(soid
.snap
)->second
;
13637 const interval_set
<uint64_t> &overlap
=
13638 snapset
->clone_overlap
.find(soid
.snap
)->second
;
13639 bool bad_interval_set
= false;
13640 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
13641 i
!= overlap
.end(); ++i
) {
13642 if (size
< i
.get_len()) {
13643 bad_interval_set
= true;
13646 size
-= i
.get_len();
13649 if (bad_interval_set
) {
13650 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13651 << " bad interval_set in clone_overlap";
13652 ++scrubber
.shallow_errors
;
13653 soid_error
.set_size_mismatch();
13655 stat
.num_bytes
+= snapset
->get_clone_bytes(soid
.snap
);
13660 // migrate legacy_snaps to snapset?
13661 auto p
= snapset_to_repair
.find(soid
.get_head());
13662 if (p
!= snapset_to_repair
.end()) {
13663 if (!oi
|| oi
->legacy_snaps
.empty()) {
13664 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13665 << " has no oi or legacy_snaps; cannot convert "
13667 ++scrubber
.shallow_errors
;
13669 dout(20) << __func__
<< " copying legacy_snaps " << oi
->legacy_snaps
13670 << " to snapset " << p
->second
<< dendl
;
13671 p
->second
.clone_snaps
[soid
.snap
] = oi
->legacy_snaps
;
13677 if (soid_error
.errors
)
13678 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
13681 scrub_cstat
.add(stat
);
13684 if (doing_clones(snapset
, curclone
)) {
13685 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
13686 << " No more objects while processing " << head
.get() << dendl
;
13688 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
13689 pool
.info
.allow_incomplete_clones(), all_clones
, &curclone
,
13692 // There could be missing found by the test above or even
13693 // before dropping out of the loop for the last head.
13695 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
,
13696 mode
, pool
.info
.allow_incomplete_clones());
13698 if (head
&& head_error
.errors
)
13699 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
13701 for (map
<hobject_t
,pair
<uint32_t,uint32_t>>::const_iterator p
=
13702 missing_digest
.begin();
13703 p
!= missing_digest
.end();
13705 if (p
->first
.is_snapdir())
13707 dout(10) << __func__
<< " recording digests for " << p
->first
<< dendl
;
13708 ObjectContextRef obc
= get_object_context(p
->first
, false);
13710 osd
->clog
->error() << info
.pgid
<< " " << mode
13711 << " cannot get object context for "
13714 } else if (obc
->obs
.oi
.soid
!= p
->first
) {
13715 osd
->clog
->error() << info
.pgid
<< " " << mode
13716 << " object " << p
->first
13717 << " has a valid oi attr with a mismatched name, "
13718 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
13721 OpContextUPtr ctx
= simple_opc_create(obc
);
13722 ctx
->at_version
= get_next_version();
13723 ctx
->mtime
= utime_t(); // do not update mtime
13724 ctx
->new_obs
.oi
.set_data_digest(p
->second
.first
);
13725 ctx
->new_obs
.oi
.set_omap_digest(p
->second
.second
);
13726 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
13728 ctx
->register_on_success(
13730 dout(20) << "updating scrub digest" << dendl
;
13731 if (--scrubber
.num_digest_updates_pending
== 0) {
13736 simple_opc_submit(std::move(ctx
));
13737 ++scrubber
.num_digest_updates_pending
;
13739 for (auto& p
: snapset_to_repair
) {
13740 // cache pools may not have the clones, which means we won't know
13741 // what snaps they have. fake out the clone_snaps entries anyway (with
13742 // blank snap lists).
13743 p
.second
.head_exists
= true;
13744 if (pool
.info
.allow_incomplete_clones()) {
13745 for (auto s
: p
.second
.clones
) {
13746 if (p
.second
.clone_snaps
.count(s
) == 0) {
13747 dout(10) << __func__
<< " " << p
.first
<< " faking clone_snaps for "
13749 p
.second
.clone_snaps
[s
];
13753 if (p
.second
.clones
.size() != p
.second
.clone_snaps
.size() ||
13754 p
.second
.is_legacy()) {
13755 // this happens if we encounter other errors above, like a missing
13757 dout(10) << __func__
<< " not writing snapset to " << p
.first
13758 << " snapset " << p
.second
<< " clones " << p
.second
.clones
13759 << "; didn't convert fully" << dendl
;
13760 scrub_cstat
.sum
.num_legacy_snapsets
++;
13763 dout(10) << __func__
<< " writing snapset to " << p
.first
13764 << " " << p
.second
<< dendl
;
13765 ObjectContextRef obc
= get_object_context(p
.first
, true);
13767 osd
->clog
->error() << info
.pgid
<< " " << mode
13768 << " cannot get object context for "
13771 } else if (obc
->obs
.oi
.soid
!= p
.first
) {
13772 osd
->clog
->error() << info
.pgid
<< " " << mode
13773 << " object " << p
.first
13774 << " has a valid oi attr with a mismatched name, "
13775 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
13778 ObjectContextRef snapset_obc
;
13779 if (!obc
->obs
.exists
) {
13780 snapset_obc
= get_object_context(p
.first
.get_snapdir(), false);
13781 if (!snapset_obc
) {
13782 osd
->clog
->error() << info
.pgid
<< " " << mode
13783 << " cannot get object context for "
13784 << p
.first
.get_snapdir();
13788 OpContextUPtr ctx
= simple_opc_create(obc
);
13789 PGTransaction
*t
= ctx
->op_t
.get();
13790 ctx
->snapset_obc
= snapset_obc
;
13791 ctx
->at_version
= get_next_version();
13792 ctx
->mtime
= utime_t(); // do not update mtime
13793 ctx
->new_snapset
= p
.second
;
13794 if (!ctx
->new_obs
.exists
) {
13795 dout(20) << __func__
<< " making " << p
.first
<< " a whiteout" << dendl
;
13796 ctx
->new_obs
.exists
= true;
13797 ctx
->new_snapset
.head_exists
= true;
13798 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
13799 ++ctx
->delta_stats
.num_whiteouts
;
13800 ++ctx
->delta_stats
.num_objects
;
13801 t
->create(p
.first
);
13802 if (p
.first
< scrubber
.start
) {
13803 dout(20) << __func__
<< " kludging around update outside of scrub range"
13806 scrub_cstat
.add(ctx
->delta_stats
);
13809 dout(20) << __func__
<< " final snapset " << ctx
->new_snapset
<< dendl
;
13810 assert(!ctx
->new_snapset
.is_legacy());
13811 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
13812 ctx
->register_on_success(
13814 dout(20) << "updating snapset" << dendl
;
13815 if (--scrubber
.num_digest_updates_pending
== 0) {
13820 simple_opc_submit(std::move(ctx
));
13821 ++scrubber
.num_digest_updates_pending
;
13824 dout(10) << __func__
<< " (" << mode
<< ") finish" << dendl
;
13827 void PrimaryLogPG::_scrub_clear_state()
13829 scrub_cstat
= object_stat_collection_t();
// _scrub_finish: reconcile the stats gathered during scrub (scrub_cstat)
// against the PG's published stats (info.stats), logging and repairing any
// mismatch, then drop cached object contexts.
// NOTE(review): this extraction embeds original line numbers and they are
// non-contiguous, so some source lines are missing from this region;
// verify against the upstream file before editing.
13832 void PrimaryLogPG::_scrub_finish()
// Determine which kind of scrub this was; `mode` labels all log output.
13834 bool repair
= state_test(PG_STATE_REPAIR
);
13835 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
13836 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
// If the published stats were flagged invalid, adopt the scrubbed totals
// outright and re-run cache-agent mode selection.
13838 if (info
.stats
.stats_invalid
) {
13839 info
.stats
.stats
= scrub_cstat
;
13840 info
.stats
.stats_invalid
= false;
13843 agent_choose_mode();
// Debug dump of observed (scrubbed) vs. published counters.
13846 dout(10) << mode
<< " got "
13847 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
13848 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
13849 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
13850 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
13851 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
13852 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
13853 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
13854 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes."
// Compare each counter; categories whose *_stats_invalid flag is set on
// the published stats are excluded from the comparison.
13857 if (scrub_cstat
.sum
.num_objects
!= info
.stats
.stats
.sum
.num_objects
||
13858 scrub_cstat
.sum
.num_object_clones
!= info
.stats
.stats
.sum
.num_object_clones
||
13859 (scrub_cstat
.sum
.num_objects_dirty
!= info
.stats
.stats
.sum
.num_objects_dirty
&&
13860 !info
.stats
.dirty_stats_invalid
) ||
13861 (scrub_cstat
.sum
.num_objects_omap
!= info
.stats
.stats
.sum
.num_objects_omap
&&
13862 !info
.stats
.omap_stats_invalid
) ||
13863 (scrub_cstat
.sum
.num_objects_pinned
!= info
.stats
.stats
.sum
.num_objects_pinned
&&
13864 !info
.stats
.pin_stats_invalid
) ||
13865 (scrub_cstat
.sum
.num_objects_hit_set_archive
!= info
.stats
.stats
.sum
.num_objects_hit_set_archive
&&
13866 !info
.stats
.hitset_stats_invalid
) ||
13867 (scrub_cstat
.sum
.num_bytes_hit_set_archive
!= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
&&
13868 !info
.stats
.hitset_bytes_stats_invalid
) ||
13869 scrub_cstat
.sum
.num_whiteouts
!= info
.stats
.stats
.sum
.num_whiteouts
||
13870 scrub_cstat
.sum
.num_bytes
!= info
.stats
.stats
.sum
.num_bytes
) {
// Mismatch: report both sets of numbers to the cluster log.
13871 osd
->clog
->error() << info
.pgid
<< " " << mode
13872 << " stat mismatch, got "
13873 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
13874 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
13875 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
13876 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
13877 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
13878 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
13879 << scrub_cstat
.sum
.num_whiteouts
<< "/" << info
.stats
.stats
.sum
.num_whiteouts
<< " whiteouts, "
13880 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
13881 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes.";
// Count this as a shallow scrub error, then repair the published stats
// from the scrubbed totals and clear the invalid flags.
13882 ++scrubber
.shallow_errors
;
13886 info
.stats
.stats
= scrub_cstat
;
13887 info
.stats
.dirty_stats_invalid
= false;
13888 info
.stats
.omap_stats_invalid
= false;
13889 info
.stats
.hitset_stats_invalid
= false;
13890 info
.stats
.hitset_bytes_stats_invalid
= false;
13891 publish_stats_to_osd();
// Stats agree except for the legacy-snapset count: update it quietly
// (info-level, not an error).
13894 } else if (scrub_cstat
.sum
.num_legacy_snapsets
!=
13895 info
.stats
.stats
.sum
.num_legacy_snapsets
) {
13896 osd
->clog
->info() << info
.pgid
<< " " << mode
<< " updated num_legacy_snapsets"
13897 << " from " << info
.stats
.stats
.sum
.num_legacy_snapsets
13898 << " -> " << scrub_cstat
.sum
.num_legacy_snapsets
<< "\n";
13899 info
.stats
.stats
.sum
.num_legacy_snapsets
= scrub_cstat
.sum
.num_legacy_snapsets
;
13900 publish_stats_to_osd();
13903 // Clear object context cache to get repair information
13905 object_contexts
.clear();
13908 bool PrimaryLogPG::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
13910 return osd
->check_osdmap_full(missing_on
);
// rep_repair_primary_object: react to a local read error on the primary
// by marking the object missing (so recovery can re-source it from a
// replica), parking the op until the object is readable again, and
// queueing a peering event to kick recovery.
// NOTE(review): several lines are missing from this extraction (e.g. the
// declarations of bv/oi/v and the surrounding error-check branches);
// confirm against the upstream source before editing.
13913 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpRequestRef op
)
13915 // Only supports replicated pools
13916 assert(!pool
.info
.require_rollback());
13917 assert(is_primary());
13919 dout(10) << __func__
<< " " << soid
13920 << " peers osd.{" << actingbackfill
<< "}" << dendl
;
13923 block_for_clean(soid
, op
);
13927 assert(!pg_log
.get_missing().is_missing(soid
));
// Fetch the object_info attr to learn the version we need from a replica.
13931 int r
= get_pgbackend()->objects_get_attr(soid
, OI_ATTR
, &bv
);
13933 // Leave v and try to repair without a version, getting attr failed
13934 dout(0) << __func__
<< ": Need version of replica, objects_get_attr failed: "
13935 << soid
<< " error=" << r
<< dendl
;
13937 bufferlist::iterator bliter
= bv
.begin();
13938 ::decode(oi
, bliter
);
13941 // Leave v as default constructed. This will fail when sent to older OSDs, but
13942 // not much worse than failing here.
13943 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
// Record the object as missing so recovery will pull it from a peer.
13946 missing_loc
.add_missing(soid
, v
, eversion_t());
13947 if (primary_error(soid
, v
)) {
13948 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
13949 // XXX: If we knew that there is no down osd which could include this
13950 // object, it would be nice if we could return EIO here.
13951 // If a "never fail" flag was available, that could be used
13952 // for rbd to NOT return EIO until object marked lost.
13954 // Drop through to save this op in case an osd comes up with the object.
13957 // Restart the op after object becomes readable again
13958 waiting_for_unreadable_object
[soid
].push_back(op
);
13959 op
->mark_delayed("waiting for missing object");
// Only queue one peering/recovery kick per batch of EIO errors.
13961 if (!eio_errors_to_process
) {
13962 eio_errors_to_process
= true;
13963 assert(is_clean());
13964 queue_peering_event(
13966 std::make_shared
<CephPeeringEvt
>(
13967 get_osdmap()->get_epoch(),
13968 get_osdmap()->get_epoch(),
13971 // A prior error must have already cleared clean state and queued recovery
13972 // or a map change has triggered re-peering.
13973 // Not inlining the recovery by calling maybe_kick_recovery(soid);
13974 dout(5) << __func__
<< ": Read error on " << soid
<< ", but already seen errors" << dendl
;
13980 /*---SnapTrimmer Logging---*/
13982 #define dout_prefix *_dout << pg->gen_prefix()
13984 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
13986 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
13989 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
13991 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
13994 /*---SnapTrimmer states---*/
13996 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
13997 << "SnapTrimmer state<" << get_state_name() << ">: ")
// NotTrimming: idle state of the snap-trim state machine; the constructor
// only registers the named state and traces entry.
// NOTE(review): the my_base(ctx) initializer line appears to be missing
// from this extraction — verify against the upstream source.
14000 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
14002 NamedState(context
< SnapTrimmer
>().pg
, "NotTrimming")
14004 context
< SnapTrimmer
>().log_enter(state_name
);
14007 void PrimaryLogPG::NotTrimming::exit()
14009 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
14012 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
14014 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
14015 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
14017 if (!(pg
->is_primary() && pg
->is_active())) {
14018 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
14019 return discard_event();
14021 if (!pg
->is_clean() ||
14022 pg
->snap_trimq
.empty()) {
14023 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
14024 return discard_event();
14026 if (pg
->scrubber
.active
) {
14027 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
14028 return transit
< WaitScrub
>();
14030 return transit
< Trimming
>();
// WaitReservation -> reservation granted: re-check that trimming is still
// possible, pick the lowest snapid in the trim queue, and move on to
// AwaitAsyncWork.
// NOTE(review): the closing "<< dendl;" of the second ldout appears to be
// missing from this extraction.
14034 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
14036 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
14037 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
// Conditions may have changed while we waited for the reservation.
14040 if (!context
< SnapTrimmer
>().can_trim()) {
14041 post_event(KickTrim());
14042 return transit
< NotTrimming
>();
// Trim the first (lowest) snap in the queue.
14045 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
14046 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
14047 << pg
->snap_trimq
.range_start()
14049 return transit
< AwaitAsyncWork
>();
14052 /* AwaitAsyncWork */
// AwaitAsyncWork: queue the PG for snap-trim work and flag the trimming
// state (clearing any previous trim error) so it is visible in pg stats.
// NOTE(review): the my_base(ctx) initializer line appears to be missing
// from this extraction.
14053 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
14055 NamedState(context
< SnapTrimmer
>().pg
, "Trimming/AwaitAsyncWork")
14057 auto *pg
= context
< SnapTrimmer
>().pg
;
14058 context
< SnapTrimmer
>().log_enter(state_name
);
// Hand the PG to the OSD's snap-trim work queue.
14059 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
14060 pg
->state_set(PG_STATE_SNAPTRIM
);
14061 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
14062 pg
->publish_stats_to_osd();
// DoSnapWork: the actual trim step. Collect a batch of objects mapped to
// snap_to_trim, start an async trim repop for each, and transition based
// on the outcome (WaitRepops / WaitRWLock / NotTrimming).
// NOTE(review): multiple lines are missing from this extraction (the
// arguments to get_next_objects_to_trim, some dendl terminators, and the
// close of the register_on_success lambda); verify against upstream.
14065 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
14067 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
14068 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
14069 auto &in_flight
= context
<Trimming
>().in_flight
;
14070 assert(in_flight
.empty());
14072 assert(pg
->is_primary() && pg
->is_active());
// Bail out if trimming became impossible since we were scheduled.
14073 if (!context
< SnapTrimmer
>().can_trim()) {
14074 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
14075 post_event(KickTrim());
14076 return transit
< NotTrimming
>();
14079 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
// Gather up to osd_pg_max_concurrent_snap_trims objects for this snap.
14081 vector
<hobject_t
> to_trim
;
14082 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
14083 to_trim
.reserve(max
);
14084 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
// Any error other than ENOENT from the snap mapper is a bug.
14088 if (r
!= 0 && r
!= -ENOENT
) {
14089 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
14090 << cpp_strerror(r
) << dendl
;
14091 assert(0 == "get_next_objects_to_trim returned an invalid code");
// ENOENT: nothing maps to this snap any more — record it as purged,
// persist the updated PG info, and go back to NotTrimming.
14092 } else if (r
== -ENOENT
) {
14094 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
14096 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
14097 << " to purged_snaps"
14099 pg
->info
.purged_snaps
.insert(snap_to_trim
);
14100 pg
->snap_trimq
.erase(snap_to_trim
);
14101 ldout(pg
->cct
, 10) << "purged_snaps now "
14102 << pg
->info
.purged_snaps
<< ", snap_trimq now "
14103 << pg
->snap_trimq
<< dendl
;
14105 ObjectStore::Transaction t
;
14106 pg
->dirty_big_info
= true;
14107 pg
->write_if_dirty(t
);
14108 int tr
= pg
->osd
->store
->queue_transaction(pg
->osr
.get(), std::move(t
), NULL
);
14111 pg
->share_pg_info();
14112 post_event(KickTrim());
14113 return transit
< NotTrimming
>();
14115 assert(!to_trim
.empty());
// Start an async trim repop per object; -ENOLCK signals a write-lock
// conflict rather than a hard error.
14117 for (auto &&object
: to_trim
) {
14119 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
14121 int error
= pg
->trim_object(in_flight
.empty(), object
, &ctx
);
14123 if (error
== -ENOLCK
) {
14124 ldout(pg
->cct
, 10) << "could not get write lock on obj "
14125 << object
<< dendl
;
14127 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
14128 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
// If some trims were already submitted, let them drain first.
14130 if (!in_flight
.empty()) {
14131 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
14132 return transit
< WaitRepops
>();
14134 if (error
== -ENOLCK
) {
14135 ldout(pg
->cct
, 10) << "waiting for it to clear"
14137 return transit
< WaitRWLock
>();
14139 return transit
< NotTrimming
>();
14143 in_flight
.insert(object
);
// When the last in-flight trim completes, advance the state machine:
// Reset on a recorded trim error, otherwise RepopsComplete.
14144 ctx
->register_on_success(
14145 [pg
, object
, &in_flight
]() {
14146 assert(in_flight
.find(object
) != in_flight
.end());
14147 in_flight
.erase(object
);
14148 if (in_flight
.empty()) {
14149 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
14150 pg
->snap_trimmer_machine
.process_event(Reset());
14152 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
14157 pg
->simple_opc_submit(std::move(ctx
));
14160 return transit
< WaitRepops
>();
// setattr_maybe_cache: record a single xattr write for this object in the
// pending transaction.
// NOTE(review): the remaining parameter lines (and any attr-cache branch)
// are missing from this extraction — verify against upstream.
14163 void PrimaryLogPG::setattr_maybe_cache(
14164 ObjectContextRef obc
,
14170 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
// setattrs_maybe_cache: record a batch of xattr writes for this object in
// the pending transaction.
// NOTE(review): some parameter lines are missing from this extraction.
14173 void PrimaryLogPG::setattrs_maybe_cache(
14174 ObjectContextRef obc
,
14177 map
<string
, bufferlist
> &attrs
)
14179 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
// rmattr_maybe_cache: record removal of a single xattr for this object in
// the pending transaction.
// NOTE(review): the remaining parameter lines are missing from this
// extraction.
14182 void PrimaryLogPG::rmattr_maybe_cache(
14183 ObjectContextRef obc
,
14188 t
->rmattr(obc
->obs
.oi
.soid
, key
);
// getattr_maybe_cache: read one xattr. For erasure-coded (rollback) pools
// the per-object attr cache is consulted first; otherwise the read goes
// straight to the backend.
// NOTE(review): parameter lines and the cache-hit/miss return paths are
// missing from this extraction — verify against upstream.
14191 int PrimaryLogPG::getattr_maybe_cache(
14192 ObjectContextRef obc
,
14196 if (pool
.info
.require_rollback()) {
14197 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
14198 if (i
!= obc
->attr_cache
.end()) {
// Fallback: fetch the attr from the backend store.
14206 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
// getattrs_maybe_cache: read all xattrs of an object. Rollback (EC) pools
// are served from the in-memory attr cache; otherwise the backend is
// queried. When user_only is set, only attrs with a leading '_' are kept,
// renamed with the prefix stripped.
// NOTE(review): parameter lines and parts of the loop body/return are
// missing from this extraction — verify against upstream.
14209 int PrimaryLogPG::getattrs_maybe_cache(
14210 ObjectContextRef obc
,
14211 map
<string
, bufferlist
> *out
,
14215 if (pool
.info
.require_rollback()) {
14217 *out
= obc
->attr_cache
;
14219 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
// Filter down to user-visible attrs ('_'-prefixed), stripping the prefix.
14221 if (out
&& user_only
) {
14222 map
<string
, bufferlist
> tmp
;
14223 for (map
<string
, bufferlist
>::iterator i
= out
->begin();
14226 if (i
->first
.size() > 1 && i
->first
[0] == '_')
14227 tmp
[i
->first
.substr(1, i
->first
.size())].claim(i
->second
);
14234 bool PrimaryLogPG::check_failsafe_full(ostream
&ss
) {
14235 return osd
->check_failsafe_full(ss
);
14238 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
14239 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
// Debug-refs variants of the ref hooks: when PG_DEBUG_REFS is enabled,
// each reference carries an id so leaks can be attributed.
14241 #ifdef PG_DEBUG_REFS
14242 uint64_t get_with_id(PrimaryLogPG
*pg
) { return pg
->get_with_id(); }
14243 void put_with_id(PrimaryLogPG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
// NOTE(review): the matching #endif is not visible in this extraction.
14246 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
14247 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }