]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PrimaryLogPG.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / osd / PrimaryLogPG.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17
18#include "boost/tuple/tuple.hpp"
19#include "boost/intrusive_ptr.hpp"
20#include "PG.h"
f67539c2 21#include "pg_scrubber.h"
7c673cae
FG
22#include "PrimaryLogPG.h"
23#include "OSD.h"
f67539c2 24#include "PrimaryLogScrub.h"
7c673cae
FG
25#include "OpRequest.h"
26#include "ScrubStore.h"
27#include "Session.h"
28#include "objclass/objclass.h"
f67539c2 29#include "osd/ClassHandler.h"
7c673cae 30
f67539c2 31#include "cls/cas/cls_cas_ops.h"
9f95a23c 32#include "common/ceph_crypto.h"
7c673cae
FG
33#include "common/errno.h"
34#include "common/scrub_types.h"
35#include "common/perf_counters.h"
36
37#include "messages/MOSDOp.h"
38#include "messages/MOSDBackoff.h"
7c673cae
FG
39#include "messages/MOSDPGTrim.h"
40#include "messages/MOSDPGScan.h"
41#include "messages/MOSDRepScrub.h"
42#include "messages/MOSDPGBackfill.h"
43#include "messages/MOSDPGBackfillRemove.h"
44#include "messages/MOSDPGUpdateLogMissing.h"
45#include "messages/MOSDPGUpdateLogMissingReply.h"
46#include "messages/MCommandReply.h"
47#include "messages/MOSDScrubReserve.h"
7c673cae
FG
48#include "common/EventTrace.h"
49
50#include "common/config.h"
51#include "include/compat.h"
52#include "mon/MonClient.h"
53#include "osdc/Objecter.h"
54#include "json_spirit/json_spirit_value.h"
55#include "json_spirit/json_spirit_reader.h"
11fdf7f2 56#include "include/ceph_assert.h" // json_spirit clobbers it
7c673cae
FG
57#include "include/rados/rados_types.hpp"
58
59#ifdef WITH_LTTNG
60#include "tracing/osd.h"
61#else
62#define tracepoint(...)
63#endif
64
65#define dout_context cct
66#define dout_subsys ceph_subsys_osd
67#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
68#undef dout_prefix
69#define dout_prefix _prefix(_dout, this)
7c673cae
FG
70
71#include <sstream>
72#include <utility>
73
74#include <errno.h>
f67539c2
TL
75#ifdef HAVE_JAEGER
76#include "common/tracer.h"
77#endif
78
79#include <common/CDC.h>
7c673cae
FG
80
81MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
82
f67539c2
TL
83using std::list;
84using std::ostream;
85using std::pair;
86using std::make_pair;
87using std::map;
88using std::ostringstream;
89using std::set;
90using std::string;
91using std::string_view;
92using std::stringstream;
93using std::unique_ptr;
94using std::vector;
95
96using ceph::bufferlist;
97using ceph::bufferptr;
98using ceph::Formatter;
99using ceph::decode;
100using ceph::decode_noclear;
101using ceph::encode;
102using ceph::encode_destructively;
103
9f95a23c 104using namespace ceph::osd::scheduler;
f67539c2
TL
105using TOPNSPC::common::cmd_getval;
106
// dout prefix helper: delegates to the PG's own gen_prefix() so every log
// line from this file is tagged with the PG's identity.
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return pg->gen_prefix(*_dout);
}
7c673cae 111
7c673cae
FG
/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// virtual destructor so implementations are destroyed correctly
  /// through a CopyCallback pointer
  ~CopyCallback() override {}
};
135
/**
 * Wraps a GenContext so that it only runs if the PG has not been reset
 * since epoch @c e.  finish() takes the PG lock before checking; if the
 * PG was reset the wrapped context is dropped (c.reset()), otherwise it
 * is completed.
 */
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;           // keeps the PG alive until completion
  unique_ptr<GenContext<T>> c;  // the wrapped context (owned)
  epoch_t e;                    // epoch at which the wrap was created
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();                 // PG reset: discard stale completion
    else
      c.release()->complete(t);  // complete() deletes the context itself
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};
157
// Wrap a recovery GenContext in a BlessedGenContext pinned to the current
// osdmap epoch, so it is discarded if the PG resets before it runs.
GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}
163
/**
 * Same epoch-guarded wrapper as BlessedGenContext, but finish() does NOT
 * take the PG lock — for callers that already hold it or run in a context
 * where locking is unnecessary.
 */
template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;           // keeps the PG alive until completion
  unique_ptr<GenContext<T>> c;  // the wrapped context (owned)
  epoch_t e;                    // epoch at which the wrap was created
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();                 // PG reset: discard stale completion
    else
      c.release()->complete(t);  // complete() deletes the context itself
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};
184
// Epoch-guarded wrap like bless_gencontext(), but the resulting context
// does not take the PG lock when it fires.
GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}
190
/**
 * Context counterpart of BlessedGenContext: completes the wrapped Context
 * under the PG lock only if the PG has not been reset since epoch @c e.
 */
class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;     // keeps the PG alive until completion
  unique_ptr<Context> c;  // the wrapped context (owned)
  epoch_t e;              // epoch at which the wrap was created
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();                 // PG reset: discard stale completion
    else
      c.release()->complete(r);  // complete() deletes the context itself
  }
  bool sync_finish(int r) override {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};
211
// Wrap a plain Context in a BlessedContext pinned to the current osdmap
// epoch, so it is dropped if the PG resets before it runs.
Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap_epoch());
}
215
// Completion that notifies the PG when an ObjectContext is being torn
// down; the return code is ignored.
class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;   // keeps the PG alive until the callback fires
  ObjectContext *obc;   // context being destructed (not owned here)
  public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};
226
7c673cae
FG
// Completion for a batch of async reads issued by
// OpContext::start_async_reads(); forwards to OpContext::finish_read().
// The read result code is ignored here — read failures are handled later
// by the op finisher (see OpContext::finish_read).
struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};
238
// Transaction on-applied callback for a recovered object on the primary.
// sync_finish() runs without re-taking the PG lock; finish() takes it.
class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;  // recovered object's context (shared ownership)
  public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object(obc);
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object(obc);
  }
};
254
// Transaction on-commit callback for a pushed object: reports the epoch
// and last_complete captured at registration time back to the PG.
class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;            // osdmap epoch when the push was registered
  eversion_t last_complete; // PG last_complete at registration
  public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};
268
// Transaction on-applied callback for a recovered object on a replica.
// Like C_OSD_AppliedRecoveredObject but without an obc (replicas don't
// track one here).
class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
  public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object_replica();
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object_replica();
  }
};
283
284// OpContext
// OpContext

// Hand the accumulated pending_async_reads to the backend in one batch.
// inflightreads is set to 1 because the whole batch completes via a
// single OnReadComplete callback.
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);  // drain the queue; backend owns the batch
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
// Called when an async read batch completes.  When the last outstanding
// batch finishes, pops this context off the PG's in-progress FIFO (it
// must be at the front — reads complete in issue order) and re-executes
// the op.
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
310
// CopyCallback used by the COPY_FROM op: when the copy completes, either
// re-executes the op context (success) or replies/requeues and closes it
// (failure/cancel).
class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;
  uint32_t truncate_seq;
  uint64_t truncate_size;
  bool have_truncate = false;  // true once set_truncate() was called

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // Only use truncate_{seq,size} from the original object if the client
    // did not send us these parameters
    if (!have_truncate) {
      truncate_seq = results->truncate_seq;
      truncate_size = results->truncate_size;
    }

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  // NOTE(review): accessors below dereference `results`, so they are only
  // valid after finish() has run.
  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  // Record client-supplied truncate parameters; suppresses the fallback
  // to the source object's truncate_{seq,size} in finish().
  void set_truncate(uint32_t seq, uint64_t size) {
    truncate_seq = seq;
    truncate_size = size;
    have_truncate = true;
  }
};
365
// OpFinisher that completes a COPY_FROM by calling finish_copyfrom()
// with the captured callback.
struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};
379
380// ======================
381// PGBackend::Listener
382
// ======================
// PGBackend::Listener

/**
 * Called when an object has been recovered locally.  Updates the snap
 * mapper for recovered clones, handles the LOST_REVERT special case,
 * records the recovery in recovery_state, and registers on-applied /
 * on-commit callbacks on the transaction.
 *
 * @param hoid           object recovered
 * @param _recovery_info recovery metadata (copied; may be adjusted below)
 * @param obc            object context (may be null for deletes/replicas)
 * @param is_delete      true if the "recovery" was a delete
 * @param t              transaction the caller will queue
 */
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    // re-register the clone's snaps with the snap mapper
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  // LOST_REVERT: if we recovered the version we were reverting to, stamp
  // the object with the revert event's version instead.
  if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
      recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  recovery_state.recover_got(
    recovery_info.soid,
    recovery_info.version,
    is_delete,
    *t);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));
}
478
/**
 * Called when an object has been recovered on all replicas: drops the
 * recovery read lock (requeueing blocked ops), clears backfill/recovery
 * tracking for the object, and requeues any degraded/unreadable waiters.
 */
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  recovery_state.object_recovered(soid, stat_diff);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  auto i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}
518
7c673cae
FG
// Queue a recovery work item on the OSD's recovery queue for this PG.
void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->queue_recovery_context(this, c);
}
524
9f95a23c
TL
// On a replica, drop cached object contexts for every object touched by
// the log entries of a repop.  The transaction argument is unused here;
// it is part of the listener interface.
void PrimaryLogPG::replica_clear_repop_obc(
  const vector<pg_log_entry_t> &logv,
  ObjectStore::Transaction &t)
{
  for (auto &&e: logv) {
    /* Have to blast all clones, they share a snapset */
    object_contexts.clear_range(
      e.soid.get_object_boundary(), e.soid.get_head());
    ceph_assert(
      snapset_contexts.find(e.soid.get_head()) ==
      snapset_contexts.end());
  }
}
538
11fdf7f2
TL
/**
 * Decide whether a repop for @p hoid should carry a real transaction to
 * @p peer, or an empty one.  An empty op is shipped when the object is
 * beyond the peer's backfill horizon, or when the object is still
 * pending recovery on an async recovery target.
 */
bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(recovery_state.has_peer_info(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= recovery_state.get_peer_info(peer).last_backfill;
  if (!should_send) {
    // only backfill targets can legitimately be behind the horizon
    ceph_assert(is_backfill_target(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " beyond std::max(last_backfill_started "
             << ", peer_info[peer].last_backfill "
             << recovery_state.get_peer_info(peer).last_backfill
             << ")" << dendl;
    return should_send;
  }
  if (is_async_recovery_target(peer) &&
      recovery_state.get_peer_missing(peer).is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}
568
569
7c673cae
FG
// Fetch a cluster-network connection to another OSD via the OSD service.
ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}
575
// Expose the OSD's perf counters to the backend.
PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}
580
581
582// ====================
583// missing objects
584
// ====================
// missing objects

// True if the object appears in this PG's own missing set.
bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return recovery_state.get_pg_log().get_missing().get_items().count(soid);
}
589
/**
 * If @p soid needs recovery and is neither already recovering nor
 * unfound, start a high-priority recovery op for it (local recovery,
 * replica delete, or replica push as appropriate).
 */
void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
    } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
  }
}
616
// Park an op behind an unreadable (missing) object, kicking recovery of
// the object first so the wait is bounded.
void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}
625
7c673cae
FG
/**
 * True if @p soid is degraded (missing locally or on a non-async peer)
 * or currently being backfilled.  Objects missing only on async recovery
 * targets do NOT count — ops proceed and the object is recovered later.
 */
bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
    return true;
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
       i != get_acting_recovery_backfill().end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
    // If an object is missing on an async_recovery_target, return false.
    // This will not block the op and the object is async recovered later.
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      if (is_async_recovery_target(peer))
        continue;
      else
        return true;
    }
    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_target(peer) &&
        recovery_state.get_peer_info(peer).last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}
662
11fdf7f2
TL
663bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
664{
9f95a23c
TL
665 for (auto &i: get_async_recovery_targets()) {
666 auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
667 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
11fdf7f2
TL
668 peer_missing_entry->second.get_items().count(soid)) {
669 dout(30) << __func__ << " " << soid << dendl;
670 return true;
671 }
672 }
673 return false;
674}
675
7c673cae
FG
// Park an op behind a degraded object, kicking its recovery first.
void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}
684
// Block a write because the cache tier is full.  Tracking is per head
// object; the op is queued on the PG-wide waiting_for_cache_not_full list.
void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}
695
224ce89b
WB
// Park an op until the PG is clean enough for primary repair to proceed.
// The oid is only used for logging; waiters are queued PG-wide.
void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}
704
7c673cae
FG
// Block a write to a head object while one of its snaps is being
// promoted; the op waits on the promoting snap object.
void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}
716
// Block a write to a head object while one of its snaps is degraded;
// the op waits (and kicks recovery) on the degraded snap object.
void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}
727
11fdf7f2 728bool PrimaryLogPG::maybe_await_blocked_head(
7c673cae
FG
729 const hobject_t &hoid,
730 OpRequestRef op)
731{
732 ObjectContextRef obc;
733 obc = object_contexts.lookup(hoid.get_head());
734 if (obc) {
735 if (obc->is_blocked()) {
736 wait_for_blocked_object(obc->obs.oi.soid, op);
737 return true;
738 } else {
739 return false;
740 }
741 }
7c673cae
FG
742 return false;
743}
744
// Queue an op behind a blocked object; requeued when the object unblocks.
void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}
751
/**
 * If the PG log has grown past osd_max_pg_log_entries *
 * osd_force_recovery_pg_log_entries_factor while degraded or
 * recovering/backfilling, find the oldest missing object across the
 * local and all peer missing sets and kick its recovery, so the log can
 * eventually be trimmed.
 */
void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (recovery_state.get_pg_log().get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = recovery_state.get_pg_log().get_log().head.version;
  hobject_t soid;
  if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
    min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
    soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
       it != get_acting_recovery_backfill().end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = recovery_state.get_peer_missing().find(peer);
    if (it_missing != recovery_state.get_peer_missing().end() &&
        !it_missing->second.get_rmissing().empty()) {
      // rmissing is keyed by version, so begin() is that peer's oldest
      const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
               << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
        min_version = min_obj->first;
        soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
798
9f95a23c
TL
/**
 * Check whether this PG is currently readable with respect to its lease.
 * Returns true if the op may proceed.  Otherwise the op is queued on
 * waiting_for_readable (or, on a non-primary, rejected with -EAGAIN) and
 * false is returned; the PG enters the LAGGY state if its readable_until
 * has passed.  PGs without full SERVER_OCTOPUS support skip the check.
 */
bool PrimaryLogPG::check_laggy(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
    return true;
  }
  if (state_test(PG_STATE_WAIT)) {
    dout(10) << __func__ << " PG is WAIT state" << dendl;
  } else if (!state_test(PG_STATE_LAGGY)) {
    auto mnow = osd->get_mnow();
    auto ru = recovery_state.get_readable_until();
    if (mnow <= ru) {
      // not laggy
      return true;
    }
    dout(10) << __func__
             << " mnow " << mnow
             << " > readable_until " << ru << dendl;

    if (!is_primary()) {
      // replicas cannot queue; tell the client to retry elsewhere/later
      osd->reply_op_error(op, -EAGAIN);
      return false;
    }

    // go to laggy state
    state_set(PG_STATE_LAGGY);
    publish_stats_to_osd();
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_back(op);
  op->mark_delayed("waiting for readable");
  return false;
}
7c673cae 833
9f95a23c 834bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
7c673cae 835{
9f95a23c
TL
836 if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
837 SERVER_OCTOPUS)) {
838 return true;
7c673cae 839 }
9f95a23c
TL
840 if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
841 return true; // not laggy
842 }
843 dout(10) << __func__ << " not readable" << dendl;
844 waiting_for_readable.push_front(op);
845 op->mark_delayed("waiting for readable");
7c673cae
FG
846 return false;
847}
848
/**
 * Re-evaluate the WAIT / LAGGY readability states against the current
 * monotonic clock, clearing whichever no longer applies, publishing
 * stats if anything changed, and requeueing waiting_for_readable once
 * the PG is readable again.
 */
void PrimaryLogPG::recheck_readable()
{
  if (!is_wait() && !is_laggy()) {
    dout(20) << __func__ << " wasn't wait or laggy" << dendl;
    return;
  }
  auto mnow = osd->get_mnow();
  bool pub = false;
  if (is_wait()) {
    // WAIT clears once the prior interval's readable_until upper bound passes
    auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
    if (mnow < prior_readable_until_ub) {
      dout(10) << __func__ << " still wait (mnow " << mnow
               << " < prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer wait (mnow " << mnow
               << " >= prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
      state_clear(PG_STATE_WAIT);
      recovery_state.clear_prior_readable_until_ub();
      pub = true;
    }
  }
  if (is_laggy()) {
    // LAGGY clears once we hold a lease extending beyond now
    auto ru = recovery_state.get_readable_until();
    if (ru == ceph::signedspan::zero()) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << ", readable_until zero)" << dendl;
    } else if (mnow >= ru) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << " >= readable_until " << ru << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer laggy (mnow " << mnow
               << " < readable_until " << ru << ")" << dendl;
      state_clear(PG_STATE_LAGGY);
      pub = true;
    }
  }
  if (pub) {
    publish_stats_to_osd();
  }
  if (!is_laggy() && !is_wait()) {
    requeue_ops(waiting_for_readable);
  }
}
894
9f95a23c 895bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
7c673cae
FG
896{
897 bufferlist bl;
898
899 // If filter has expressed an interest in an xattr, load it.
9f95a23c 900 if (!filter.get_xattr().empty()) {
7c673cae
FG
901 int ret = pgbackend->objects_get_attr(
902 sobj,
9f95a23c 903 filter.get_xattr(),
7c673cae 904 &bl);
9f95a23c 905 dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
7c673cae 906 if (ret < 0) {
9f95a23c 907 if (ret != -ENODATA || filter.reject_empty_xattr()) {
7c673cae
FG
908 return false;
909 }
910 }
911 }
912
9f95a23c 913 return filter.filter(sobj, bl);
7c673cae
FG
914}
915
9f95a23c
TL
/**
 * Decode and construct the PGLS filter named in the client-supplied
 * stream.  "plain" maps to the built-in PGLSPlainFilter; any other type
 * must be of the form "<class>.<filter>" and is resolved through the
 * object-class handler.  The returned filter has been init()'d from the
 * remainder of the iterator.
 *
 * @return {0, filter} on success; {-EINVAL or -EPERM, nullptr} on error.
 */
std::pair<int, std::unique_ptr<const PGLSFilter>>
PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
{
  string type;
  // storing non-const PGLSFilter for the sake of ::init()
  std::unique_ptr<PGLSFilter> filter;

  try {
    decode(type, iter);
  }
  catch (ceph::buffer::error& e) {
    return { -EINVAL, nullptr };
  }

  if (type.compare("plain") == 0) {
    filter = std::make_unique<PGLSPlainFilter>();
  } else {
    // expect "<class>.<filter>" with both parts non-empty
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return { -EINVAL, nullptr };
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = ClassHandler::get_instance().open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return { r, nullptr };
    } else {
      ceph_assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return { -EINVAL, nullptr };
    }
    filter.reset(class_filter->fn());
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
           "filter " << filter_name << dendl;
      return { -EINVAL, nullptr };
    }
  }

  ceph_assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    return { -EINVAL, nullptr };
  } else {
    // Successfully constructed and initialized, return it.
    return std::make_pair(0, std::move(filter));
  }
}
979
980
981// ==========================================================
982
9f95a23c
TL
/**
 * Handle an administrative command directed at this PG
 * ("ceph pg <pgid> ..." / "ceph tell <pgid> ...").
 *
 * Supported prefixes: "query", "mark_unfound_lost", "list_unfound",
 * "scrub", "deep_scrub".  Results are returned through @c on_finish as
 * (ret, error-string, output-buffer); if no raw output was produced, the
 * formatter contents are flushed into the buffer instead.
 *
 * @param orig_prefix  command prefix as routed by the monitor/OSD
 * @param cmdmap       parsed command arguments
 * @param idata        raw input data (unused by the commands handled here)
 * @param on_finish    completion callback; invoked exactly once, except that
 *                     "mark_unfound_lost" hands it off asynchronously
 */
void PrimaryLogPG::do_command(
  const string_view& orig_prefix,
  const cmdmap_t& cmdmap,
  const bufferlist& idata,
  std::function<void(int,const std::string&,bufferlist&)> on_finish)
{
  string format;
  cmd_getval(cmdmap, "format", format);
  std::unique_ptr<Formatter> f(Formatter::create(
    format, "json-pretty", "json-pretty"));
  int ret = 0;
  stringstream ss;   // stderr error message stream
  bufferlist outbl;  // if empty at end, we'll dump formatter as output

  // get final prefix:
  // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
  // - ceph tell <pgid> foo -> prefix=foo
  string prefix(orig_prefix);
  string command;
  cmd_getval(cmdmap, "cmd", command);
  if (command.size()) {
    prefix = command;
  }

  if (prefix == "query") {
    // Dump the full peering/recovery state of this PG as structured output.
    f->open_object_section("pg");
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    recovery_state.dump_peering_state(f.get());

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    // Scrub state is only meaningful on an active primary.
    if (is_primary() && is_active() && m_scrubber) {
      m_scrubber->dump(f.get());
    }

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
  }

  else if (prefix == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      // Reverting to a prior version is impossible on EC pools.
      if (pool.info.is_erasure()) {
	ss << "mode must be 'delete' for ec pool";
	ret = -EINVAL;
	goto out;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      ret = -EINVAL;
      goto out;
    }
    ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
		mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      ret = -EROFS;
      goto out;
    }

    uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      goto out;  // make command idempotent
    }

    // Refuse to declare objects lost while potential sources remain unprobed.
    if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
	 << " unfound objects but we haven't probed all sources, not marking lost";
      ret = -EINVAL;
      goto out;
    }

    // Asynchronous: mark_all_unfound_lost takes ownership of on_finish.
    mark_all_unfound_lost(mode, on_finish);
    return;
  }

  else if (prefix == "list_unfound") {
    hobject_t offset;
    string offset_json;
    bool show_offset = false;
    if (cmd_getval(cmdmap, "offset", offset_json)) {
      // Optional pagination cursor, encoded as a JSON hobject_t.
      json_spirit::Value v;
      try {
	if (!json_spirit::read(offset_json, v))
	  throw std::runtime_error("bad json");
	offset.decode(v);
      } catch (std::runtime_error& e) {
	ss << "error parsing offset: " << e.what();
	ret = -EINVAL;
	goto out;
      }
      show_offset = true;
    }
    f->open_object_section("missing");
    if (show_offset) {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    auto &needs_recovery_map = recovery_state.get_missing_loc()
      .get_needs_recovery();
    f->dump_int("num_missing", needs_recovery_map.size());
    f->dump_int("num_unfound", get_num_unfound());
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      // Emit up to osd_command_max_records unfound objects past the cursor.
      for (; p != needs_recovery_map.end() &&
	     num < cct->_conf->osd_command_max_records;
	   ++p) {
	if (recovery_state.get_missing_loc().is_unfound(p->first)) {
	  f->open_object_section("object");
	  {
	    f->open_object_section("oid");
	    p->first.dump(f.get());
	    f->close_section();
	  }
	  p->second.dump(f.get()); // have, need keys
	  {
	    f->open_array_section("locations");
	    for (auto &&r : recovery_state.get_missing_loc().get_locations(
		   p->first)) {
	      f->dump_stream("shard") << r;
	    }
	    f->close_section();
	  }
	  f->close_section();
	  num++;
	}
      }
      f->close_section();
    }
    // Get possible locations of missing objects from pg information
    PeeringState::QueryUnfound q(f.get());
    recovery_state.handle_event(q, 0);
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
  }

  else if (prefix == "scrub" ||
	   prefix == "deep_scrub") {
    bool deep = (prefix == "deep_scrub");
    int64_t time;
    cmd_getval(cmdmap, "time", time, (int64_t)0);

    if (is_primary()) {
      const pg_pool_t *p = &pool.info;
      double pool_scrub_max_interval = 0;
      double scrub_max_interval;
      // Pool-level interval overrides the global config when set (> 0).
      if (deep) {
	p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
	scrub_max_interval = pool_scrub_max_interval > 0 ?
	  pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
      } else {
	p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
	scrub_max_interval = pool_scrub_max_interval > 0 ?
	  pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
      }
      // Instead of marking must_scrub force a schedule scrub
      utime_t stamp = ceph_clock_now();
      if (time == 0)
	stamp -= scrub_max_interval;
      else
	stamp -= (float)time;
      stamp -= 100.0; // push back last scrub more for good measure
      if (deep) {
	set_last_deep_scrub_stamp(stamp);
      } else {
	set_last_scrub_stamp(stamp);
      }
      f->open_object_section("result");
      f->dump_bool("deep", deep);
      f->dump_stream("stamp") << stamp;
      f->close_section();
    } else {
      ss << "Not primary";
      ret = -EPERM;
    }
    outbl.append(ss.str());
  }

  else {
    ret = -ENOSYS;
    ss << "prefix '" << prefix << "' not implemented";
  }

 out:
  // On success with no raw output, serialize the formatter into outbl.
  if (ret >= 0 && outbl.length() == 0) {
    f->flush(outbl);
  }
  on_finish(ret, ss.str(), outbl);
}
1190
9f95a23c 1191
7c673cae
FG
1192// ==========================================================
1193
/**
 * Execute PG-wide (non-object) operations carried by a client MOSDOp:
 * PGLS/PGNLS listings (optionally filtered), hit-set listing/retrieval,
 * and scrub error listing.  A reply is sent to the client before returning,
 * except when an archived hit-set read must wait for object recovery.
 *
 * @param op  client request containing one or more PG ops
 */
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    std::unique_ptr<const PGLSFilter> filter;
    OSDOp& osd_op = *p;
    auto bp = p->indata.cbegin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      // Decode the filter description, construct the filter, then fall
      // through to the unfiltered PGNLS path which consults it.
      try {
	decode(cname, bp);
	decode(mname, bp);
      }
      catch (const ceph::buffer::error& e) {
	dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
	result = -EINVAL;
	break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
	break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
	result = -EINVAL;
	break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
	dout(10) << " pgnls pg=" << m->get_pg()
		 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
		 << " != " << info.pgid << dendl;
	result = 0; // hmm?
      } else {
	unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
						p->op.pgls.count);

	dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
		 << dendl;
	// read into a buffer
	vector<hobject_t> sentries;
	pg_nls_response_t response;
	try {
	  decode(response.handle, bp);
	}
	catch (const ceph::buffer::error& e) {
	  dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
	  result = -EINVAL;
	  break;
	}

	hobject_t next;
	hobject_t lower_bound = response.handle;
	hobject_t pg_start = info.pgid.pgid.get_hobj_start();
	hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
	dout(10) << " pgnls lower_bound " << lower_bound
		 << " pg_end " << pg_end << dendl;
	if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
	     (lower_bound != hobject_t() && lower_bound < pg_start))) {
	  // this should only happen with a buggy client.
	  dout(10) << "outside of PG bounds " << pg_start << " .. "
		   << pg_end << dendl;
	  result = -EINVAL;
	  break;
	}

	hobject_t current = lower_bound;
	int r = pgbackend->objects_list_partial(
	  current,
	  list_size,
	  list_size,
	  &sentries,
	  &next);
	if (r != 0) {
	  result = -EINVAL;
	  break;
	}

	// Merge the backend listing with the missing set so that objects
	// not yet recovered locally still appear in the listing.
	map<hobject_t, pg_missing_item>::const_iterator missing_iter =
	  recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
	vector<hobject_t>::iterator ls_iter = sentries.begin();
	hobject_t _max = hobject_t::get_max();
	while (1) {
	  const hobject_t &mcand =
	    missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
	    _max :
	    missing_iter->first;
	  const hobject_t &lcand =
	    ls_iter == sentries.end() ?
	    _max :
	    *ls_iter;

	  hobject_t candidate;
	  if (mcand == lcand) {
	    candidate = mcand;
	    if (!mcand.is_max()) {
	      ++ls_iter;
	      ++missing_iter;
	    }
	  } else if (mcand < lcand) {
	    candidate = mcand;
	    ceph_assert(!mcand.is_max());
	    ++missing_iter;
	  } else {
	    candidate = lcand;
	    ceph_assert(!lcand.is_max());
	    ++ls_iter;
	  }

	  dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
		   << " vs lower bound 0x" << lower_bound.get_hash()
		   << std::dec << dendl;

	  if (candidate >= next) {
	    break;
	  }

	  if (response.entries.size() == list_size) {
	    next = candidate;
	    break;
	  }

	  // List only head objects (no clones).
	  if (candidate.snap != CEPH_NOSNAP)
	    continue;

	  // skip internal namespace
	  if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
	    continue;

	  if (recovery_state.get_missing_loc().is_deleted(candidate))
	    continue;

	  // skip wrong namespace
	  if (m->get_hobj().nspace != librados::all_nspaces &&
	      candidate.get_namespace() != m->get_hobj().nspace)
	    continue;

	  if (filter && !pgls_filter(*filter, candidate))
	    continue;

	  dout(20) << "pgnls item 0x" << std::hex
		   << candidate.get_hash()
		   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
		   << std::dec << " "
		   << candidate.oid.name << dendl;

	  librados::ListObjectImpl item;
	  item.nspace = candidate.get_namespace();
	  item.oid = candidate.oid.name;
	  item.locator = candidate.get_key();
	  response.entries.push_back(item);
	}

	if (next.is_max() &&
	    missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
	    ls_iter == sentries.end()) {
	  result = 1;

	  // Set response.handle to the start of the next PG according
	  // to the object sort order.
	  response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
	} else {
	  response.handle = next;
	}
	dout(10) << "pgnls handle=" << response.handle << dendl;
	encode(response, osd_op.outdata);
	dout(10) << " pgnls result=" << result << " outdata.length()="
		 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      // Legacy (non-namespace) listing variant of PGNLS_FILTER above.
      try {
	decode(cname, bp);
	decode(mname, bp);
      }
      catch (const ceph::buffer::error& e) {
	dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
	result = -EINVAL;
	break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
	break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
	result = -EINVAL;
	break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
	dout(10) << " pgls pg=" << m->get_pg()
		 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
		 << " != " << info.pgid << dendl;
	result = 0; // hmm?
      } else {
	unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
						p->op.pgls.count);

	dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
	// read into a buffer
	vector<hobject_t> sentries;
	pg_ls_response_t response;
	try {
	  decode(response.handle, bp);
	}
	catch (const ceph::buffer::error& e) {
	  dout(0) << "unable to decode PGLS handle in " << *m << dendl;
	  result = -EINVAL;
	  break;
	}

	hobject_t next;
	hobject_t current = response.handle;
	int r = pgbackend->objects_list_partial(
	  current,
	  list_size,
	  list_size,
	  &sentries,
	  &next);
	if (r != 0) {
	  result = -EINVAL;
	  break;
	}

	ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());

	// Same merge of backend listing with the missing set as PGNLS.
	map<hobject_t, pg_missing_item>::const_iterator missing_iter =
	  recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
	vector<hobject_t>::iterator ls_iter = sentries.begin();
	hobject_t _max = hobject_t::get_max();
	while (1) {
	  const hobject_t &mcand =
	    missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
	    _max :
	    missing_iter->first;
	  const hobject_t &lcand =
	    ls_iter == sentries.end() ?
	    _max :
	    *ls_iter;

	  hobject_t candidate;
	  if (mcand == lcand) {
	    candidate = mcand;
	    if (!mcand.is_max()) {
	      ++ls_iter;
	      ++missing_iter;
	    }
	  } else if (mcand < lcand) {
	    candidate = mcand;
	    ceph_assert(!mcand.is_max());
	    ++missing_iter;
	  } else {
	    candidate = lcand;
	    ceph_assert(!lcand.is_max());
	    ++ls_iter;
	  }

	  if (candidate >= next) {
	    break;
	  }

	  if (response.entries.size() == list_size) {
	    next = candidate;
	    break;
	  }

	  if (candidate.snap != CEPH_NOSNAP)
	    continue;

	  // skip wrong namespace
	  if (candidate.get_namespace() != m->get_hobj().nspace)
	    continue;

	  if (recovery_state.get_missing_loc().is_deleted(candidate))
	    continue;

	  if (filter && !pgls_filter(*filter, candidate))
	    continue;

	  response.entries.push_back(make_pair(candidate.oid,
					       candidate.get_key()));
	}
	if (next.is_max() &&
	    missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
	    ls_iter == sentries.end()) {
	  result = 1;
	}
	response.handle = next;
	encode(response, osd_op.outdata);
	dout(10) << " pgls result=" << result << " outdata.length()="
		 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
	// Report the time ranges of archived hit sets plus the open one.
	list< pair<utime_t,utime_t> > ls;
	for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
	     p != info.hit_set.history.end();
	     ++p)
	  ls.push_back(make_pair(p->begin, p->end));
	if (hit_set)
	  ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
	encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
	utime_t stamp(osd_op.op.hit_set_get.stamp);
	if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
	  // read the current in-memory HitSet, not the version we've
	  // checkpointed.
	  if (!hit_set) {
	    result= -ENOENT;
	    break;
	  }
	  encode(*hit_set, osd_op.outdata);
	  result = osd_op.outdata.length();
	} else {
	  // read an archived HitSet.
	  hobject_t oid;
	  for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
	       p != info.hit_set.history.end();
	       ++p) {
	    if (stamp >= p->begin && stamp <= p->end) {
	      oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
	      break;
	    }
	  }
	  if (oid == hobject_t()) {
	    result = -ENOENT;
	    break;
	  }
	  if (!pool.info.is_replicated()) {
	    // FIXME: EC not supported yet
	    result = -EOPNOTSUPP;
	    break;
	  }
	  if (is_unreadable_object(oid)) {
	    // Requeue behind recovery; no reply is sent now — the op will
	    // be re-executed once the object becomes readable.
	    wait_for_unreadable_object(oid, op);
	    return;
	  }
	  result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
	}
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
				       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
				       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
}
1584
9f95a23c 1585int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
7c673cae
FG
1586{
1587 if (m->get_pg() != info.pgid.pgid) {
1588 dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1589 return -EINVAL; // hmm?
1590 }
11fdf7f2 1591 auto bp = osd_op->indata.cbegin();
7c673cae
FG
1592 scrub_ls_arg_t arg;
1593 try {
1594 arg.decode(bp);
f67539c2 1595 } catch (ceph::buffer::error&) {
7c673cae
FG
1596 dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1597 return -EINVAL;
1598 }
f67539c2 1599
7c673cae
FG
1600 int r = 0;
1601 scrub_ls_result_t result = {.interval = info.history.same_interval_since};
f67539c2 1602
7c673cae
FG
1603 if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1604 r = -EAGAIN;
7c673cae 1605 } else {
f67539c2
TL
1606 bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
1607 if (store_queried) {
1608 encode(result, osd_op->outdata);
1609 } else {
1610 // the scrubber's store is not initialized
1611 r = -ENOENT;
1612 }
7c673cae 1613 }
f67539c2 1614
7c673cae
FG
1615 return r;
1616}
1617
f67539c2
TL
/**
 * Releases locks
 *
 * Hands all locks held by the manager back, then requeues any ops that were
 * blocked on those locks.  Ops blocked by an in-progress scrub go to the
 * front of the scrub wait queue; ops arriving while the PG is laggy go to
 * the readable wait queue; everything else is requeued directly.  May also
 * kick recovery and snap trimming if the release unblocked them.
 *
 * @param manager [in] manager with locks to release
 */
void PrimaryLogPG::release_object_locks(
  ObcLockManager &lock_manager) {
  std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
  bool requeue_recovery = false;
  bool requeue_snaptrim = false;
  lock_manager.put_locks(
    &to_req,
    &requeue_recovery,
    &requeue_snaptrim);
  if (requeue_recovery)
    queue_recovery();
  if (requeue_snaptrim)
    snap_trimmer_machine.process_event(TrimWriteUnblocked());

  if (!to_req.empty()) {
    // requeue at front of scrub blocking queue if we are blocked by scrub
    for (auto &&p: to_req) {
      if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
	for (auto& op : p.second) {
	  op->mark_delayed("waiting for scrub");
	}

	// splice preserves the ops' relative order while moving them.
	waiting_for_scrub.splice(
	  waiting_for_scrub.begin(),
	  p.second,
	  p.second.begin(),
	  p.second.end());
      } else if (is_laggy()) {
	for (auto& op : p.second) {
	  op->mark_delayed("waiting for readable");
	}
	waiting_for_readable.splice(
	  waiting_for_readable.begin(),
	  p.second,
	  p.second.begin(),
	  p.second.end());
      } else {
	requeue_ops(p.second);
      }
    }
  }
}
1665
// Construct a PrimaryLogPG: builds the appropriate PG backend (replicated
// or erasure-coded, per the pool info and EC profile), sizes the object
// context cache from config, wires the backend's readability/recoverability
// predicates into the recovery state, starts the snap-trimmer state machine,
// and creates the scrubber instance for this PG.
PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
			   const PGPool &_pool,
			   const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  recovery_state.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();

  m_scrubber = make_unique<PrimaryLogScrub>(this);
}
1685
1686void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1687{
1688 src_oloc = oloc;
1689 if (oloc.key.empty())
1690 src_oloc.key = oid.name;
1691}
1692
1693void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1694{
9f95a23c
TL
1695 auto m = op->get_req<MOSDBackoff>();
1696 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7c673cae
FG
1697 if (!session)
1698 return; // drop it.
7c673cae
FG
1699 hobject_t begin = info.pgid.pgid.get_hobj_start();
1700 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1701 if (begin < m->begin) {
1702 begin = m->begin;
1703 }
1704 if (end > m->end) {
1705 end = m->end;
1706 }
1707 dout(10) << __func__ << " backoff ack id " << m->id
1708 << " [" << begin << "," << end << ")" << dendl;
1709 session->ack_backoff(cct, m->pgid, m->id, begin, end);
1710}
1711
/**
 * Top-level entry point for all messages routed to this PG.
 *
 * Gates the op through, in order: per-source map-epoch ordering, discard
 * checks, PG-wide client backoffs, peered/flushed state requirements, and
 * the PG backend; then dispatches by message type to the appropriate
 * handler (client ops, scan/backfill, scrub reservation/replica traffic,
 * log-missing updates).  Ops that cannot run yet are parked on the matching
 * waiting_for_* queue and replayed later.
 *
 * @param op      incoming request (client or peer OSD message)
 * @param handle  thread-pool handle for long-running handlers
 */
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
#ifdef HAVE_JAEGER
  if (op->osd_parent_span) {
    auto do_req_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
  }
#endif
// make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
	     << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
	     << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  int msg_type = m->get_type();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
    if (!session)
      return; // drop it.
    if (msg_type == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
				 info.pgid.pgid.get_hobj_start(), m)) {
	return;
      }

      // Back clients off while the PG cannot serve ops.
      bool backoff =
	is_down() ||
	is_incomplete() ||
	(!is_active() && is_peered());
      if (g_conf()->osd_backoff_on_peering && !backoff) {
	if (is_peering()) {
	  backoff = true;
	}
      }
      if (backoff) {
	add_pg_backoff(session);
	return;
      }
    }
    // pg backoff acks at pg-level
    if (msg_type == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
	handle_backoff(op);
	return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      ceph_assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (recovery_state.needs_flush()) {
    dout(20) << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  ceph_assert(is_peered() && !recovery_state.needs_flush());
  // Give the backend first crack at the message (e.g. replication traffic).
  if (pgbackend->handle_message(op))
    return;

  switch (msg_type) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (msg_type) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
	  !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
	osd->reply_op_error(op, -EOPNOTSUPP);
	return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      if (!m_scrubber) {
	osd->reply_op_error(op, -EAGAIN);
	return;
      }
      auto m = op->get_req<MOSDScrubReserve>();
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
	m_scrubber->handle_scrub_reserve_request(op);
	break;
      case MOSDScrubReserve::GRANT:
	m_scrubber->handle_scrub_reserve_grant(op, m->from);
	break;
      case MOSDScrubReserve::REJECT:
	m_scrubber->handle_scrub_reserve_reject(op, m->from);
	break;
      case MOSDScrubReserve::RELEASE:
	m_scrubber->handle_scrub_reserve_release(op);
	break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    ceph_abort_msg("bad message type in do_request");
  }
}
1891
7c673cae
FG
/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 *
 * Top-level entry point for a client CEPH_MSG_OSD_OP on this PG.  This is
 * a long admission-control pipeline: each stage either rejects the op
 * (reply_op_error / handle_misdirected_op), parks it on a wait list
 * (scrub, degraded, backoff, rw locks, ondisk), delegates it (pg ops,
 * manifest/cache handlers), or falls through to execute_ctx().  The order
 * of the checks is significant and must not be rearranged casually.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE(cct);
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();   // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  const hobject_t head = m->get_hobj().get_head();

  // Misdirected op: the hash of the head object must map into this PG for
  // the current pg_num.  A violation indicates a client/OSD map mismatch.
  if (!info.pgid.pgid.contains(
	info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
	 << head << " pg_num " << pool.info.get_pg_num() << " hash "
	 << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
		      << " op " << *m;
    ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  // If the client supports RADOS backoff we may silently park it instead
  // of queueing ops; that requires the Session from the connection.
  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  ceph::ref_t<Session> session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv().get());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }

    // Already backed off for this object range: drop; client will resend.
    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  // Classify the op (read/write/cache flags) against the current osdmap.
  {
    int r = op->maybe_init_op_info(*get_osdmap());
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
			 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_nonprimary())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  // Stretch-cluster laggy check; may drop the op for a later resend.
  if (!check_laggy(op)) {
    return;
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  // PG-wide ops (pgls etc.) are handled on a separate path.
  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op name is longer than "
	    << cct->_conf->osd_max_object_name_len
	    << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op locator is longer than "
	    << cct->_conf->osd_max_object_name_len
	    << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
    dout(4) << "do_op namespace is longer than "
	    << cct->_conf->osd_max_object_namespace_len
	    << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().oid.name.empty()) {
    dout(4) << "do_op empty oid name is not allowed" << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  // Let the object store veto names it cannot represent.
  if (int r = osd->store->validate_hobject_key(head)) {
    dout(4) << "do_op object " << head << " invalid for backing store: "
	    << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blocklisted?
  if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
    dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
    osd->reply_op_error(op, -EBLOCKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

  // discard due to cluster full transition? (we discard any op that
  // originates before the cluster or pool is marked full; the client
  // will resend after the full flag is removed or if they expect the
  // op to succeed despite being full). The except is FULL_FORCE and
  // FULL_TRY ops, which there is no reason to discard because they
  // bypass all full checks anyway. If this op isn't write or
  // read-ordered, we skip.
  // FIXME: we exclude mds writes for now.
  if (write_ordered && !(m->get_source().is_mds() ||
			 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
			 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
      info.history.last_epoch_marked_full > m->get_map_epoch()) {
    dout(10) << __func__ << " discarding op sent before full " << m << " "
	     << *m << dendl;
    return;
  }
  // mds should have stopped writing before this point.
  // We can't allow OSD to become non-startable even if mds
  // could be writing as part of file removals.
  if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
      !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
    dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
    return;
  }
  int64_t poolid = get_pgid().pool();
  if (op->may_write()) {

    const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
    if (!pi) {
      // pool vanished from the map; nothing sensible to reply.
      return;
    }

    // invalid?  Writes must target the head, never a clone/snap.
    if (m->get_snapid() != CEPH_NOSNAP) {
      dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
      osd->reply_op_error(op, -EINVAL);
      return;
    }

    // too big?
    if (cct->_conf->osd_max_write_size &&
	m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
      // journal can't hold commit!
      derr << "do_op msg data len " << m->get_data_len()
	   << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
	   << " on " << *m << dendl;
      osd->reply_op_error(op, -OSD_WRITETOOBIG);
      return;
    }
  }

  dout(10) << "do_op " << *m
	   << (op->may_write() ? " may_write" : "")
	   << (op->may_read() ? " may_read" : "")
	   << (op->may_cache() ? " may_cache" : "")
	   << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
	   << " flags " << ceph_osd_flag_string(m->get_flags())
	   << dendl;

#ifdef HAVE_JAEGER
  if (op->osd_parent_span) {
    auto do_op_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
  }
#endif
  // missing object?  Either back off the client or queue the op until
  // recovery makes the head readable.
  if (is_unreadable_object(head)) {
    if (!is_primary()) {
      osd->reply_op_error(op, -EAGAIN);
      return;
    }
    if (can_backoff &&
	(g_conf()->osd_backoff_on_degraded ||
	 (g_conf()->osd_backoff_on_unfound &&
	  recovery_state.get_missing_loc().is_unfound(head)))) {
      add_backoff(session, head, head);
      maybe_kick_recovery(head);
    } else {
      wait_for_unreadable_object(head, op);
    }
    return;
  }

  // Write-ordered ops have additional blockers: recovery, scrub,
  // laggy state, and snap/cache-full block lists.
  if (write_ordered) {
    // degraded object?
    if (is_degraded_or_backfilling_object(head)) {
      if (can_backoff && g_conf()->osd_backoff_on_degraded) {
	add_backoff(session, head, head);
	maybe_kick_recovery(head);
      } else {
	wait_for_degraded_object(head, op);
      }
      return;
    }

    if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
      dout(20) << __func__ << ": waiting for scrub" << dendl;
      waiting_for_scrub.push_back(op);
      op->mark_delayed("waiting for scrub");
      return;
    }
    if (!check_laggy_requeue(op)) {
      return;
    }

    // blocked on snap?
    if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
	blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
      hobject_t to_wait_on(head);
      to_wait_on.snap = blocked_iter->second;
      wait_for_degraded_object(to_wait_on, op);
      return;
    }
    if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
	blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
      wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
      return;
    }
    if (objects_blocked_on_cache_full.count(head)) {
      block_write_on_full_cache(head, op);
      return;
    }
  }

  // dup/resent?
  if (op->may_write() || op->may_cache()) {
    // warning: we will get back *a* request for this reqid, but not
    // necessarily the most recent. this happens with flush and
    // promote ops, but we can't possible have both in our log where
    // the original request is still not stable on disk, so for our
    // purposes here it doesn't matter which one we get.
    eversion_t version;
    version_t user_version;
    int return_code = 0;
    vector<pg_log_op_return_item_t> op_returns;
    bool got = check_in_progress_op(
      m->get_reqid(), &version, &user_version, &return_code, &op_returns);
    if (got) {
      dout(3) << __func__ << " dup " << m->get_reqid()
	      << " version " << version << dendl;
      if (already_complete(version)) {
	osd->reply_op_error(op, return_code, version, user_version, op_returns);
      } else {
	dout(10) << " waiting for " << version << " to commit" << dendl;
	// always queue ondisk waiters, so that we can requeue if needed
	waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
						 op_returns);
	op->mark_delayed("waiting for ondisk");
      }
      return;
    }
  }

  ObjectContextRef obc;
  bool can_create = op->may_write();
  hobject_t missing_oid;

  // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
  const hobject_t& oid =
    m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();

  // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
  for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
    OSDOp& osd_op = *p;

    if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
      if (m->get_snapid() != CEPH_SNAPDIR) {
	dout(10) << "LIST_SNAPS with incorrect context" << dendl;
	osd->reply_op_error(op, -EINVAL);
	return;
      }
    } else {
      if (m->get_snapid() == CEPH_SNAPDIR) {
	dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
	osd->reply_op_error(op, -EINVAL);
	return;
      }
    }
  }

  // io blocked on obc?
  if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
      maybe_await_blocked_head(oid, op)) {
    return;
  }

  // Replica reads are only served when the recovery state says the
  // object has no unstable (uncommitted) writes; otherwise bounce.
  if (!is_primary()) {
    if (!recovery_state.can_serve_replica_read(oid)) {
      dout(20) << __func__
	       << ": unstable write on replica, bouncing to primary "
	       << *m << dendl;
      osd->reply_op_error(op, -EAGAIN);
      return;
    }
    dout(20) << __func__ << ": serving replica read on oid " << oid
	     << dendl;
  }

  // Resolve the object context (possibly creating it for writes, or
  // mapping a snapid to the right clone).
  int r = find_object_context(
    oid, &obc, can_create,
    m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
    &missing_oid);

  // LIST_SNAPS needs the ssc too
  if (obc &&
      m->get_snapid() == CEPH_SNAPDIR &&
      !obc->ssc) {
    obc->ssc = get_snapset_context(oid, true);
  }

  if (r == -EAGAIN) {
    // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
    // we have to wait for the object.
    if (is_primary()) {
      // missing the specific snap we need; requeue and wait.
      ceph_assert(!op->may_write()); // only happens on a read/cache
      wait_for_unreadable_object(missing_oid, op);
      return;
    }
  } else if (r == 0) {
    if (is_unreadable_object(obc->obs.oi.soid)) {
      dout(10) << __func__ << ": clone " << obc->obs.oi.soid
	       << " is unreadable, waiting" << dendl;
      wait_for_unreadable_object(obc->obs.oi.soid, op);
      return;
    }

    // degraded object? (the check above was for head; this could be a clone)
    if (write_ordered &&
	obc->obs.oi.soid.snap != CEPH_NOSNAP &&
	is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
      dout(10) << __func__ << ": clone " << obc->obs.oi.soid
	       << " is degraded, waiting" << dendl;
      wait_for_degraded_object(obc->obs.oi.soid, op);
      return;
    }
  }

  // Cache-tier bookkeeping: record whether the object is in the current
  // hit set and insert this access (persisting the set when full/expired).
  bool in_hit_set = false;
  if (hit_set) {
    if (obc.get()) {
      if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
	in_hit_set = true;
    } else {
      if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
	in_hit_set = true;
    }
    if (!op->hitset_inserted) {
      hit_set->insert(oid);
      op->hitset_inserted = true;
      if (hit_set->is_full() ||
	  hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
	hit_set_persist();
      }
    }
  }

  if (agent_state) {
    if (agent_choose_mode(false, op))
      return;
  }

  // Manifest (dedup/tiering) objects may be proxied or promoted instead
  // of executed locally.
  if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
    if (recover_adjacent_clones(obc, op)) {
      return;
    }
    if (maybe_handle_manifest(op,
			       write_ordered,
			       obc))
    return;
  }

  // Cache-tier dispatch: proxy, redirect, promote, or fall through (NOOP).
  if (maybe_handle_cache(op,
			 write_ordered,
			 obc,
			 r,
			 missing_oid,
			 false,
			 in_hit_set))
    return;

  if (r && (r != -ENOENT || !obc)) {
    // copy the reqids for copy get on ENOENT
    if (r == -ENOENT &&
	(m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
      fill_in_copy_get_noent(op, oid, m->ops[0]);
      return;
    }
    dout(20) << __func__ << ": find_object_context got error " << r << dendl;
    // Since kraken, write errors are recorded in the pg log so dup
    // detection keeps working across restarts.
    if (op->may_write() &&
	get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
      record_write_error(op, oid, nullptr, r);
    } else {
      osd->reply_op_error(op, r);
    }
    return;
  }

  // make sure locator is consistent
  object_locator_t oloc(obc->obs.oi.soid);
  if (m->get_object_locator() != oloc) {
    dout(10) << " provided locator " << m->get_object_locator()
	     << " != object's " << obc->obs.oi.soid << dendl;
    osd->clog->warn() << "bad locator " << m->get_object_locator()
		      << " on object " << oloc
		      << " op " << *m;
  }

  // io blocked on obc?
  if (obc->is_blocked() &&
      !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
    wait_for_blocked_object(obc->obs.oi.soid, op);
    return;
  }

  dout(25) << __func__ << " oi " << obc->obs.oi << dendl;

  // From here on, ctx owns the op; error exits must go through
  // reply_ctx/close_op_ctx rather than reply_op_error alone.
  OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);

  if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
    dout(20) << __func__ << ": skipping rw locks" << dendl;
  } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
    dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;

    // verify there is in fact a flush in progress
    // FIXME: we could make this a stronger test.
    map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
    if (p == flush_ops.end()) {
      dout(10) << __func__ << " no flush in progress, aborting" << dendl;
      reply_ctx(ctx, -EINVAL);
      return;
    }
  } else if (!get_rw_locks(write_ordered, ctx)) {
    dout(20) << __func__ << " waiting for rw locks " << dendl;
    op->mark_delayed("waiting for rw locks");
    close_op_ctx(ctx);
    return;
  }
  dout(20) << __func__ << " obc " << *obc << dendl;

  if (r) {
    dout(20) << __func__ << " returned an error: " << r << dendl;
    if (op->may_write() &&
	get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
      record_write_error(op, oid, nullptr, r,
			 ctx->op->allows_returnvec() ? ctx : nullptr);
    } else {
      osd->reply_op_error(op, r);
    }
    close_op_ctx(ctx);
    return;
  }

  if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
    ctx->ignore_cache = true;
  }

  if ((op->may_read()) && (obc->obs.oi.is_lost())) {
    // This object is lost. Reading from it returns an error.
    dout(20) << __func__ << ": object " << obc->obs.oi.soid
	     << " is lost" << dendl;
    reply_ctx(ctx, -ENFILE);
    return;
  }
  if (!op->may_write() &&
      !op->may_cache() &&
      (!obc->obs.exists ||
       ((m->get_snapid() != CEPH_SNAPDIR) &&
	obc->obs.oi.is_whiteout()))) {
    // copy the reqids for copy get on ENOENT
    if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
      fill_in_copy_get_noent(op, oid, m->ops[0]);
      close_op_ctx(ctx);
      return;
    }
    reply_ctx(ctx, -ENOENT);
    return;
  }

  op->mark_started();

  // All gates passed: execute and record prepare latency per op class.
  execute_ctx(ctx);
  utime_t prepare_latency = ceph_clock_now();
  prepare_latency -= op->get_dequeued_time();
  osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
  if (op->may_read() && op->may_write()) {
    osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
  } else if (op->may_read()) {
    osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
  } else if (op->may_write() || op->may_cache()) {
    osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
  }

  // force recovery of the oldest missing object if too many logs
  maybe_force_recovery();
}
b32b8144 2423
31f18b77
FG
/**
 * Decide how to service an op that targets a manifest (dedup/tiering)
 * object.
 *
 * Returns NOOP when the op should continue down the normal do_op path,
 * HANDLED_PROXY when it was proxied to the backing tier, and
 * BLOCKED_RECOVERY / BLOCKED_PROMOTE when the op was queued and will be
 * requeued later.  Manifest-management ops themselves (SET_REDIRECT,
 * SET_CHUNK, TIER_*) are always executed locally (NOOP).
 */
PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
  OpRequestRef op,
  bool write_ordered,
  ObjectContextRef obc)
{
  ceph_assert(obc);
  if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
    dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
    return cache_result_t::NOOP;
  }

  // if it is write-ordered and blocked, stop now
  if (obc->is_blocked() && write_ordered) {
    // we're already doing something with this object
    dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
    return cache_result_t::NOOP;
  }

  // Any manifest-management sub-op means the whole message is executed
  // locally rather than proxied/promoted.
  vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    ceph_osd_op& op = osd_op.op;  // NB: shadows the outer OpRequestRef 'op'
    if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
	op.op == CEPH_OSD_OP_SET_CHUNK ||
	op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
	op.op == CEPH_OSD_OP_TIER_PROMOTE ||
	op.op == CEPH_OSD_OP_TIER_FLUSH ||
	op.op == CEPH_OSD_OP_TIER_EVICT) {
      return cache_result_t::NOOP;
    }
  }

  switch (obc->obs.oi.manifest.type) {
  case object_manifest_t::TYPE_REDIRECT:
    if (op->may_write() || write_ordered) {
      do_proxy_write(op, obc);
    } else {
      // promoted object
      if (obc->obs.oi.size != 0) {
	// non-zero size here means the object was already promoted;
	// serve it locally instead of proxying the read.
	return cache_result_t::NOOP;
      }
      do_proxy_read(op, obc);
    }
    return cache_result_t::HANDLED_PROXY;
  case object_manifest_t::TYPE_CHUNKED:
    {
      // Prefer proxying chunked reads when possible; an in-flight flush
      // forces the write-ordered variant of the proxy op.
      if (can_proxy_chunked_read(op, obc)) {
	map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
	if (p != flush_ops.end()) {
	  do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
	  return cache_result_t::HANDLED_PROXY;
	}
	do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
	return cache_result_t::HANDLED_PROXY;
      }

      MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
      ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
      hobject_t head = m->get_hobj();

      // Cannot proxy: the op must run locally, so apply the same
      // recovery/scrub/laggy gates as the normal write path.
      if (is_degraded_or_backfilling_object(head)) {
	dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
	wait_for_degraded_object(head, op);
	return cache_result_t::BLOCKED_RECOVERY;
      }

      if (m_scrubber->write_blocked_by_scrub(head)) {
	dout(20) << __func__ << ": waiting for scrub" << dendl;
	waiting_for_scrub.push_back(op);
	op->mark_delayed("waiting for scrub");
	return cache_result_t::BLOCKED_RECOVERY;
      }
      if (!check_laggy_requeue(op)) {
	return cache_result_t::BLOCKED_RECOVERY;
      }

      // Any missing chunk forces a full promotion before the op can run.
      for (auto& p : obc->obs.oi.manifest.chunk_map) {
	if (p.second.is_missing()) {
	  auto m = op->get_req<MOSDOp>();
	  const object_locator_t oloc = m->get_object_locator();
	  promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
	  return cache_result_t::BLOCKED_PROMOTE;
	}
      }
      return cache_result_t::NOOP;
    }
  default:
    ceph_abort_msg("unrecognized manifest type");
  }

  return cache_result_t::NOOP;
}
7c673cae
FG
2516
/**
 * Persist a failed write as an ERROR entry in the pg log so that dup
 * detection (check_in_progress_op) returns the same error if the client
 * resends, then reply to the client once the log entry commits.
 *
 * @param op              the failed client write (must be may_write())
 * @param soid            object the error applies to
 * @param orig_reply      reply to send on commit; ownership of its ref is
 *                        taken over by the completion (may be nullptr, in
 *                        which case submit_log_entries builds the reply)
 * @param r               negative errno recorded in the log entry
 * @param ctx_for_op_returns  when non-null, per-op return values are
 *                        copied into the log entry (returnvec support)
 */
void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
				      MOSDOpReply *orig_reply, int r,
				      OpContext *ctx_for_op_returns)
{
  dout(20) << __func__ << " r=" << r << dendl;
  ceph_assert(op->may_write());
  const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
  mempool::osd_pglog::list<pg_log_entry_t> entries;
  entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
				   get_next_version(), eversion_t(), 0,
				   reqid, utime_t(), r));
  if (ctx_for_op_returns) {
    entries.back().set_op_returns(*ctx_for_op_returns->ops);
    dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
  }

  // Completion functor invoked when the log entry is durable; it owns the
  // reply via intrusive_ptr (ref taken over, not added) and releases it
  // with detach() when handing it to the messenger.
  struct OnComplete {
    PrimaryLogPG *pg;
    OpRequestRef op;
    boost::intrusive_ptr<MOSDOpReply> orig_reply;
    int r;
    OnComplete(
      PrimaryLogPG *pg,
      OpRequestRef op,
      MOSDOpReply *orig_reply,
      int r)
      : pg(pg), op(op),
	orig_reply(orig_reply, false /* take over ref */), r(r)
      {}
    void operator()() {
      ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
      auto m = op->get_req<MOSDOp>();
      MOSDOpReply *reply = orig_reply.detach();
      ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
      pg->osd->send_message_osd_client(reply, m->get_connection());
    }
  };

  ObcLockManager lock_manager;
  submit_log_entries(
    entries,
    std::move(lock_manager),
    std::optional<std::function<void(void)> >(
      OnComplete(this, op, orig_reply, r)),
    op,
    r);
}
2564
/**
 * Cache-tier dispatch for a client op.
 *
 * Depending on the pool's cache mode and the object's presence in the
 * cache tier, the op is either executed locally (NOOP), proxied to the
 * base tier (HANDLED_PROXY), redirected back to the client
 * (HANDLED_REDIRECT), blocked behind a promotion or a full cache
 * (BLOCKED_PROMOTE / BLOCKED_FULL), or bounced with EAGAIN on a
 * non-primary (REPLIED_WITH_EAGAIN).
 *
 * @param r           result of find_object_context for this op
 * @param missing_oid the oid find_object_context reported missing (may be
 *                    empty; defaulted to obc's soid below if so)
 * @param must_promote  force promotion rather than proxying
 * @param in_hit_set  whether the object is in the current hit set
 * @param promote_obc optional out: the obc being promoted
 */
PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
  OpRequestRef op,
  bool write_ordered,
  ObjectContextRef obc,
  int r, hobject_t missing_oid,
  bool must_promote,
  bool in_hit_set,
  ObjectContextRef *promote_obc)
{
  // return quickly if caching is not enabled
  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
    return cache_result_t::NOOP;

  if (op &&
      op->get_req() &&
      op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
      (op->get_req<MOSDOp>()->get_flags() &
       CEPH_OSD_FLAG_IGNORE_CACHE)) {
    dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
    return cache_result_t::NOOP;
  }

  must_promote = must_promote || op->need_promote();

  if (obc)
    dout(25) << __func__ << " " << obc->obs.oi << " "
	     << (obc->obs.exists ? "exists" : "DNE")
	     << " missing_oid " << missing_oid
	     << " must_promote " << (int)must_promote
	     << " in_hit_set " << (int)in_hit_set
	     << dendl;
  else
    dout(25) << __func__ << " (no obc)"
	     << " missing_oid " << missing_oid
	     << " must_promote " << (int)must_promote
	     << " in_hit_set " << (int)in_hit_set
	     << dendl;

  // if it is write-ordered and blocked, stop now
  if (obc.get() && obc->is_blocked() && write_ordered) {
    // we're already doing something with this object
    dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
    return cache_result_t::NOOP;
  }

  if (r == -ENOENT && missing_oid == hobject_t()) {
    // we know this object is logically absent (e.g., an undefined clone)
    return cache_result_t::NOOP;
  }

  if (obc.get() && obc->obs.exists) {
    // cache hit: serve locally
    osd->logger->inc(l_osd_op_cache_hit);
    return cache_result_t::NOOP;
  }
  if (!is_primary()) {
    dout(20) << __func__ << " cache miss; ask the primary" << dendl;
    osd->reply_op_error(op, -EAGAIN);
    return cache_result_t::REPLIED_WITH_EAGAIN;
  }

  if (missing_oid == hobject_t() && obc.get()) {
    missing_oid = obc->obs.oi.soid;
  }

  auto m = op->get_req<MOSDOp>();
  const object_locator_t oloc = m->get_object_locator();

  if (op->need_skip_handle_cache()) {
    return cache_result_t::NOOP;
  }

  OpRequestRef promote_op;

  switch (pool.info.cache_mode) {
  case pg_pool_t::CACHEMODE_WRITEBACK:
    // Full cache in eviction mode: reads may still be proxied, but
    // writes must wait for space.
    if (agent_state &&
	agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
      if (!op->may_write() && !op->may_cache() &&
	  !write_ordered && !must_promote) {
	dout(20) << __func__ << " cache pool full, proxying read" << dendl;
	do_proxy_read(op);
	return cache_result_t::HANDLED_PROXY;
      }
      dout(20) << __func__ << " cache pool full, waiting" << dendl;
      block_write_on_full_cache(missing_oid, op);
      return cache_result_t::BLOCKED_FULL;
    }

    // Without a hit set there is no recency data, so promote eagerly.
    if (must_promote || (!hit_set && !op->need_skip_promote())) {
      promote_object(obc, missing_oid, oloc, op, promote_obc);
      return cache_result_t::BLOCKED_PROMOTE;
    }

    if (op->may_write() || op->may_cache()) {
      do_proxy_write(op);

      // Promote too?
      if (!op->need_skip_promote() &&
	  maybe_promote(obc, missing_oid, oloc, in_hit_set,
			pool.info.min_write_recency_for_promote,
			OpRequestRef(),
			promote_obc)) {
	return cache_result_t::BLOCKED_PROMOTE;
      }
      return cache_result_t::HANDLED_PROXY;
    } else {
      do_proxy_read(op);

      // Avoid duplicate promotion
      if (obc.get() && obc->is_blocked()) {
	if (promote_obc)
	  *promote_obc = obc;
	return cache_result_t::BLOCKED_PROMOTE;
      }

      // Promote too?
      if (!op->need_skip_promote()) {
	(void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
			    pool.info.min_read_recency_for_promote,
			    promote_op, promote_obc);
      }

      return cache_result_t::HANDLED_PROXY;
    }
    ceph_abort_msg("unreachable");
    return cache_result_t::NOOP;

  case pg_pool_t::CACHEMODE_READONLY:
    // TODO: clean this case up
    if (!obc.get() && r == -ENOENT) {
      // we don't have the object and op's a read
      promote_object(obc, missing_oid, oloc, op, promote_obc);
      return cache_result_t::BLOCKED_PROMOTE;
    }
    if (!r) { // it must be a write
      do_cache_redirect(op);
      return cache_result_t::HANDLED_REDIRECT;
    }
    // crap, there was a failure of some kind
    return cache_result_t::NOOP;

  case pg_pool_t::CACHEMODE_FORWARD:
    // this mode is deprecated; proxy instead
  case pg_pool_t::CACHEMODE_PROXY:
    if (!must_promote) {
      if (op->may_write() || op->may_cache() || write_ordered) {
	do_proxy_write(op);
	return cache_result_t::HANDLED_PROXY;
      } else {
	do_proxy_read(op);
	return cache_result_t::HANDLED_PROXY;
      }
    }
    // ugh, we're forced to promote.
    if (agent_state &&
	agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
      dout(20) << __func__ << " cache pool full, waiting" << dendl;
      block_write_on_full_cache(missing_oid, op);
      return cache_result_t::BLOCKED_FULL;
    }
    promote_object(obc, missing_oid, oloc, op, promote_obc);
    return cache_result_t::BLOCKED_PROMOTE;

  case pg_pool_t::CACHEMODE_READFORWARD:
    // this mode is deprecated; proxy instead
  case pg_pool_t::CACHEMODE_READPROXY:
    // Do writeback to the cache tier for writes
    if (op->may_write() || write_ordered || must_promote) {
      if (agent_state &&
	  agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
	dout(20) << __func__ << " cache pool full, waiting" << dendl;
	block_write_on_full_cache(missing_oid, op);
	return cache_result_t::BLOCKED_FULL;
      }
      promote_object(obc, missing_oid, oloc, op, promote_obc);
      return cache_result_t::BLOCKED_PROMOTE;
    }

    // If it is a read, we can read, we need to proxy it
    do_proxy_read(op);
    return cache_result_t::HANDLED_PROXY;

  default:
    ceph_abort_msg("unrecognized cache_mode");
  }
  return cache_result_t::NOOP;
}
2752
2753bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2754 const hobject_t& missing_oid,
2755 const object_locator_t& oloc,
2756 bool in_hit_set,
2757 uint32_t recency,
2758 OpRequestRef promote_op,
2759 ObjectContextRef *promote_obc)
2760{
2761 dout(20) << __func__ << " missing_oid " << missing_oid
2762 << " in_hit_set " << in_hit_set << dendl;
2763
2764 switch (recency) {
2765 case 0:
2766 break;
2767 case 1:
2768 // Check if in the current hit set
2769 if (in_hit_set) {
2770 break;
2771 } else {
2772 // not promoting
2773 return false;
2774 }
2775 break;
2776 default:
2777 {
2778 unsigned count = (int)in_hit_set;
2779 if (count) {
2780 // Check if in other hit sets
2781 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2782 for (map<time_t,HitSetRef>::reverse_iterator itor =
2783 agent_state->hit_set_map.rbegin();
2784 itor != agent_state->hit_set_map.rend();
2785 ++itor) {
2786 if (!itor->second->contains(oid)) {
2787 break;
2788 }
2789 ++count;
2790 if (count >= recency) {
2791 break;
2792 }
2793 }
2794 }
2795 if (count >= recency) {
2796 break;
2797 }
2798 return false; // not promoting
2799 }
2800 break;
2801 }
2802
2803 if (osd->promote_throttle()) {
2804 dout(10) << __func__ << " promote throttled" << dendl;
2805 return false;
2806 }
2807 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2808 return true;
2809}
2810
2811void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2812{
9f95a23c 2813 auto m = op->get_req<MOSDOp>();
7c673cae 2814 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
11fdf7f2
TL
2815 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2816 flags, false);
7c673cae
FG
2817 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2818 reply->set_redirect(redir);
2819 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2820 << op << dendl;
2821 m->get_connection()->send_message(reply);
2822 return;
2823}
2824
/**
 * Completion for a read proxied to the base tier via the objecter.
 * On finish, takes the PG lock and hands the result to
 * finish_proxy_read(), unless the proxy op was canceled or the PG went
 * through a peering reset since the op was issued.
 */
struct C_ProxyRead : public Context {
  PrimaryLogPG::cPrimaryLogPGRef pg;
  hobject_t oid;
  epoch_t last_peering_reset;  // peering epoch when the proxy op started
  ceph_tid_t tid;              // objecter tid; filled in after submission
  PrimaryLogPG::ProxyReadOpRef prdop;
  utime_t start;               // for the l_osd_tier_r_lat latency counter
  C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
	      const PrimaryLogPG::ProxyReadOpRef& prd)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), prdop(prd), start(ceph_clock_now())
  {}
  void finish(int r) override {
    // cheap unlocked check first, then re-check under the PG lock to
    // close the race with cancellation.
    if (prdop->canceled)
      return;
    std::scoped_lock locker{*pg};
    if (prdop->canceled) {
      return;
    }
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->finish_proxy_read(oid, tid, r);
      pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
    }
  }
};
2850
11fdf7f2
TL
/**
 * Completion for a single-chunk read proxied to the base tier for a
 * TYPE_CHUNKED manifest object.  On success it splices the chunk's data
 * (obj_op->ops[0].outdata) into the client-visible OSDOp output buffer
 * at the right offset, then completes via finish_proxy_read().
 *
 * op_index / req_offset / req_total_len / obj_op / obc are filled in by
 * the issuer (do_proxy_chunked_op path) after construction.
 */
struct C_ProxyChunkRead : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;
  epoch_t last_peering_reset;  // peering epoch when the proxy op started
  ceph_tid_t tid;              // objecter tid; filled in after submission
  PrimaryLogPG::ProxyReadOpRef prdop;
  utime_t start;               // for the l_osd_tier_r_lat latency counter
  ObjectOperation *obj_op;     // owned here; deleted in finish() on the
			       // completion path
  int op_index = 0;            // index into prdop->ops being satisfied
  uint64_t req_offset = 0;     // offset of this chunk within the client read
  ObjectContextRef obc;
  uint64_t req_total_len = 0;  // total length of the client read
  C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
		   const PrimaryLogPG::ProxyReadOpRef& prd)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
  {}
  void finish(int r) override {
    // cheap unlocked check first, then re-check under the PG lock to
    // close the race with cancellation.
    // NOTE(review): on the canceled / peering-reset early-return paths
    // obj_op is not deleted here — presumably cleanup happens in the
    // cancel path that set prdop->canceled; verify, else this leaks.
    if (prdop->canceled)
      return;
    std::scoped_lock locker{*pg};
    if (prdop->canceled) {
      return;
    }
    if (last_peering_reset == pg->get_last_peering_reset()) {
      if (r >= 0) {
	// lazily allocate the full-size output buffer on first chunk
	if (!prdop->ops[op_index].outdata.length()) {
	  ceph_assert(req_total_len);
	  bufferlist list;
	  bufferptr bptr(req_total_len);
	  list.push_back(std::move(bptr));
	  prdop->ops[op_index].outdata.append(list);
	}
	ceph_assert(obj_op);
	// place the chunk at its offset relative to the op's extent start
	uint64_t copy_offset;
	if (req_offset >= prdop->ops[op_index].op.extent.offset) {
	  copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
	} else {
	  copy_offset = 0;
	}
	prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
	  obj_op->ops[0].outdata.length(),
	  obj_op->ops[0].outdata.c_str());
      }

      pg->finish_proxy_read(oid, tid, r);
      pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
      if (obj_op) {
	delete obj_op;
      }
    }
  }
};
2904
// Forward a client read to the backing tier (or a manifest redirect
// target) instead of servicing it locally; the reply is produced later by
// finish_proxy_read() via C_ProxyRead.
void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
{
  // NOTE: non-const here because the ProxyReadOp needs mutable refs to
  // stash the result in the request's OSDOp vector
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  object_locator_t oloc;
  hobject_t soid;
  /* extensible tier */
  if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
    // Manifest object: read from the redirect target.
    switch (obc->obs.oi.manifest.type) {
      case object_manifest_t::TYPE_REDIRECT:
	  oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
	  soid = obc->obs.oi.manifest.redirect_target;
	  break;
      default:
	ceph_abort_msg("unrecognized manifest type");
    }
  } else {
  /* proxy */
    // Cache-tier proxy: same object name, but in the base (tier_of) pool.
    soid = m->get_hobj();
    oloc = object_locator_t(m->get_object_locator());
    oloc.pool = pool.info.tier_of;
  }
  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;

  // pass through some original flags that make sense.
  //  - leave out redirection and balancing flags since we are
  //    already proxying through the primary
  //  - leave off read/write/exec flags that are derived from the op
  flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
			     CEPH_OSD_FLAG_ORDERSNAP |
			     CEPH_OSD_FLAG_ENFORCE_SNAPC |
			     CEPH_OSD_FLAG_MAP_SNAP_CLONE);

  dout(10) << __func__ << " Start proxy read for " << *m << dendl;

  ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));

  ObjectOperation obj_op;
  obj_op.dup(prdop->ops);

  // In writeback mode (and not in full-evict), rewrite fadvise flags on
  // the read ops: force SEQUENTIAL, clear DONTNEED/NOCACHE.
  if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
      (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
    for (unsigned i = 0; i < obj_op.ops.size(); i++) {
      ceph_osd_op op = obj_op.ops[i].op;
      switch (op.op) {
	case CEPH_OSD_OP_READ:
	case CEPH_OSD_OP_SYNC_READ:
	case CEPH_OSD_OP_SPARSE_READ:
	case CEPH_OSD_OP_CHECKSUM:
	case CEPH_OSD_OP_CMPEXT:
	  op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
		       ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
      }
    }
  }

  C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
				     prdop);
  ceph_tid_t tid = osd->objecter->read(
    soid.oid, oloc, obj_op,
    m->get_snapid(), NULL,
    flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
    &prdop->user_version,
    &prdop->data_offset,
    m->get_features());
  fin->tid = tid;
  prdop->objecter_tid = tid;
  // Track the in-flight read so it can be completed or cancelled later.
  proxyread_ops[tid] = prdop;
  in_progress_proxy_ops[soid].push_back(op);
}
2976
// Completion path for a proxied read: match the objecter reply back to
// its ProxyReadOp, drop the tracking state, and reply to the client once
// all pieces of the request have completed.
void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;

  map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
  if (p == proxyread_ops.end()) {
    // Already cancelled/erased; nothing to do.
    dout(10) << __func__ << " no proxyread_op found" << dendl;
    return;
  }
  ProxyReadOpRef prdop = p->second;
  if (tid != prdop->objecter_tid) {
    dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
	     << " tid " << prdop->objecter_tid << dendl;
    return;
  }
  if (oid != prdop->soid) {
    dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
	     << " soid " << prdop->soid << dendl;
    return;
  }
  proxyread_ops.erase(tid);

  map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
  if (q == in_progress_proxy_ops.end()) {
    dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
    return;
  }
  ceph_assert(q->second.size());
  list<OpRequestRef>::iterator it = std::find(q->second.begin(),
					      q->second.end(),
					      prdop->op);
  ceph_assert(it != q->second.end());
  OpRequestRef op = *it;
  q->second.erase(it);
  if (q->second.size() == 0) {
    in_progress_proxy_ops.erase(oid);
  } else if (std::find(q->second.begin(),
		       q->second.end(),
		       prdop->op) != q->second.end()) {
    /* multiple read case */
    // The same client op still has other proxied (chunked) reads pending;
    // defer the reply until the last one completes.
    dout(20) << __func__ << " " << oid << " is not completed " << dendl;
    return;
  }

  osd->logger->inc(l_osd_tier_proxy_read);

  // Build a read-only OpContext carrying the proxied result back.
  auto m = op->get_req<MOSDOp>();
  OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
  ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
  ctx->user_at_version = prdop->user_version;
  ctx->data_off = prdop->data_offset;
  ctx->ignore_log_op_stats = true;
  complete_read_ctx(r, ctx);
}
3032
3033void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3034{
3035 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3036 if (p == in_progress_proxy_ops.end())
3037 return;
3038
3039 list<OpRequestRef>& ls = p->second;
3040 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3041 requeue_ops(ls);
3042 in_progress_proxy_ops.erase(p);
3043}
3044
94b18763
FG
3045void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3046 vector<ceph_tid_t> *tids)
7c673cae
FG
3047{
3048 dout(10) << __func__ << " " << prdop->soid << dendl;
3049 prdop->canceled = true;
3050
3051 // cancel objecter op, if we can
3052 if (prdop->objecter_tid) {
94b18763 3053 tids->push_back(prdop->objecter_tid);
7c673cae
FG
3054 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3055 prdop->ops[i].outdata.clear();
3056 }
3057 proxyread_ops.erase(prdop->objecter_tid);
3058 prdop->objecter_tid = 0;
3059 }
3060}
3061
94b18763 3062void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
3063{
3064 dout(10) << __func__ << dendl;
3065
3066 // cancel proxy reads
3067 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3068 while (p != proxyread_ops.end()) {
94b18763 3069 cancel_proxy_read((p++)->second, tids);
7c673cae
FG
3070 }
3071
3072 // cancel proxy writes
3073 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3074 while (q != proxywrite_ops.end()) {
94b18763 3075 cancel_proxy_write((q++)->second, tids);
7c673cae
FG
3076 }
3077
3078 if (requeue) {
3079 map<hobject_t, list<OpRequestRef>>::iterator p =
3080 in_progress_proxy_ops.begin();
3081 while (p != in_progress_proxy_ops.end()) {
3082 list<OpRequestRef>& ls = p->second;
3083 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3084 << " requests" << dendl;
3085 requeue_ops(ls);
3086 in_progress_proxy_ops.erase(p++);
3087 }
3088 } else {
3089 in_progress_proxy_ops.clear();
3090 }
3091}
3092
// Objecter commit completion for a proxied write; mirrors C_ProxyRead's
// cancel/peering-reset checks before handing off to finish_proxy_write().
struct C_ProxyWrite_Commit : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;
  epoch_t last_peering_reset;  // epoch captured when the write was issued
  ceph_tid_t tid;
  PrimaryLogPG::ProxyWriteOpRef pwop;
  C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
		      const PrimaryLogPG::ProxyWriteOpRef& pw)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), pwop(pw)
  {}
  void finish(int r) override {
    // Unlocked fast-path check, then re-check under the PG lock.
    if (pwop->canceled)
      return;
    std::scoped_lock locker{*pg};
    if (pwop->canceled) {
      return;
    }
    // Ignore stale completions from before a peering reset.
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->finish_proxy_write(oid, tid, r);
    }
  }
};
3116
// Forward a client write to the backing tier (or a manifest redirect
// target); the commit reply is sent by finish_proxy_write() once the
// objecter mutate commits.
void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
{
  // NOTE: non-const because ProxyWriteOp takes a mutable ref
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  object_locator_t oloc;
  SnapContext snapc(m->get_snap_seq(), m->get_snaps());
  hobject_t soid;
  /* extensible tier */
  if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
    // Manifest object: write through to the redirect target.
    switch (obc->obs.oi.manifest.type) {
      case object_manifest_t::TYPE_REDIRECT:
	  oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
	  soid = obc->obs.oi.manifest.redirect_target;
	  break;
      default:
	ceph_abort_msg("unrecognized manifest type");
    }
  } else {
  /* proxy */
    // Cache-tier proxy: same object name, but in the base (tier_of) pool.
    soid = m->get_hobj();
    oloc = object_locator_t(m->get_object_locator());
    oloc.pool = pool.info.tier_of;
  }

  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
  if (!(op->may_write() || op->may_cache())) {
    flags |= CEPH_OSD_FLAG_RWORDERED;
  }
  if (op->allows_returnvec()) {
    flags |= CEPH_OSD_FLAG_RETURNVEC;
  }

  dout(10) << __func__ << " Start proxy write for " << *m << dendl;

  ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
  pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
  pwop->mtime = m->get_mtime();

  ObjectOperation obj_op;
  obj_op.dup(pwop->ops);

  C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
      this, soid, get_last_peering_reset(), pwop);
  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, oloc, obj_op, snapc,
    ceph::real_clock::from_ceph_timespec(pwop->mtime),
    flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
    &pwop->user_version, pwop->reqid);
  fin->tid = tid;
  pwop->objecter_tid = tid;
  // Track the in-flight write so it can be completed or cancelled later.
  proxywrite_ops[tid] = pwop;
  in_progress_proxy_ops[soid].push_back(op);
}
3170
// Split each read op in the request along the object's manifest chunk
// boundaries and issue one proxied read per chunk via
// do_proxy_chunked_read(); C_ProxyChunkRead reassembles the pieces.
// NOTE(review): missing_oid is currently unused here — confirm intent.
void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
				       ObjectContextRef obc, bool write_ordered)
{
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  OSDOp *osd_op = NULL;
  for (unsigned int i = 0; i < m->ops.size(); i++) {
    osd_op = &m->ops[i];
    uint64_t cursor = osd_op->op.extent.offset;
    uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
    uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
    object_manifest_t *manifest = &obc->obs.oi.manifest;
    // cursor offset -> (chunk_index -> length to read from that chunk)
    map <uint64_t, map<uint64_t, uint64_t>> chunk_read;

    while (cursor < op_length) {
      chunk_index = 0;
      chunk_length = 0;
      /* find the right chunk position for cursor */
      for (auto &p : manifest->chunk_map) {
	if (p.first <= cursor && p.first + p.second.length > cursor) {
	  chunk_length = p.second.length;
	  chunk_index = p.first;
	  break;
	}
      }
      /* no index */
      if (!chunk_index && !chunk_length) {
	// No chunk covers the cursor.  If nothing has been scheduled for
	// this op yet, reply with an empty read; either way stop scanning.
	if (cursor == osd_op->op.extent.offset) {
	  OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
	  ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
	  ctx->data_off = osd_op->op.extent.offset;
	  ctx->ignore_log_op_stats = true;
	  complete_read_ctx(0, ctx);
	}
	break;
      }
      uint64_t next_length = chunk_length;
      /* the size to read -> | op length | */
      /*                     | a chunk | */
      // Clamp to the end of the requested extent...
      if (cursor + next_length > op_length) {
	next_length = op_length - cursor;
      }
      /* the size to read -> | op length | */
      /*                     |    a chunk | */
      // ...and to the end of the current chunk.
      if (cursor + next_length > chunk_index + chunk_length) {
	next_length = chunk_index + chunk_length - cursor;
      }

      chunk_read[cursor] = {{chunk_index, next_length}};
      cursor += next_length;
    }

    // Total bytes actually scheduled for this op.
    req_len = cursor - osd_op->op.extent.offset;
    for (auto &p : chunk_read) {
      auto chunks = p.second.begin();
      dout(20) << __func__ << " chunk_index: " << chunks->first
	       << " next_length: " << chunks->second << " cursor: "
	       << p.first << dendl;
      do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
    }
  }
}
3232
// Resumes a client op that was blocked while a manifest refcount change
// was in flight: on success re-executes the op's context, on error
// replies with the error (or requeues on cancel, if requested).
struct RefCountCallback : public Context {
public:
  PrimaryLogPG::OpContext *ctx;
  OSDOp& osd_op;
  // When completed with -ECANCELED, requeue the op instead of dropping it.
  bool requeue = false;

  RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {}
  void finish(int r) override {
    // NB: caller must already have pg->lock held
    // Unblock the object and wake anything queued behind it.
    ctx->obc->stop_block();
    ctx->pg->kick_object_context_blocked(ctx->obc);
    if (r >= 0) {
      osd_op.rval = 0;
      ctx->pg->execute_ctx(ctx);
    } else {
      // on cancel simply toss op out,
      // or requeue as requested
      if (r != -ECANCELED) {
        if (ctx->op)
	  ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (requeue) {
        if (ctx->op)
	  ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }
  void set_requeue(bool rq) {
    requeue = rq;
  }
};
3265
3266struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3267 OSDOp& osd_op;
3268
3269 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3270 }
3271
3272 int execute() override {
3273 return osd_op.rval;
3274 }
3275};
3276
// Completion for the refcount mutate issued by set-manifest: under the PG
// lock, forwards the result to the pending ManifestOp callback and drops
// the manifest_ops tracking entry.
struct C_SetManifestRefCountDone : public Context {
  PrimaryLogPGRef pg;
  PrimaryLogPG::ManifestOpRef mop;
  hobject_t soid;
  C_SetManifestRefCountDone(PrimaryLogPG *p,
    PrimaryLogPG::ManifestOpRef mop, hobject_t soid) :
	pg(p), mop(mop), soid(soid) {}
  void finish(int r) override {
    // Cancelled ops are completed separately by cancel_manifest_ops().
    if (r == -ECANCELED)
      return;
    std::scoped_lock locker{*pg};
    auto it = pg->manifest_ops.find(soid);
    if (it == pg->manifest_ops.end()) {
      // raced with cancel_manifest_ops
      return;
    }
    if (it->second->cb) {
      it->second->cb->complete(r);
    }
    pg->manifest_ops.erase(it);
    // Drop our ref to the ManifestOp now that tracking is gone.
    mop.reset();
  }
};
3300
// Completion for a dedup chunk flush: hands the result for one chunk
// (identified by its offset) back to finish_set_dedup() under the PG lock.
struct C_SetDedupChunks : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;
  epoch_t last_peering_reset;  // epoch captured when the op was issued
  ceph_tid_t tid;
  uint64_t offset;             // chunk offset this completion refers to

  C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), offset(offset)
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    std::scoped_lock locker{*pg};
    // Ignore stale completions from before a peering reset.
    if (last_peering_reset != pg->get_last_peering_reset()) {
      return;
    }
    pg->finish_set_dedup(oid, r, tid, offset);
  }
};
3322
3323void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3324{
3325 dout(10) << __func__ << dendl;
3326 auto p = manifest_ops.begin();
3327 while (p != manifest_ops.end()) {
3328 auto mop = p->second;
3329 // cancel objecter op, if we can
3330 if (mop->objecter_tid) {
3331 tids->push_back(mop->objecter_tid);
3332 mop->objecter_tid = 0;
3333 }
f67539c2
TL
3334 if (mop->cb) {
3335 mop->cb->set_requeue(requeue);
3336 mop->cb->complete(-ECANCELED);
3337 }
9f95a23c
TL
3338 manifest_ops.erase(p++);
3339 }
3340}
3341
f67539c2
TL
// Count how many references the given fingerprint object (fp_oid) would
// hold across this object's head chunk_map and all of its clones.
// Returns -EBUSY if any clone sits in the removed-snaps queue, since the
// count would be unstable while snap trimming is pending.
int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid)
{
  int cnt = 0;
  // head
  for (auto &p : obc->obs.oi.manifest.chunk_map) {
    if (p.second.oid.oid.name == fp_oid) {
      cnt++;
    }
  }
  // snap
  SnapSet& ss = obc->ssc->snapset;
  const OSDMapRef& osdmap = get_osdmap();
  for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
      p != ss.clones.rend();
      ++p) {
    object_ref_delta_t refs;
    ObjectContextRef obc_l = nullptr;
    ObjectContextRef obc_g = nullptr;
    hobject_t clone_oid = obc->obs.oi.soid;
    clone_oid.snap = *p;
    if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
      return -EBUSY;
    }
    ObjectContextRef clone_obc = get_object_context(clone_oid, false);
    if (!clone_obc) {
      // Clone context unavailable; stop counting further clones.
      break;
    }
    // Use the greater adjacent clone to compute the reference delta this
    // clone contributes.
    get_adjacent_clones(clone_obc, obc_l, obc_g);
    clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
      obc_g ? &(obc_g->obs.oi.manifest) : nullptr ,
      nullptr,
      refs);
    for (auto p = refs.begin(); p != refs.end(); ++p) {
      if (p->first.oid.name == fp_oid && p->second > 0) {
	cnt += p->second;
      }
    }
  }

  return cnt;
}
3383
// Manifest refcount math needs the clones adjacent to obc.  If either
// neighbour (or head, for the newer side) is unreadable, queue the op to
// wait for recovery and return true; return false when nothing blocks.
bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
{
  if (!obc->obs.oi.manifest.is_chunked() || !obc->ssc || !obc->ssc->snapset.clones.size()) {
    return false;
  }

  const SnapSet& snapset = obc->ssc->snapset;
  auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
  // clones.end() denotes head (CEPH_NOSNAP).
  auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
    hobject_t cid = obc->obs.oi.soid;
    cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
    if (is_unreadable_object(cid)) {
      dout(10) << __func__ << ": clone " << cid
	       << " is unreadable, waiting" << dendl;
      wait_for_unreadable_object(cid, op);
      return true;
    }
    return false;
  };
  // Older neighbour, if any.
  if (s != snapset.clones.begin()) {
    if (is_unreadable_snap(s - 1)) {
      return true;
    }
  }
  // Newer neighbour (possibly head).
  if (s != snapset.clones.end()) {
    if (is_unreadable_snap(s + 1)) {
      return true;
    }
  }
  return false;
}
3415
3416ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
3417{
3418 auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
3419 obc->obs.oi.soid.snap);
3420 if (s != obc->ssc->snapset.clones.begin()) {
3421 auto s_iter = s - 1;
3422 hobject_t cid = obc->obs.oi.soid;
3423 object_ref_delta_t refs;
3424 cid.snap = *s_iter;
3425 ObjectContextRef cobc = get_object_context(cid, false, NULL);
3426 ceph_assert(cobc);
3427 return cobc;
3428 }
3429 return nullptr;
3430}
3431
3432void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
3433{
3434 for (auto p = refs.begin(); p != refs.end(); ++p) {
3435 int dec_ref_count = p->second;
3436 ceph_assert(dec_ref_count < 0);
3437 while (dec_ref_count < 0) {
3438 dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
3439 refcount_manifest(soid, p->first,
3440 refcount_t::DECREMENT_REF, NULL, std::nullopt);
3441 dec_ref_count++;
3442 }
3443 }
3444}
3445
3446
// Fill _l and _g with the object contexts of the clones adjacent to
// src_obc in snap order (_l = next older, _g = next newer or head);
// either is left unset when no such neighbour exists.
void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
				       ObjectContextRef& _l, ObjectContextRef& _g)
{
  const SnapSet& snapset = src_obc->ssc->snapset;
  const object_info_t& oi = src_obc->obs.oi;

  auto get_context = [this, &oi, &snapset](auto iter)
    -> ObjectContextRef {
    hobject_t cid = oi.soid;
    cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
    ObjectContextRef obc = get_object_context(cid, false, NULL);
    ceph_assert(obc);
    return obc;
  };

  // check adjacent clones
  auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);

  // We *must* find the clone iff it's not head,
  // let s == snapset.clones.end() mean head
  ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());

  if (s != snapset.clones.begin()) {
    _l = get_context(s - 1);
  }

  if (s != snapset.clones.end()) {
    _g = get_context(s + 1);
  }
}
3477
// Compute the reference delta a set-chunk would cause and apply it.
// Returns true when an increment is in flight (the object is blocked and
// the op resumes via RefCountCallback); false when nothing blocks.
bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
				       OSDOp& osd_op)
{
  object_ref_delta_t refs;
  ObjectContextRef obc_l, obc_g;
  get_adjacent_clones(ctx->obc, obc_l, obc_g);
  set_chunk.calc_refs_to_inc_on_set(
    obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
    obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
    refs);
  if (!refs.is_empty()) {
    /* This is called by set-chunk, so we only consider a single chunk for the time being */
    ceph_assert(refs.size() == 1);
    auto p = refs.begin();
    int inc_ref_count = p->second;
    if (inc_ref_count > 0) {
      /*
       * In set-chunk case, the first thing we should do is to increment
       * the reference the targe object has prior to update object_manifest in object_info_t.
       * So, call directly refcount_manifest.
       */
      ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
      C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, ctx->obs->oi.soid);
      ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, p->first,
					 refcount_t::INCREMENT_REF, fin, std::nullopt);
      mop->objecter_tid = tid;
      manifest_ops[ctx->obs->oi.soid] = mop;
      // Block the object until the increment commits.
      ctx->obc->start_block();
      return true;
    } else if (inc_ref_count < 0) {
      // Net decrement: safe to defer until the op commits.
      hobject_t src = ctx->obs->oi.soid;
      hobject_t tgt = p->first;
      ctx->register_on_commit(
	  [src, tgt, this](){
	    refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
	  });
      return false;
    }
  }

  return false;
}
3520
// On a write to a chunked manifest object: drop chunk_map entries whose
// regions were dirtied, demote the object to non-manifest if the map
// empties, and schedule refcount decrements for chunks no longer needed
// once the op commits.
void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
{
  object_ref_delta_t refs;
  ObjectContextRef cobc = nullptr;
  ObjectContextRef obc = ctx->obc;
  // Remove dirtied chunks from the projected (new_obs) manifest.
  for (auto &p : ctx->obs->oi.manifest.chunk_map) {
    if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
      ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
      if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
	ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
	ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
	ctx->delta_stats.num_objects_manifest--;
      }
    }
  }
  // Look over previous snapshot, then figure out whether updated chunk needs to be deleted
  cobc = get_prev_clone_obc(obc);
  obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
    cobc ? &cobc->obs.oi.manifest : nullptr,
    ctx->clean_regions,
    refs);
  if (!refs.is_empty()) {
    // Decrements only take effect once the write commits.
    hobject_t soid = obc->obs.oi.soid;
    ctx->register_on_commit(
	[soid, this, refs](){
	  dec_refcount(soid, refs);
	});
  }
}
3550
// On object removal, schedule all reference decrements implied by its
// manifest: per-chunk drops for chunked manifests (taking adjacent
// clones into account), or a single drop of the redirect target when the
// redirect holds a reference.
void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
{
  ceph_assert(oi.has_manifest());
  ceph_assert(ctx->obc->ssc);

  if (oi.manifest.is_chunked()) {
    object_ref_delta_t refs;
    ObjectContextRef obc_l, obc_g;
    get_adjacent_clones(ctx->obc, obc_l, obc_g);
    oi.manifest.calc_refs_to_drop_on_removal(
      obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
      obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
      refs);

    if (!refs.is_empty()) {
      // Decrements only take effect once the removal commits.
      hobject_t soid = ctx->obc->obs.oi.soid;
      ctx->register_on_commit(
	  [soid, this, refs](){
	    dec_refcount(soid, refs);
	  });
    }
  } else if (oi.manifest.is_redirect() &&
	     oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
    ctx->register_on_commit(
	[oi, this](){
	  refcount_manifest(oi.soid, oi.manifest.redirect_target,
			    refcount_t::DECREMENT_REF, NULL, std::nullopt);
	});
  }
}
3581
3582ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
3583 Context *cb, std::optional<bufferlist> chunk)
11fdf7f2
TL
3584{
3585 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
f67539c2
TL
3586 CEPH_OSD_FLAG_RWORDERED;
3587
3588 dout(10) << __func__ << " Start refcount from " << src_soid
3589 << " to " << tgt_soid << dendl;
11fdf7f2 3590
11fdf7f2
TL
3591 ObjectOperation obj_op;
3592 bufferlist in;
f67539c2
TL
3593 if (type == refcount_t::INCREMENT_REF) {
3594 cls_cas_chunk_get_ref_op call;
3595 call.source = src_soid.get_head();
3596 ::encode(call, in);
3597 obj_op.call("cas", "chunk_get_ref", in);
3598 } else if (type == refcount_t::DECREMENT_REF) {
3599 cls_cas_chunk_put_ref_op call;
3600 call.source = src_soid.get_head();
3601 ::encode(call, in);
3602 obj_op.call("cas", "chunk_put_ref", in);
3603 } else if (type == refcount_t::CREATE_OR_GET_REF) {
3604 cls_cas_chunk_create_or_get_ref_op get_call;
3605 get_call.source = src_soid.get_head();
3606 ceph_assert(chunk);
3607 get_call.data = move(*chunk);
3608 ::encode(get_call, in);
3609 obj_op.call("cas", "chunk_create_or_get_ref", in);
3610 } else {
3611 ceph_assert(0 == "unrecognized type");
3612 }
3613
9f95a23c 3614 Context *c = nullptr;
11fdf7f2 3615 if (cb) {
f67539c2 3616 c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
11fdf7f2
TL
3617 }
3618
f67539c2
TL
3619 object_locator_t oloc(tgt_soid);
3620 ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
3621 ceph_assert(src_obc);
9f95a23c 3622 auto tid = osd->objecter->mutate(
f67539c2
TL
3623 tgt_soid.oid, oloc, obj_op, SnapContext(),
3624 ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
11fdf7f2 3625 flags, c);
f67539c2
TL
3626 return tid;
3627}
11fdf7f2
TL
3628
// Proxy a single chunk-sized read of a chunked manifest object to the
// chunk's backing object; C_ProxyChunkRead copies the returned data into
// the aggregate buffer at req_offset.
void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
					 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
					 uint64_t req_total_len, bool write_ordered)
{
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  object_manifest_t *manifest = &obc->obs.oi.manifest;
  if (!manifest->chunk_map.count(chunk_index)) {
    return;
  }
  uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
  hobject_t soid = manifest->chunk_map[chunk_index].oid;
  hobject_t ori_soid = m->get_hobj();
  object_locator_t oloc(soid);
  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
  if (write_ordered) {
    flags |= CEPH_OSD_FLAG_RWORDERED;
  }

  // Nothing to read for an empty or unmapped chunk.
  if (!chunk_length || soid == hobject_t()) {
    return;
  }

  /* same as do_proxy_read() */
  flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
			     CEPH_OSD_FLAG_ORDERSNAP |
			     CEPH_OSD_FLAG_ENFORCE_SNAPC |
			     CEPH_OSD_FLAG_MAP_SNAP_CLONE);

  dout(10) << __func__ << " Start do chunk proxy read for " << *m
	   << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
	   << " req_length: " << req_length << dendl;

  ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));

  // pobj_op is handed to C_ProxyChunkRead, which frees it.
  ObjectOperation *pobj_op = new ObjectOperation;
  OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);

  // Translate the logical offset into the backing chunk's coordinates.
  if (chunk_index <= req_offset) {
    osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
  } else {
    ceph_abort_msg("chunk_index > req_offset");
  }
  osd_op.op.extent.length = req_length;

  ObjectOperation obj_op;
  obj_op.dup(pobj_op->ops);

  C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
					       prdop);
  fin->obj_op = pobj_op;
  fin->op_index = op_index;
  fin->req_offset = req_offset;
  fin->obc = obc;
  fin->req_total_len = req_total_len;

  ceph_tid_t tid = osd->objecter->read(
    soid.oid, oloc, obj_op,
    m->get_snapid(), NULL,
    flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
    &prdop->user_version,
    &prdop->data_offset,
    m->get_features());
  fin->tid = tid;
  prdop->objecter_tid = tid;
  // Track against the ORIGINAL object so all chunks of one client op
  // share the same in-progress entry.
  proxyread_ops[tid] = prdop;
  in_progress_proxy_ops[ori_soid].push_back(op);
}
3696
// Return true only when every op in the request is a plain read fully
// covered by chunk_map entries that are still missing locally (i.e. must
// be proxied); anything else takes the normal read path.
bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
{
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  OSDOp *osd_op = NULL;
  bool ret = true;
  for (unsigned int i = 0; i < m->ops.size(); i++) {
    osd_op = &m->ops[i];
    ceph_osd_op op = osd_op->op;
    switch (op.op) {
      case CEPH_OSD_OP_READ:
      case CEPH_OSD_OP_SYNC_READ: {
	uint64_t cursor = osd_op->op.extent.offset;
	uint64_t remain = osd_op->op.extent.length;

	/* requested chunks exist in chunk_map ? */
	for (auto &p : obc->obs.oi.manifest.chunk_map) {
	  if (p.first <= cursor && p.first + p.second.length > cursor) {
	    // A locally-present chunk means no proxying for this read.
	    if (!p.second.is_missing()) {
	      return false;
	    }
	    if (p.second.length >= remain) {
	      remain = 0;
	      break;
	    } else {
	      remain = remain - p.second.length;
	    }
	    cursor += p.second.length;
	  }
	}

	if (remain) {
	  // Part of the extent is not covered by any chunk.
	  dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
	  return false;
	}
	continue;
      }
      default:
	return false;
    }
  }
  return ret;
}
3739
7c673cae
FG
// Completion path for a proxied write: unlink the tracking state and,
// once the last in-flight piece of the client op finishes, send the
// commit reply carrying the proxied results.
void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;

  map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
  if (p == proxywrite_ops.end()) {
    // Already cancelled/erased; nothing to do.
    dout(10) << __func__ << " no proxywrite_op found" << dendl;
    return;
  }
  ProxyWriteOpRef pwop = p->second;
  ceph_assert(tid == pwop->objecter_tid);
  ceph_assert(oid == pwop->soid);

  proxywrite_ops.erase(tid);

  map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
  if (q == in_progress_proxy_ops.end()) {
    dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
    delete pwop->ctx;
    pwop->ctx = NULL;
    return;
  }
  list<OpRequestRef>& in_progress_op = q->second;
  ceph_assert(in_progress_op.size());
  list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
					      in_progress_op.end(),
					      pwop->op);
  ceph_assert(it != in_progress_op.end());
  in_progress_op.erase(it);
  if (in_progress_op.size() == 0) {
    in_progress_proxy_ops.erase(oid);
  } else if (std::find(in_progress_op.begin(),
		       in_progress_op.end(),
		       pwop->op) != in_progress_op.end()) {
    // The same client op still has other proxied pieces outstanding;
    // defer the commit reply until the last one completes.
    if (pwop->ctx)
      delete pwop->ctx;
    pwop->ctx = NULL;
    dout(20) << __func__ << " " << oid << " tid " << tid
	     << " in_progress_op size: "
	     << in_progress_op.size() << dendl;
    return;
  }

  osd->logger->inc(l_osd_tier_proxy_write);

  auto m = pwop->op->get_req<MOSDOp>();
  ceph_assert(m != NULL);

  if (!pwop->sent_reply) {
    // send commit.
    assert(pwop->ctx->reply == nullptr);
    MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
					 true /* we claim it below */);
    reply->set_reply_versions(eversion_t(), pwop->user_version);
    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
    reply->claim_op_out_data(pwop->ops);
    dout(10) << " sending commit on " << pwop << " " << reply << dendl;
    osd->send_message_osd_client(reply, m->get_connection());
    pwop->sent_reply = true;
    pwop->ctx->op->mark_commit_sent();
  }

  delete pwop->ctx;
  pwop->ctx = NULL;
}
3806
94b18763
FG
3807void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3808 vector<ceph_tid_t> *tids)
7c673cae
FG
3809{
3810 dout(10) << __func__ << " " << pwop->soid << dendl;
3811 pwop->canceled = true;
3812
3813 // cancel objecter op, if we can
3814 if (pwop->objecter_tid) {
94b18763 3815 tids->push_back(pwop->objecter_tid);
7c673cae
FG
3816 delete pwop->ctx;
3817 pwop->ctx = NULL;
3818 proxywrite_ops.erase(pwop->objecter_tid);
3819 pwop->objecter_tid = 0;
3820 }
3821}
3822
3823class PromoteCallback: public PrimaryLogPG::CopyCallback {
3824 ObjectContextRef obc;
3825 PrimaryLogPG *pg;
3826 utime_t start;
3827public:
3828 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3829 : obc(obc_),
3830 pg(pg_),
3831 start(ceph_clock_now()) {}
3832
3833 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3834 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3835 int r = results.get<0>();
3836 pg->finish_promote(r, results_data, obc);
3837 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3838 }
3839};
3840
11fdf7f2
TL
3841class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3842 ObjectContextRef obc;
3843 PrimaryLogPG *pg;
3844 utime_t start;
3845 PrimaryLogPG::OpContext *ctx;
3846 PrimaryLogPG::CopyCallbackResults promote_results;
3847public:
3848 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3849 : obc(obc_),
3850 pg(pg_),
3851 start(ceph_clock_now()), ctx(ctx) {}
3852
3853 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3854 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3855 int r = results.get<0>();
3856 if (ctx) {
3857 promote_results = results;
3858 pg->execute_ctx(ctx);
3859 } else {
3860 pg->finish_promote_manifest(r, results_data, obc);
3861 }
3862 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3863 }
3864 friend struct PromoteFinisher;
3865};
3866
3867struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3868 PromoteManifestCallback *promote_callback;
3869
3870 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3871 : promote_callback(promote_callback) {
3872 }
3873
3874 int execute() override {
3875 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3876 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3877 promote_callback->promote_results.get<1>(),
3878 promote_callback->obc);
3879 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3880 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3881 promote_callback->promote_results.get<1>(),
3882 promote_callback->obc);
3883 } else {
3884 ceph_abort_msg("unrecognized manifest type");
3885 }
3886 return 0;
3887 }
3888};
3889
7c673cae
FG
// Kick off promotion of an object into this (cache/upper) tier.
//
// obc         - context of the object, or null if it does not exist locally yet
// missing_oid - oid to promote when obc is null
// oloc        - client-supplied object locator (pool overridden for tiering)
// op          - optional client op to requeue/park while promotion runs
// promote_obc - if non-null, receives the obc being promoted
//
// The copy is started asynchronously; completion is handled by
// PromoteCallback / PromoteManifestCallback.
void PrimaryLogPG::promote_object(ObjectContextRef obc,
				  const hobject_t& missing_oid,
				  const object_locator_t& oloc,
				  OpRequestRef op,
				  ObjectContextRef *promote_obc)
{
  hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
  ceph_assert(hoid != hobject_t());
  // cannot write the promoted copy while scrub has the object locked;
  // park the op (if any) until scrub releases it
  if (m_scrubber->write_blocked_by_scrub(hoid)) {
    dout(10) << __func__ << " " << hoid
	     << " blocked by scrub" << dendl;
    if (op) {
      waiting_for_scrub.push_back(op);
      op->mark_delayed("waiting for scrub");
      dout(10) << __func__ << " " << hoid
	       << " placing op in waiting_for_scrub" << dendl;
    } else {
      dout(10) << __func__ << " " << hoid
	       << " no op, dropping on the floor" << dendl;
    }
    return;
  }
  // laggy PGs requeue the op themselves; bail out here
  if (op && !check_laggy_requeue(op)) {
    return;
  }
  if (!obc) { // we need to create an ObjectContext
    ceph_assert(missing_oid != hobject_t());
    obc = get_object_context(missing_oid, true);
  }
  if (promote_obc)
    *promote_obc = obc;

  /*
   * Before promote complete, if there are proxy-reads for the object,
   * for this case we don't use DONTNEED.
   */
  unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
  map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
  if (q == in_progress_proxy_ops.end()) {
    src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
  }

  // Pick the copy source and completion callback:
  //  - plain object: copy from the backing tier (pool.info.tier_of)
  //  - chunked manifest: copy chunks via the manifest machinery
  //  - redirect manifest: copy from the redirect target
  CopyCallback *cb;
  object_locator_t my_oloc;
  hobject_t src_hoid;
  if (!obc->obs.oi.has_manifest()) {
    my_oloc = oloc;
    my_oloc.pool = pool.info.tier_of;
    src_hoid = obc->obs.oi.soid;
    cb = new PromoteCallback(obc, this);
  } else {
    if (obc->obs.oi.manifest.is_chunked()) {
      src_hoid = obc->obs.oi.soid;
      cb = new PromoteManifestCallback(obc, this);
    } else if (obc->obs.oi.manifest.is_redirect()) {
      object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
      my_oloc = src_oloc;
      src_hoid = obc->obs.oi.manifest.redirect_target;
      cb = new PromoteCallback(obc, this);
    } else {
      ceph_abort_msg("unrecognized manifest type");
    }
  }

  unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
                   CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
                   CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
                   CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
  start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
	     obc->obs.oi.soid.snap == CEPH_NOSNAP,
	     src_fadvise_flags, 0);

  // start_copy must have blocked the object for the duration of the copy
  ceph_assert(obc->is_blocked());

  // requeue the triggering op once the promoted object is unblocked
  if (op)
    wait_for_blocked_object(obc->obs.oi.soid, op);

  recovery_state.update_stats(
    [](auto &history, auto &stats) {
      stats.stats.sum.num_promote++;
      return false;
    });
}
3973
// Execute a prepared op context: build the transaction, then either reply
// immediately (reads/errors), record an error-only log entry
// (update_log_only), or issue replicated writes via a RepGather.
//
// Must be idempotent: finish_copyfrom() and async-read completion may
// re-enter it for the same ctx before the transaction is finally applied.
void PrimaryLogPG::execute_ctx(OpContext *ctx)
{
  FUNCTRACE(cct);
  dout(10) << __func__ << " " << ctx << dendl;
  ctx->reset_obs(ctx->obc);
  ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
  OpRequestRef op = ctx->op;
  auto m = op->get_req<MOSDOp>();
  ObjectContextRef obc = ctx->obc;
  const hobject_t& soid = obc->obs.oi.soid;

  // this method must be idempotent since we may call it several times
  // before we finally apply the resulting transaction.
  ctx->op_t.reset(new PGTransaction);

  if (op->may_write() || op->may_cache()) {
    // snap
    if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
	pool.info.is_pool_snaps_mode()) {
      // use pool's snapc
      ctx->snapc = pool.snapc;
    } else {
      // client specified snapc
      ctx->snapc.seq = m->get_snap_seq();
      ctx->snapc.snaps = m->get_snaps();
      filter_snapc(ctx->snapc.snaps);
    }
    // ORDERSNAP: reject writes whose snap context is older than what we
    // have already seen for this object
    if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
	ctx->snapc.seq < obc->ssc->snapset.seq) {
      dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
	       << " < snapset seq " << obc->ssc->snapset.seq
	       << " on " << obc->obs.oi.soid << dendl;
      reply_ctx(ctx, -EOLDSNAPC);
      return;
    }

    // version
    ctx->at_version = get_next_version();
    ctx->mtime = m->get_mtime();

    dout(10) << __func__ << " " << soid << " " << *ctx->ops
	     << " ov " << obc->obs.oi.version << " av " << ctx->at_version
	     << " snapc " << ctx->snapc
	     << " snapset " << obc->ssc->snapset
	     << dendl;
  } else {
    dout(10) << __func__ << " " << soid << " " << *ctx->ops
	     << " ov " << obc->obs.oi.version
	     << dendl;
  }

  if (!ctx->user_at_version)
    ctx->user_at_version = obc->obs.oi.user_version;
  dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = ctx->op->get_reqid();
#endif
    tracepoint(osd, prepare_tx_enter, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }
#ifdef HAVE_JAEGER
  if (ctx->op->osd_parent_span) {
    // NOTE(review): execute_span is destroyed at the end of this if-block,
    // so the span ends immediately and does not actually cover
    // prepare_transaction() below -- presumably unintended; confirm.
    auto execute_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
  }
#endif

  // translate the client ops into a PGTransaction (and per-op results)
  int result = prepare_transaction(ctx);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = ctx->op->get_reqid();
#endif
    tracepoint(osd, prepare_tx_exit, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  bool pending_async_reads = !ctx->pending_async_reads.empty();
  if (result == -EINPROGRESS || pending_async_reads) {
    // come back later.
    if (pending_async_reads) {
      // async reads are only generated by the EC backend
      ceph_assert(pool.info.is_erasure());
      in_progress_async_reads.push_back(make_pair(op, ctx));
      ctx->start_async_reads(this);
    }
    return;
  }

  if (result == -EAGAIN) {
    // clean up after the ctx
    close_op_ctx(ctx);
    return;
  }

  bool ignore_out_data = false;
  if (!ctx->op_t->empty() &&
      op->may_write() &&
      result >= 0) {
    // successful update
    if (ctx->op->allows_returnvec()) {
      // enforce reasonable bound on the return buffer sizes
      for (auto& i : *ctx->ops) {
	if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
	  dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
	  result = -EOVERFLOW;  // overall result is overflow
	  i.rval = -EOVERFLOW;
	  i.outdata.clear();
	}
      }
    } else {
      // legacy behavior -- zero result and return data etc.
      ignore_out_data = true;
      result = 0;
    }
  }

  // prepare the reply
  ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
			       ignore_out_data);
  dout(20) << __func__ << " alloc reply " << ctx->reply
	   << " result " << result << dendl;

  // read or error?
  if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
    // finish side-effects
    if (result >= 0)
      do_osd_op_effects(ctx, m->get_connection());

    complete_read_ctx(result, ctx);
    return;
  }

  ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);

  // from here on we are committing a write (or cache op)
  ceph_assert(op->may_write() || op->may_cache());

  // trim log?
  recovery_state.update_trim_to();

  // verify that we are doing this in order?
  if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
      !pool.info.is_tier() && !pool.info.has_tiers()) {
    map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
    ceph_tid_t t = m->get_tid();
    client_t n = m->get_source().num();
    map<client_t,ceph_tid_t>::iterator p = cm.find(n);
    if (p == cm.end()) {
      dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
      cm[n] = t;
    } else {
      dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
      if (p->second > t) {
	derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
	ceph_abort_msg("out of order op");
      }
      p->second = t;
    }
  }

  if (ctx->update_log_only) {
    // error path that still needs a pg log entry (for dup op detection)
    if (result >= 0)
      do_osd_op_effects(ctx, m->get_connection());

    dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
    // save just what we need from ctx
    MOSDOpReply *reply = ctx->reply;
    ctx->reply = nullptr;
    reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);

    if (result == -ENOENT) {
      reply->set_enoent_reply_versions(info.last_update,
				       info.last_user_version);
    }
    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
    // append to pg log for dup detection - don't save buffers for now
    record_write_error(op, soid, reply, result,
		       ctx->op->allows_returnvec() ? ctx : nullptr);
    close_op_ctx(ctx);
    return;
  }

  // no need to capture PG ref, repop cancel will handle that
  // Can capture the ctx by pointer, it's owned by the repop
  ctx->register_on_commit(
    [m, ctx, this](){
      if (ctx->op)
	log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);

      if (m && !ctx->sent_reply) {
	MOSDOpReply *reply = ctx->reply;
	ctx->reply = nullptr;
	reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
	dout(10) << " sending reply on " << *m << " " << reply << dendl;
	osd->send_message_osd_client(reply, m->get_connection());
	ctx->sent_reply = true;
	ctx->op->mark_commit_sent();
      }
    });
  ctx->register_on_success(
    [ctx, this]() {
      do_osd_op_effects(
	ctx,
	ctx->op ? ctx->op->get_req()->get_connection() :
	ConnectionRef());
    });
  ctx->register_on_finish(
    [ctx]() {
      delete ctx;
    });

  // issue replica writes
  ceph_tid_t rep_tid = osd->get_tid();

  RepGather *repop = new_repop(ctx, obc, rep_tid);

  issue_repop(repop, ctx);
  eval_repop(repop);
  repop->put();
}
4194
c07f9fc5
FG
4195void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4196 release_object_locks(ctx->lock_manager);
4197
4198 ctx->op_t.reset();
4199
4200 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4201 ctx->on_finish.erase(p++)) {
4202 (*p)();
4203 }
4204 delete ctx;
4205}
4206
7c673cae
FG
// Reply to the client with error code r (when the ctx carries a client op)
// and unconditionally tear down the op context.
void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
{
  if (ctx->op)
    osd->reply_op_error(ctx->op, r);
  close_op_ctx(ctx);
}
4213
11fdf7f2
TL
// Record per-op perf counters for a completed client op.
//
// op   - the completed request (must be an MOSDOp)
// inb  - bytes written by the op
// outb - bytes read/returned by the op
//
// Latency is measured from message receive; process latency from dequeue.
// Counters are bucketed by op class: rw, read-only, or write/cache.
void PrimaryLogPG::log_op_stats(const OpRequest& op,
                                const uint64_t inb,
                                const uint64_t outb)
{
  auto m = op.get_req<MOSDOp>();
  const utime_t now = ceph_clock_now();

  const utime_t latency = now - m->get_recv_stamp();
  const utime_t process_latency = now - op.get_dequeued_time();

  osd->logger->inc(l_osd_op);

  osd->logger->inc(l_osd_op_outb, outb);
  osd->logger->inc(l_osd_op_inb, inb);
  osd->logger->tinc(l_osd_op_lat, latency);
  osd->logger->tinc(l_osd_op_process_lat, process_latency);

  if (op.may_read() && op.may_write()) {
    // combined read-modify-write op
    osd->logger->inc(l_osd_op_rw);
    osd->logger->inc(l_osd_op_rw_inb, inb);
    osd->logger->inc(l_osd_op_rw_outb, outb);
    osd->logger->tinc(l_osd_op_rw_lat, latency);
    osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
    osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
    osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
  } else if (op.may_read()) {
    osd->logger->inc(l_osd_op_r);
    osd->logger->inc(l_osd_op_r_outb, outb);
    osd->logger->tinc(l_osd_op_r_lat, latency);
    osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
    osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
  } else if (op.may_write() || op.may_cache()) {
    osd->logger->inc(l_osd_op_w);
    osd->logger->inc(l_osd_op_w_inb, inb);
    osd->logger->tinc(l_osd_op_w_lat, latency);
    osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
    osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
  } else {
    // every client op must be read, write, or cache
    ceph_abort();
  }

  dout(15) << "log_op_stats " << *m
	   << " inb " << inb
	   << " outb " << outb
	   << " lat " << latency << dendl;

  // feed the mgr's dynamic (per-client/per-image) perf queries, if active
  if (m_dynamic_perf_stats.is_enabled()) {
    m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
  }
}
7c673cae 4264
11fdf7f2
TL
// Install the set of dynamic perf-metric queries (from the mgr) that
// log_op_stats() will feed.
void PrimaryLogPG::set_dynamic_perf_stats_queries(
    const std::list<OSDPerfMetricQuery> &queries)
{
  m_dynamic_perf_stats.set_queries(queries);
}
4270
// Hand the accumulated dynamic perf stats to the caller, leaving this PG
// with the caller's (typically fresh) instance via swap.
void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
{
  std::swap(m_dynamic_perf_stats, *stats);
}
4275
// Handle an MOSDPGScan message during backfill.
//
//  OP_SCAN_GET_DIGEST (on a backfill target): scan a range of local
//    objects and return the digest to the primary, unless the OSD is too
//    full, in which case backfill is canceled via a peering event.
//  OP_SCAN_DIGEST (on the primary): record the peer's scan result and,
//    once all targets have replied, finish the recovery op.
void PrimaryLogPG::do_scan(
  OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  auto m = op->get_req<MOSDPGScan>();
  ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
  dout(10) << "do_scan " << *m << dendl;

  op->mark_started();

  switch (m->op) {
  case MOSDPGScan::OP_SCAN_GET_DIGEST:
    {
      auto dpp = get_dpp();
      // refuse to backfill onto a (nearly) full OSD
      if (osd->check_backfill_full(dpp)) {
	dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
	queue_peering_event(
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      get_osdmap_epoch(),
	      get_osdmap_epoch(),
	      PeeringState::BackfillTooFull())));
	return;
      }

      BackfillInterval bi;
      bi.begin = m->begin;
      // No need to flush, there won't be any in progress writes occuring
      // past m->begin
      scan_range(
	cct->_conf->osd_backfill_scan_min,
	cct->_conf->osd_backfill_scan_max,
	&bi,
	handle);
      MOSDPGScan *reply = new MOSDPGScan(
	MOSDPGScan::OP_SCAN_DIGEST,
	pg_whoami,
	get_osdmap_epoch(), m->query_epoch,
	spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
      encode(bi.objects, reply->get_data());
      osd->send_message_osd_cluster(reply, m->get_connection());
    }
    break;

  case MOSDPGScan::OP_SCAN_DIGEST:
    {
      pg_shard_t from = m->from;

      // Check that from is in backfill_targets vector
      ceph_assert(is_backfill_target(from));

      BackfillInterval& bi = peer_backfill_info[from];
      bi.begin = m->begin;
      bi.end = m->end;
      auto p = m->get_data().cbegin();

      // take care to preserve ordering!
      bi.clear_objects();
      decode_noclear(bi.objects, p);
      dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
	       << " bi.objects.size()=" << bi.objects.size() << dendl;

      if (waiting_on_backfill.erase(from)) {
	if (waiting_on_backfill.empty()) {
	  // all backfill targets have replied
	  ceph_assert(
	    peer_backfill_info.size() ==
	    get_backfill_targets().size());
	  finish_recovery_op(hobject_t::get_max());
	}
      } else {
	// we canceled backfill for a while due to a too full, and this
	// is an extra response from a non-too-full peer
	dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
      }
    }
    break;
  }
}
4354
// Handle an MOSDPGBackfill message.
//
//  OP_BACKFILL_FINISH (target): ack the primary, queue RecoveryDone, then
//    FALL THROUGH to the progress case to persist the final state.
//  OP_BACKFILL_PROGRESS (target): persist last_backfill/stats.
//  OP_BACKFILL_FINISH_ACK (primary): a target acknowledged completion.
//
// The osd_kill_backfill_at asserts are test-only crash injection points.
void PrimaryLogPG::do_backfill(OpRequestRef op)
{
  auto m = op->get_req<MOSDPGBackfill>();
  ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
  dout(10) << "do_backfill " << *m << dendl;

  op->mark_started();

  switch (m->op) {
  case MOSDPGBackfill::OP_BACKFILL_FINISH:
    {
      ceph_assert(cct->_conf->osd_kill_backfill_at != 1);

      MOSDPGBackfill *reply = new MOSDPGBackfill(
	MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
	get_osdmap_epoch(),
	m->query_epoch,
	spg_t(info.pgid.pgid, get_primary().shard));
      reply->set_priority(get_recovery_op_priority());
      osd->send_message_osd_cluster(reply, m->get_connection());
      queue_peering_event(
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
	    RecoveryDone())));
    }
    // fall-thru  (intentional: FINISH also persists progress below)

  case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
    {
      ceph_assert(cct->_conf->osd_kill_backfill_at != 2);

      ObjectStore::Transaction t;
      recovery_state.update_backfill_progress(
	m->last_backfill,
	m->stats,
	m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
	t);

      int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
      ceph_assert(tr == 0);
    }
    break;

  case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
    {
      // only the primary collects finish acks
      ceph_assert(is_primary());
      ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
      finish_recovery_op(hobject_t::get_max());
    }
    break;
  }
}
4409
// Handle MOSDPGBackfillRemove: delete the listed objects locally (with
// snap-mapper cleanup).  When this replica is still being backfilled,
// also subtract the removed objects' sizes from the local/user byte
// accounting (EC pools scale object size by the data chunk count).
void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
{
  const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
    op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
  dout(7) << __func__ << " " << m->ls << dendl;

  op->mark_started();

  ObjectStore::Transaction t;
  for (auto& p : m->ls) {
    if (is_remote_backfilling()) {
      struct stat st;
      int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
                                              pg_whoami.shard) , &st);
      if (r == 0) {
	// on-disk bytes for this shard
	sub_local_num_bytes(st.st_size);
	int64_t usersize;
	if (pool.info.is_erasure()) {
	  // for EC, logical (user) size comes from the object info; fall
	  // back to 0 if the OI attr cannot be read
	  bufferlist bv;
	  int r = osd->store->getattr(
	      ch,
	      ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
	      OI_ATTR,
	      bv);
	  if (r >= 0) {
	    object_info_t oi(bv);
	    usersize = oi.size * pgbackend->get_ec_data_chunk_count();
	  } else {
	    dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
		    << " can't get object info" << dendl;
	    usersize = 0;
	  }
	} else {
	  usersize = st.st_size;
	}
	sub_num_bytes(usersize);
	dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
		 << " sub actual data by " << st.st_size
		 << " sub num_bytes by " << usersize
		 << dendl;
      }
    }
    remove_snap_mapped_object(t, p.first);
  }
  int r = osd->store->queue_transaction(ch, std::move(t), NULL);
  ceph_assert(r == 0);
}
4458
// Build (but do not apply) the transaction that trims snap `snap_to_trim`
// from clone `coid`.
//
// first        - whether this is the first clone of the trimming pass
//                (affects snaptrimmer write-lock acquisition)
// coid         - the clone object to trim
// snap_to_trim - snapid currently being trimmed
// ctxp         - out: the prepared op context (null on error)
//
// Returns 0 with *ctxp set on success; -ENOENT when the clone/head state
// is inconsistent (repair needed); -ENOLCK when a snaptrimmer write lock
// could not be taken.
//
// If the clone is left with no snaps it is deleted (and its overlap merged
// into the next older clone); otherwise its snap list is rewritten.  The
// head object's snapset is updated, and the head itself is removed when it
// is an unneeded whiteout with no remaining clones.
int PrimaryLogPG::trim_object(
  bool first, const hobject_t &coid, snapid_t snap_to_trim,
  PrimaryLogPG::OpContextUPtr *ctxp)
{
  *ctxp = NULL;

  // load clone info
  bufferlist bl;
  ObjectContextRef obc = get_object_context(coid, false, NULL);
  if (!obc || !obc->ssc || !obc->ssc->exists) {
    osd->clog->error() << __func__ << ": Can not trim " << coid
      << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
    return -ENOENT;
  }

  hobject_t head_oid = coid.get_head();
  ObjectContextRef head_obc = get_object_context(head_oid, false);
  if (!head_obc) {
    osd->clog->error() << __func__ << ": Can not trim " << coid
      << " repair needed, no snapset obc for " << head_oid;
    return -ENOENT;
  }

  SnapSet& snapset = obc->ssc->snapset;

  object_info_t &coi = obc->obs.oi;
  auto citer = snapset.clone_snaps.find(coid.snap);
  if (citer == snapset.clone_snaps.end()) {
    osd->clog->error() << "No clone_snaps in snapset " << snapset
		       << " for object " << coid << "\n";
    return -ENOENT;
  }
  set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
  if (old_snaps.empty()) {
    osd->clog->error() << "No object info snaps for object " << coid;
    return -ENOENT;
  }

  dout(10) << coid << " old_snaps " << old_snaps
	   << " old snapset " << snapset << dendl;
  if (snapset.seq == 0) {
    osd->clog->error() << "No snapset.seq for object " << coid;
    return -ENOENT;
  }

  // new_snaps = old_snaps minus the snap being trimmed and any snaps
  // already queued for removal in the osdmap
  set<snapid_t> new_snaps;
  const OSDMapRef& osdmap = get_osdmap();
  for (set<snapid_t>::iterator i = old_snaps.begin();
       i != old_snaps.end();
       ++i) {
    if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
	*i != snap_to_trim) {
      new_snaps.insert(*i);
    }
  }

  vector<snapid_t>::iterator p = snapset.clones.end();

  if (new_snaps.empty()) {
    // the clone will be deleted; locate it in the snapset's clone list
    p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
    if (p == snapset.clones.end()) {
      osd->clog->error() << "Snap " << coid.snap << " not in clones";
      return -ENOENT;
    }
  }

  OpContextUPtr ctx = simple_opc_create(obc);
  ctx->head_obc = head_obc;

  // take snaptrimmer write locks on both the clone and the head
  if (!ctx->lock_manager.get_snaptrimmer_write(
	coid,
	obc,
	first)) {
    close_op_ctx(ctx.release());
    dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
    return -ENOLCK;
  }

  if (!ctx->lock_manager.get_snaptrimmer_write(
	head_oid,
	head_obc,
	first)) {
    close_op_ctx(ctx.release());
    dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
    return -ENOLCK;
  }

  ctx->at_version = get_next_version();

  PGTransaction *t = ctx->op_t.get();

  if (new_snaps.empty()) {
    // remove clone
    dout(10) << coid << " snaps " << old_snaps << " -> "
	     << new_snaps << " ... deleting" << dendl;

    // ...from snapset
    ceph_assert(p != snapset.clones.end());

    snapid_t last = coid.snap;
    ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);

    if (p != snapset.clones.begin()) {
      // not the oldest... merge overlap into next older clone
      vector<snapid_t>::iterator n = p - 1;
      hobject_t prev_coid = coid;
      prev_coid.snap = *n;
      // only adjust byte accounting if the older clone actually exists here
      bool adjust_prev_bytes = is_present_clone(prev_coid);

      if (adjust_prev_bytes)
	ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);

      snapset.clone_overlap[*n].intersection_of(
	snapset.clone_overlap[*p]);

      if (adjust_prev_bytes)
	ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
    }
    // update object-count stats for the deleted clone
    ctx->delta_stats.num_objects--;
    if (coi.is_dirty())
      ctx->delta_stats.num_objects_dirty--;
    if (coi.is_omap())
      ctx->delta_stats.num_objects_omap--;
    if (coi.is_whiteout()) {
      dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
      ctx->delta_stats.num_whiteouts--;
    }
    ctx->delta_stats.num_object_clones--;
    if (coi.is_cache_pinned())
      ctx->delta_stats.num_objects_pinned--;
    if (coi.has_manifest()) {
      // drop refcounts on all manifest targets before deleting
      dec_all_refcount_manifest(coi, ctx.get());
      ctx->delta_stats.num_objects_manifest--;
    }
    obc->obs.exists = false;

    snapset.clones.erase(p);
    snapset.clone_overlap.erase(last);
    snapset.clone_size.erase(last);
    snapset.clone_snaps.erase(last);

    ctx->log.push_back(
      pg_log_entry_t(
	pg_log_entry_t::DELETE,
	coid,
	ctx->at_version,
	ctx->obs->oi.version,
	0,
	osd_reqid_t(),
	ctx->mtime,
	0)
      );
    t->remove(coid);
    t->update_snaps(
      coid,
      old_snaps,
      new_snaps);

    coi = object_info_t(coid);

    ctx->at_version.version++;
  } else {
    // save adjusted snaps for this object
    dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
    snapset.clone_snaps[coid.snap] =
      vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
    // we still do a 'modify' event on this object just to trigger a
    // snapmapper.update ... :(

    coi.prior_version = coi.version;
    coi.version = ctx->at_version;
    bl.clear();
    encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    t->setattr(coid, OI_ATTR, bl);

    ctx->log.push_back(
      pg_log_entry_t(
	pg_log_entry_t::MODIFY,
	coid,
	coi.version,
	coi.prior_version,
	0,
	osd_reqid_t(),
	ctx->mtime,
	0)
      );
    ctx->at_version.version++;

    t->update_snaps(
      coid,
      old_snaps,
      new_snaps);
  }

  // save head snapset
  dout(10) << coid << " new snapset " << snapset << " on "
	   << head_obc->obs.oi << dendl;
  if (snapset.clones.empty() &&
      (head_obc->obs.oi.is_whiteout() &&
       !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
       !head_obc->obs.oi.is_cache_pinned())) {
    // NOTE: this arguably constitutes minor interference with the
    // tiering agent if this is a cache tier since a snap trim event
    // is effectively evicting a whiteout we might otherwise want to
    // keep around.
    dout(10) << coid << " removing " << head_oid << dendl;
    ctx->log.push_back(
      pg_log_entry_t(
	pg_log_entry_t::DELETE,
	head_oid,
	ctx->at_version,
	head_obc->obs.oi.version,
	0,
	osd_reqid_t(),
	ctx->mtime,
	0)
      );
    derr << "removing snap head" << dendl;
    // update object-count stats for the deleted head
    object_info_t& oi = head_obc->obs.oi;
    ctx->delta_stats.num_objects--;
    if (oi.is_dirty()) {
      ctx->delta_stats.num_objects_dirty--;
    }
    if (oi.is_omap())
      ctx->delta_stats.num_objects_omap--;
    if (oi.is_whiteout()) {
      dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
      ctx->delta_stats.num_whiteouts--;
    }
    if (oi.is_cache_pinned()) {
      ctx->delta_stats.num_objects_pinned--;
    }
    if (oi.has_manifest()) {
      ctx->delta_stats.num_objects_manifest--;
      dec_all_refcount_manifest(oi, ctx.get());
    }
    head_obc->obs.exists = false;
    head_obc->obs.oi = object_info_t(head_oid);
    t->remove(head_oid);
  } else {
    if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
      // filter SnapSet::snaps for the benefit of pre-octopus
      // peers. This is perhaps overly conservative in that I'm not
      // certain they need this, but let's be conservative here.
      dout(10) << coid << " filtering snapset on " << head_oid << dendl;
      snapset.filter(pool.info);
    } else {
      snapset.snaps.clear();
    }
    dout(10) << coid << " writing updated snapset on " << head_oid
	     << ", snapset is " << snapset << dendl;
    ctx->log.push_back(
      pg_log_entry_t(
	pg_log_entry_t::MODIFY,
	head_oid,
	ctx->at_version,
	head_obc->obs.oi.version,
	0,
	osd_reqid_t(),
	ctx->mtime,
	0)
      );

    head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
    head_obc->obs.oi.version = ctx->at_version;

    // persist the updated snapset and head object info
    map <string, bufferlist> attrs;
    bl.clear();
    encode(snapset, bl);
    attrs[SS_ATTR] = std::move(bl);

    bl.clear();
    encode(head_obc->obs.oi, bl,
	   get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    attrs[OI_ATTR] = std::move(bl);
    t->setattrs(head_oid, attrs);
  }

  *ctxp = std::move(ctx);
  return 0;
}
4740
4741void PrimaryLogPG::kick_snap_trim()
4742{
11fdf7f2
TL
4743 ceph_assert(is_active());
4744 ceph_assert(is_primary());
4745 if (is_clean() &&
4746 !state_test(PG_STATE_PREMERGE) &&
4747 !snap_trimq.empty()) {
4748 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4749 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4750 } else {
4751 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4752 snap_trimmer_machine.process_event(KickTrim());
4753 }
7c673cae
FG
4754 }
4755}
4756
4757void PrimaryLogPG::snap_trimmer_scrub_complete()
4758{
4759 if (is_primary() && is_active() && is_clean()) {
11fdf7f2 4760 ceph_assert(!snap_trimq.empty());
7c673cae
FG
4761 snap_trimmer_machine.process_event(ScrubComplete());
4762 }
4763}
4764
4765void PrimaryLogPG::snap_trimmer(epoch_t queued)
4766{
9f95a23c 4767 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
7c673cae
FG
4768 return;
4769 }
4770
11fdf7f2 4771 ceph_assert(is_primary());
7c673cae
FG
4772
4773 dout(10) << "snap_trimmer posting" << dendl;
4774 snap_trimmer_machine.process_event(DoSnapWork());
4775 dout(10) << "snap_trimmer complete" << dendl;
4776 return;
4777}
4778
4779int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4780{
4781 __u64 v2;
4782
4783 string v2s(xattr.c_str(), xattr.length());
4784 if (v2s.length())
4785 v2 = strtoull(v2s.c_str(), NULL, 10);
4786 else
4787 v2 = 0;
4788
4789 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4790
4791 switch (op) {
4792 case CEPH_OSD_CMPXATTR_OP_EQ:
4793 return (v1 == v2);
4794 case CEPH_OSD_CMPXATTR_OP_NE:
4795 return (v1 != v2);
4796 case CEPH_OSD_CMPXATTR_OP_GT:
4797 return (v1 > v2);
4798 case CEPH_OSD_CMPXATTR_OP_GTE:
4799 return (v1 >= v2);
4800 case CEPH_OSD_CMPXATTR_OP_LT:
4801 return (v1 < v2);
4802 case CEPH_OSD_CMPXATTR_OP_LTE:
4803 return (v1 <= v2);
4804 default:
4805 return -EINVAL;
4806 }
4807}
4808
4809int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4810{
4811 string v2s(xattr.c_str(), xattr.length());
4812
4813 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4814
4815 switch (op) {
4816 case CEPH_OSD_CMPXATTR_OP_EQ:
4817 return (v1s.compare(v2s) == 0);
4818 case CEPH_OSD_CMPXATTR_OP_NE:
4819 return (v1s.compare(v2s) != 0);
4820 case CEPH_OSD_CMPXATTR_OP_GT:
4821 return (v1s.compare(v2s) > 0);
4822 case CEPH_OSD_CMPXATTR_OP_GTE:
4823 return (v1s.compare(v2s) >= 0);
4824 case CEPH_OSD_CMPXATTR_OP_LT:
4825 return (v1s.compare(v2s) < 0);
4826 case CEPH_OSD_CMPXATTR_OP_LTE:
4827 return (v1s.compare(v2s) <= 0);
4828 default:
4829 return -EINVAL;
4830 }
4831}
4832
7c673cae
FG
4833int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4834{
4835 ceph_osd_op& op = osd_op.op;
4836 vector<OSDOp> write_ops(1);
4837 OSDOp& write_op = write_ops[0];
4838 uint64_t write_length = op.writesame.length;
4839 int result = 0;
4840
4841 if (!write_length)
4842 return 0;
4843
4844 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4845 return -EINVAL;
4846
4847 if (op.writesame.data_length != osd_op.indata.length()) {
4848 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4849 return -EINVAL;
4850 }
4851
4852 while (write_length) {
4853 write_op.indata.append(osd_op.indata);
4854 write_length -= op.writesame.data_length;
4855 }
4856
4857 write_op.op.op = CEPH_OSD_OP_WRITE;
4858 write_op.op.extent.offset = op.writesame.offset;
4859 write_op.op.extent.length = op.writesame.length;
4860 result = do_osd_ops(ctx, write_ops);
4861 if (result < 0)
4862 derr << "do_writesame do_osd_ops failed " << result << dendl;
4863
4864 return result;
4865}
4866
4867// ========================================================================
4868// low level osd ops
4869
4870int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4871{
4872 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4873 bufferlist header, vals;
4874 int r = _get_tmap(ctx, &header, &vals);
4875 if (r < 0) {
4876 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4877 r = 0;
4878 return r;
4879 }
4880
4881 vector<OSDOp> ops(3);
4882
4883 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4884 ops[0].op.extent.offset = 0;
4885 ops[0].op.extent.length = 0;
4886
4887 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
f67539c2 4888 ops[1].indata = std::move(header);
7c673cae
FG
4889
4890 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
f67539c2 4891 ops[2].indata = std::move(vals);
7c673cae
FG
4892
4893 return do_osd_ops(ctx, ops);
4894}
4895
11fdf7f2
TL
4896int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4897 OSDOp& osd_op, bufferlist& bl)
7c673cae
FG
4898{
4899 // decode
4900 bufferlist header;
4901 map<string, bufferlist> m;
4902 if (bl.length()) {
11fdf7f2
TL
4903 auto p = bl.cbegin();
4904 decode(header, p);
4905 decode(m, p);
4906 ceph_assert(p.end());
7c673cae
FG
4907 }
4908
4909 // do the update(s)
4910 while (!bp.end()) {
4911 __u8 op;
4912 string key;
11fdf7f2 4913 decode(op, bp);
7c673cae
FG
4914
4915 switch (op) {
4916 case CEPH_OSD_TMAP_SET: // insert key
4917 {
11fdf7f2 4918 decode(key, bp);
7c673cae 4919 bufferlist data;
11fdf7f2 4920 decode(data, bp);
7c673cae
FG
4921 m[key] = data;
4922 }
4923 break;
4924 case CEPH_OSD_TMAP_RM: // remove key
11fdf7f2 4925 decode(key, bp);
7c673cae
FG
4926 if (!m.count(key)) {
4927 return -ENOENT;
4928 }
4929 m.erase(key);
4930 break;
4931 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
11fdf7f2 4932 decode(key, bp);
7c673cae
FG
4933 m.erase(key);
4934 break;
4935 case CEPH_OSD_TMAP_HDR: // update header
4936 {
11fdf7f2 4937 decode(header, bp);
7c673cae
FG
4938 }
4939 break;
4940 default:
4941 return -EINVAL;
4942 }
4943 }
4944
4945 // reencode
4946 bufferlist obl;
11fdf7f2
TL
4947 encode(header, obl);
4948 encode(m, obl);
7c673cae
FG
4949
4950 // write it out
4951 vector<OSDOp> nops(1);
4952 OSDOp& newop = nops[0];
4953 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4954 newop.op.extent.offset = 0;
4955 newop.op.extent.length = obl.length();
4956 newop.indata = obl;
4957 do_osd_ops(ctx, nops);
7c673cae
FG
4958 return 0;
4959}
4960
11fdf7f2 4961int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
7c673cae 4962{
11fdf7f2 4963 bufferlist::const_iterator orig_bp = bp;
7c673cae
FG
4964 int result = 0;
4965 if (bp.end()) {
4966 dout(10) << "tmapup is a no-op" << dendl;
4967 } else {
4968 // read the whole object
4969 vector<OSDOp> nops(1);
4970 OSDOp& newop = nops[0];
4971 newop.op.op = CEPH_OSD_OP_READ;
4972 newop.op.extent.offset = 0;
4973 newop.op.extent.length = 0;
4974 result = do_osd_ops(ctx, nops);
4975
4976 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4977
4978 dout(30) << " starting is \n";
4979 newop.outdata.hexdump(*_dout);
4980 *_dout << dendl;
4981
11fdf7f2 4982 auto ip = newop.outdata.cbegin();
7c673cae
FG
4983 bufferlist obl;
4984
4985 dout(30) << "the update command is: \n";
4986 osd_op.indata.hexdump(*_dout);
4987 *_dout << dendl;
4988
4989 // header
4990 bufferlist header;
4991 __u32 nkeys = 0;
4992 if (newop.outdata.length()) {
11fdf7f2
TL
4993 decode(header, ip);
4994 decode(nkeys, ip);
7c673cae
FG
4995 }
4996 dout(10) << "tmapup header " << header.length() << dendl;
4997
4998 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4999 ++bp;
11fdf7f2 5000 decode(header, bp);
7c673cae
FG
5001 dout(10) << "tmapup new header " << header.length() << dendl;
5002 }
5003
11fdf7f2 5004 encode(header, obl);
7c673cae
FG
5005
5006 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
5007
5008 // update keys
5009 bufferlist newkeydata;
5010 string nextkey, last_in_key;
5011 bufferlist nextval;
5012 bool have_next = false;
5013 if (!ip.end()) {
5014 have_next = true;
11fdf7f2
TL
5015 decode(nextkey, ip);
5016 decode(nextval, ip);
7c673cae
FG
5017 }
5018 while (!bp.end() && !result) {
5019 __u8 op;
5020 string key;
5021 try {
11fdf7f2
TL
5022 decode(op, bp);
5023 decode(key, bp);
7c673cae 5024 }
f67539c2 5025 catch (ceph::buffer::error& e) {
7c673cae
FG
5026 return -EINVAL;
5027 }
5028 if (key < last_in_key) {
5029 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
5030 << "', falling back to an inefficient (unsorted) update" << dendl;
5031 bp = orig_bp;
5032 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
5033 }
5034 last_in_key = key;
5035
5036 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
9f95a23c 5037
7c673cae
FG
5038 // skip existing intervening keys
5039 bool key_exists = false;
5040 while (have_next && !key_exists) {
5041 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
5042 if (nextkey > key)
5043 break;
5044 if (nextkey < key) {
5045 // copy untouched.
11fdf7f2
TL
5046 encode(nextkey, newkeydata);
5047 encode(nextval, newkeydata);
7c673cae
FG
5048 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5049 } else {
5050 // don't copy; discard old value. and stop.
5051 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
5052 key_exists = true;
5053 nkeys--;
5054 }
5055 if (!ip.end()) {
11fdf7f2
TL
5056 decode(nextkey, ip);
5057 decode(nextval, ip);
7c673cae
FG
5058 } else {
5059 have_next = false;
5060 }
5061 }
5062
5063 if (op == CEPH_OSD_TMAP_SET) {
5064 bufferlist val;
5065 try {
11fdf7f2 5066 decode(val, bp);
7c673cae 5067 }
f67539c2 5068 catch (ceph::buffer::error& e) {
7c673cae
FG
5069 return -EINVAL;
5070 }
11fdf7f2
TL
5071 encode(key, newkeydata);
5072 encode(val, newkeydata);
7c673cae
FG
5073 dout(20) << " set " << key << " " << val.length() << dendl;
5074 nkeys++;
5075 } else if (op == CEPH_OSD_TMAP_CREATE) {
5076 if (key_exists) {
5077 return -EEXIST;
5078 }
5079 bufferlist val;
5080 try {
11fdf7f2 5081 decode(val, bp);
7c673cae 5082 }
f67539c2 5083 catch (ceph::buffer::error& e) {
7c673cae
FG
5084 return -EINVAL;
5085 }
11fdf7f2
TL
5086 encode(key, newkeydata);
5087 encode(val, newkeydata);
7c673cae
FG
5088 dout(20) << " create " << key << " " << val.length() << dendl;
5089 nkeys++;
5090 } else if (op == CEPH_OSD_TMAP_RM) {
5091 // do nothing.
5092 if (!key_exists) {
5093 return -ENOENT;
5094 }
5095 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
5096 // do nothing
5097 } else {
5098 dout(10) << " invalid tmap op " << (int)op << dendl;
5099 return -EINVAL;
5100 }
5101 }
5102
5103 // copy remaining
5104 if (have_next) {
11fdf7f2
TL
5105 encode(nextkey, newkeydata);
5106 encode(nextval, newkeydata);
7c673cae
FG
5107 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5108 }
5109 if (!ip.end()) {
5110 bufferlist rest;
5111 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
5112 dout(20) << " keep trailing " << rest.length()
5113 << " at " << newkeydata.length() << dendl;
5114 newkeydata.claim_append(rest);
5115 }
5116
5117 // encode final key count + key data
5118 dout(20) << "tmapup final nkeys " << nkeys << dendl;
11fdf7f2 5119 encode(nkeys, obl);
7c673cae
FG
5120 obl.claim_append(newkeydata);
5121
5122 if (0) {
5123 dout(30) << " final is \n";
5124 obl.hexdump(*_dout);
5125 *_dout << dendl;
5126
5127 // sanity check
11fdf7f2 5128 auto tp = obl.cbegin();
7c673cae 5129 bufferlist h;
11fdf7f2 5130 decode(h, tp);
7c673cae 5131 map<string,bufferlist> d;
11fdf7f2
TL
5132 decode(d, tp);
5133 ceph_assert(tp.end());
7c673cae
FG
5134 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
5135 }
5136
5137 // write it out
5138 if (!result) {
5139 dout(20) << "tmapput write " << obl.length() << dendl;
5140 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5141 newop.op.extent.offset = 0;
5142 newop.op.extent.length = obl.length();
5143 newop.indata = obl;
5144 do_osd_ops(ctx, nops);
7c673cae
FG
5145 }
5146 }
5147 return result;
5148}
5149
11fdf7f2
TL
5150static int check_offset_and_length(uint64_t offset, uint64_t length,
5151 uint64_t max, DoutPrefixProvider *dpp)
7c673cae
FG
5152{
5153 if (offset >= max ||
5154 length > max ||
11fdf7f2
TL
5155 offset + length > max) {
5156 ldpp_dout(dpp, 10) << __func__ << " "
5157 << "osd_max_object_size: " << max
5158 << "; Hard limit of object size is 4GB." << dendl;
7c673cae 5159 return -EFBIG;
11fdf7f2 5160 }
7c673cae
FG
5161
5162 return 0;
5163}
5164
5165struct FillInVerifyExtent : public Context {
5166 ceph_le64 *r;
5167 int32_t *rval;
5168 bufferlist *outdatap;
9f95a23c 5169 std::optional<uint32_t> maybe_crc;
7c673cae
FG
5170 uint64_t size;
5171 OSDService *osd;
5172 hobject_t soid;
9f95a23c 5173 uint32_t flags;
7c673cae 5174 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
9f95a23c
TL
5175 std::optional<uint32_t> mc, uint64_t size,
5176 OSDService *osd, hobject_t soid, uint32_t flags) :
7c673cae
FG
5177 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5178 size(size), osd(osd), soid(soid), flags(flags) {}
5179 void finish(int len) override {
7c673cae 5180 *r = len;
c07f9fc5
FG
5181 if (len < 0) {
5182 *rval = len;
7c673cae 5183 return;
c07f9fc5
FG
5184 }
5185 *rval = 0;
5186
7c673cae
FG
5187 // whole object? can we verify the checksum?
5188 if (maybe_crc && *r == size) {
5189 uint32_t crc = outdatap->crc32c(-1);
5190 if (maybe_crc != crc) {
5191 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5192 << " != expected 0x" << *maybe_crc
5193 << std::dec << " on " << soid;
5194 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5195 *rval = -EIO;
5196 *r = 0;
5197 }
5198 }
5199 }
5200 }
5201};
5202
5203struct ToSparseReadResult : public Context {
c07f9fc5
FG
5204 int* result;
5205 bufferlist* data_bl;
7c673cae 5206 uint64_t data_offset;
c07f9fc5
FG
5207 ceph_le64* len;
5208 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5209 ceph_le64* len)
5210 : result(result), data_bl(bl), data_offset(offset),len(len) {}
7c673cae 5211 void finish(int r) override {
c07f9fc5
FG
5212 if (r < 0) {
5213 *result = r;
5214 return;
5215 }
5216 *result = 0;
5217 *len = r;
7c673cae
FG
5218 bufferlist outdata;
5219 map<uint64_t, uint64_t> extents = {{data_offset, r}};
11fdf7f2 5220 encode(extents, outdata);
f67539c2 5221 encode_destructively(*data_bl, outdata);
c07f9fc5 5222 data_bl->swap(outdata);
7c673cae
FG
5223 }
5224};
5225
// Render the keys of a string-keyed map as a comma-separated list,
// in the map's (sorted) iteration order.
template<typename V>
static std::string list_keys(const std::map<std::string, V>& m) {
  std::string s;
  for (const auto& entry : m) {
    if (!s.empty()) {
      s.push_back(',');
    }
    s.append(entry.first);
  }
  return s;
}
5237
// Render the elements of any container of strings as a comma-separated
// list, in iteration order.
template<typename T>
static std::string list_entries(const T& m) {
  std::string s;
  for (const auto& entry : m) {
    if (!s.empty()) {
      s.push_back(',');
    }
    s.append(entry);
  }
  return s;
}
5249
5250void PrimaryLogPG::maybe_create_new_object(
5251 OpContext *ctx,
5252 bool ignore_transaction)
5253{
5254 ObjectState& obs = ctx->new_obs;
5255 if (!obs.exists) {
5256 ctx->delta_stats.num_objects++;
5257 obs.exists = true;
11fdf7f2 5258 ceph_assert(!obs.oi.is_whiteout());
7c673cae
FG
5259 obs.oi.new_object();
5260 if (!ignore_transaction)
5261 ctx->op_t->create(obs.oi.soid);
5262 } else if (obs.oi.is_whiteout()) {
5263 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5264 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5265 --ctx->delta_stats.num_whiteouts;
5266 }
5267}
5268
c07f9fc5
FG
5269struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5270 OSDOp& osd_op;
5271
11fdf7f2 5272 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
c07f9fc5
FG
5273 }
5274
5275 int execute() override {
5276 return osd_op.rval;
5277 }
5278};
5279
7c673cae
FG
5280struct C_ChecksumRead : public Context {
5281 PrimaryLogPG *primary_log_pg;
5282 OSDOp &osd_op;
5283 Checksummer::CSumType csum_type;
5284 bufferlist init_value_bl;
5285 ceph_le64 read_length;
5286 bufferlist read_bl;
5287 Context *fill_extent_ctx;
5288
5289 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5290 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
9f95a23c
TL
5291 std::optional<uint32_t> maybe_crc, uint64_t size,
5292 OSDService *osd, hobject_t soid, uint32_t flags)
7c673cae
FG
5293 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5294 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5295 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5296 &read_bl, maybe_crc, size,
5297 osd, soid, flags)) {
5298 }
c07f9fc5
FG
5299 ~C_ChecksumRead() override {
5300 delete fill_extent_ctx;
5301 }
7c673cae
FG
5302
5303 void finish(int r) override {
5304 fill_extent_ctx->complete(r);
c07f9fc5 5305 fill_extent_ctx = nullptr;
7c673cae
FG
5306
5307 if (osd_op.rval >= 0) {
11fdf7f2 5308 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
7c673cae 5309 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
c07f9fc5 5310 &init_value_bl_it, read_bl);
7c673cae
FG
5311 }
5312 }
5313};
5314
5315int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
11fdf7f2 5316 bufferlist::const_iterator *bl_it)
7c673cae
FG
5317{
5318 dout(20) << __func__ << dendl;
5319
5320 auto& op = osd_op.op;
5321 if (op.checksum.chunk_size > 0) {
5322 if (op.checksum.length == 0) {
5323 dout(10) << __func__ << ": length required when chunk size provided"
5324 << dendl;
5325 return -EINVAL;
5326 }
5327 if (op.checksum.length % op.checksum.chunk_size != 0) {
5328 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5329 return -EINVAL;
5330 }
5331 }
5332
5333 auto& oi = ctx->new_obs.oi;
5334 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5335 // zeroed offset+length implies checksum whole object
5336 op.checksum.length = oi.size;
11fdf7f2
TL
5337 } else if (op.checksum.offset >= oi.size) {
5338 // read size was trimmed to zero, do nothing
5339 // see PrimaryLogPG::do_read
5340 return 0;
5341 } else if (op.extent.offset + op.extent.length > oi.size) {
5342 op.extent.length = oi.size - op.extent.offset;
5343 if (op.checksum.chunk_size > 0 &&
5344 op.checksum.length % op.checksum.chunk_size != 0) {
5345 dout(10) << __func__ << ": length (trimmed to 0x"
5346 << std::hex << op.checksum.length
5347 << ") not aligned to chunk size 0x"
5348 << op.checksum.chunk_size << std::dec
5349 << dendl;
5350 return -EINVAL;
5351 }
7c673cae
FG
5352 }
5353
5354 Checksummer::CSumType csum_type;
5355 switch (op.checksum.type) {
5356 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5357 csum_type = Checksummer::CSUM_XXHASH32;
5358 break;
5359 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5360 csum_type = Checksummer::CSUM_XXHASH64;
5361 break;
5362 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5363 csum_type = Checksummer::CSUM_CRC32C;
5364 break;
5365 default:
5366 dout(10) << __func__ << ": unknown crc type ("
5367 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5368 return -EINVAL;
5369 }
5370
5371 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5372 if (bl_it->get_remaining() < csum_init_value_size) {
5373 dout(10) << __func__ << ": init value not provided" << dendl;
5374 return -EINVAL;
5375 }
5376
5377 bufferlist init_value_bl;
5378 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5379 csum_init_value_size);
9f95a23c 5380 *bl_it += csum_init_value_size;
7c673cae 5381
11fdf7f2 5382 if (pool.info.is_erasure() && op.checksum.length > 0) {
7c673cae
FG
5383 // If there is a data digest and it is possible we are reading
5384 // entire object, pass the digest.
9f95a23c 5385 std::optional<uint32_t> maybe_crc;
11fdf7f2 5386 if (oi.is_data_digest() && op.checksum.offset == 0 &&
7c673cae
FG
5387 op.checksum.length >= oi.size) {
5388 maybe_crc = oi.data_digest;
5389 }
5390
5391 // async read
5392 auto& soid = oi.soid;
5393 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5394 std::move(init_value_bl), maybe_crc,
5395 oi.size, osd, soid, op.flags);
c07f9fc5 5396
7c673cae
FG
5397 ctx->pending_async_reads.push_back({
5398 {op.checksum.offset, op.checksum.length, op.flags},
5399 {&checksum_ctx->read_bl, checksum_ctx}});
5400
5401 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
c07f9fc5
FG
5402 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5403 new ReadFinisher(osd_op));
5404 return -EINPROGRESS;
7c673cae
FG
5405 }
5406
5407 // sync read
7c673cae
FG
5408 std::vector<OSDOp> read_ops(1);
5409 auto& read_op = read_ops[0];
5410 if (op.checksum.length > 0) {
5411 read_op.op.op = CEPH_OSD_OP_READ;
5412 read_op.op.flags = op.flags;
5413 read_op.op.extent.offset = op.checksum.offset;
5414 read_op.op.extent.length = op.checksum.length;
5415 read_op.op.extent.truncate_size = 0;
5416 read_op.op.extent.truncate_seq = 0;
5417
5418 int r = do_osd_ops(ctx, read_ops);
5419 if (r < 0) {
5420 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5421 return r;
5422 }
5423 }
5424
11fdf7f2 5425 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
7c673cae
FG
5426 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5427 read_op.outdata);
5428}
5429
5430int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5431 Checksummer::CSumType csum_type,
11fdf7f2 5432 bufferlist::const_iterator *init_value_bl_it,
7c673cae
FG
5433 const bufferlist &read_bl) {
5434 dout(20) << __func__ << dendl;
5435
5436 auto& op = osd_op.op;
5437
5438 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5439 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5440 << op.checksum.length << dendl;
5441 return -EINVAL;
5442 }
5443
5444 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5445 op.checksum.chunk_size : read_bl.length());
5446 uint32_t csum_count = (csum_chunk_size > 0 ?
5447 read_bl.length() / csum_chunk_size : 0);
5448
5449 bufferlist csum;
5450 bufferptr csum_data;
5451 if (csum_count > 0) {
5452 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
f67539c2 5453 csum_data = ceph::buffer::create(csum_value_size * csum_count);
7c673cae
FG
5454 csum_data.zero();
5455 csum.append(csum_data);
5456
5457 switch (csum_type) {
5458 case Checksummer::CSUM_XXHASH32:
5459 {
5460 Checksummer::xxhash32::init_value_t init_value;
11fdf7f2 5461 decode(init_value, *init_value_bl_it);
7c673cae
FG
5462 Checksummer::calculate<Checksummer::xxhash32>(
5463 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5464 &csum_data);
5465 }
5466 break;
5467 case Checksummer::CSUM_XXHASH64:
5468 {
5469 Checksummer::xxhash64::init_value_t init_value;
11fdf7f2 5470 decode(init_value, *init_value_bl_it);
7c673cae
FG
5471 Checksummer::calculate<Checksummer::xxhash64>(
5472 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5473 &csum_data);
5474 }
5475 break;
5476 case Checksummer::CSUM_CRC32C:
5477 {
5478 Checksummer::crc32c::init_value_t init_value;
11fdf7f2 5479 decode(init_value, *init_value_bl_it);
7c673cae
FG
5480 Checksummer::calculate<Checksummer::crc32c>(
5481 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5482 &csum_data);
5483 }
5484 break;
5485 default:
5486 break;
5487 }
5488 }
5489
11fdf7f2 5490 encode(csum_count, osd_op.outdata);
7c673cae
FG
5491 osd_op.outdata.claim_append(csum);
5492 return 0;
5493}
5494
c07f9fc5
FG
5495struct C_ExtentCmpRead : public Context {
5496 PrimaryLogPG *primary_log_pg;
5497 OSDOp &osd_op;
11fdf7f2 5498 ceph_le64 read_length{};
c07f9fc5
FG
5499 bufferlist read_bl;
5500 Context *fill_extent_ctx;
5501
5502 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
9f95a23c
TL
5503 std::optional<uint32_t> maybe_crc, uint64_t size,
5504 OSDService *osd, hobject_t soid, uint32_t flags)
c07f9fc5
FG
5505 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5506 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5507 &read_bl, maybe_crc, size,
5508 osd, soid, flags)) {
5509 }
5510 ~C_ExtentCmpRead() override {
5511 delete fill_extent_ctx;
5512 }
5513
5514 void finish(int r) override {
5515 if (r == -ENOENT) {
5516 osd_op.rval = 0;
5517 read_bl.clear();
5518 delete fill_extent_ctx;
5519 } else {
5520 fill_extent_ctx->complete(r);
5521 }
5522 fill_extent_ctx = nullptr;
5523
5524 if (osd_op.rval >= 0) {
5525 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5526 }
5527 }
5528};
5529
5530int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5531{
5532 dout(20) << __func__ << dendl;
5533 ceph_osd_op& op = osd_op.op;
5534
3efd9988
FG
5535 auto& oi = ctx->new_obs.oi;
5536 uint64_t size = oi.size;
5537 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5538 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5539 size = op.extent.truncate_size;
5540 }
5541
5542 if (op.extent.offset >= size) {
5543 op.extent.length = 0;
5544 } else if (op.extent.offset + op.extent.length > size) {
5545 op.extent.length = size - op.extent.offset;
5546 }
5547
5548 if (op.extent.length == 0) {
5549 dout(20) << __func__ << " zero length extent" << dendl;
5550 return finish_extent_cmp(osd_op, bufferlist{});
5551 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
c07f9fc5
FG
5552 dout(20) << __func__ << " object DNE" << dendl;
5553 return finish_extent_cmp(osd_op, {});
11fdf7f2 5554 } else if (pool.info.is_erasure()) {
c07f9fc5
FG
5555 // If there is a data digest and it is possible we are reading
5556 // entire object, pass the digest.
9f95a23c 5557 std::optional<uint32_t> maybe_crc;
11fdf7f2 5558 if (oi.is_data_digest() && op.checksum.offset == 0 &&
c07f9fc5
FG
5559 op.checksum.length >= oi.size) {
5560 maybe_crc = oi.data_digest;
5561 }
5562
5563 // async read
5564 auto& soid = oi.soid;
5565 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5566 osd, soid, op.flags);
5567 ctx->pending_async_reads.push_back({
5568 {op.extent.offset, op.extent.length, op.flags},
5569 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5570
5571 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5572
5573 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5574 new ReadFinisher(osd_op));
5575 return -EINPROGRESS;
5576 }
5577
5578 // sync read
5579 vector<OSDOp> read_ops(1);
5580 OSDOp& read_op = read_ops[0];
5581
5582 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5583 read_op.op.extent.offset = op.extent.offset;
5584 read_op.op.extent.length = op.extent.length;
5585 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5586 read_op.op.extent.truncate_size = op.extent.truncate_size;
5587
5588 int result = do_osd_ops(ctx, read_ops);
5589 if (result < 0) {
5590 derr << __func__ << " failed " << result << dendl;
5591 return result;
5592 }
5593 return finish_extent_cmp(osd_op, read_op.outdata);
5594}
5595
5596int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5597{
5598 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5599 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5600 if (osd_op.indata[idx] != read_byte) {
5601 return (-MAX_ERRNO - idx);
5602 }
5603 }
5604
5605 return 0;
5606}
5607
5608int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5609 dout(20) << __func__ << dendl;
5610 auto& op = osd_op.op;
5611 auto& oi = ctx->new_obs.oi;
5612 auto& soid = oi.soid;
5613 __u32 seq = oi.truncate_seq;
5614 uint64_t size = oi.size;
5615 bool trimmed_read = false;
5616
91327a77
AA
5617 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5618 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5619 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5620 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5621
c07f9fc5
FG
5622 // are we beyond truncate_size?
5623 if ( (seq < op.extent.truncate_seq) &&
91327a77
AA
5624 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5625 (size > op.extent.truncate_size) )
c07f9fc5
FG
5626 size = op.extent.truncate_size;
5627
5628 if (op.extent.length == 0) //length is zero mean read the whole object
5629 op.extent.length = size;
5630
5631 if (op.extent.offset >= size) {
5632 op.extent.length = 0;
5633 trimmed_read = true;
5634 } else if (op.extent.offset + op.extent.length > size) {
5635 op.extent.length = size - op.extent.offset;
5636 trimmed_read = true;
5637 }
5638
91327a77
AA
5639 dout(30) << __func__ << "op.extent.length is now " << op.extent.length << dendl;
5640
c07f9fc5
FG
5641 // read into a buffer
5642 int result = 0;
5643 if (trimmed_read && op.extent.length == 0) {
5644 // read size was trimmed to zero and it is expected to do nothing
5645 // a read operation of 0 bytes does *not* do nothing, this is why
5646 // the trimmed_read boolean is needed
11fdf7f2
TL
5647 } else if (pool.info.is_erasure()) {
5648 // The initialisation below is required to silence a false positive
5649 // -Wmaybe-uninitialized warning
9f95a23c 5650 std::optional<uint32_t> maybe_crc;
c07f9fc5
FG
5651 // If there is a data digest and it is possible we are reading
5652 // entire object, pass the digest. FillInVerifyExtent will
5653 // will check the oi.size again.
11fdf7f2 5654 if (oi.is_data_digest() && op.extent.offset == 0 &&
c07f9fc5
FG
5655 op.extent.length >= oi.size)
5656 maybe_crc = oi.data_digest;
5657 ctx->pending_async_reads.push_back(
5658 make_pair(
5659 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5660 make_pair(&osd_op.outdata,
5661 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5662 &osd_op.outdata, maybe_crc, oi.size,
5663 osd, soid, op.flags))));
5664 dout(10) << " async_read noted for " << soid << dendl;
5665
5666 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5667 new ReadFinisher(osd_op));
5668 } else {
5669 int r = pgbackend->objects_read_sync(
5670 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
1adf2230 5671 // whole object? can we verify the checksum?
11fdf7f2 5672 if (r >= 0 && op.extent.offset == 0 &&
1adf2230
AA
5673 (uint64_t)r == oi.size && oi.is_data_digest()) {
5674 uint32_t crc = osd_op.outdata.crc32c(-1);
5675 if (oi.data_digest != crc) {
5676 osd->clog->error() << info.pgid << std::hex
5677 << " full-object read crc 0x" << crc
5678 << " != expected 0x" << oi.data_digest
5679 << std::dec << " on " << soid;
5680 r = -EIO; // try repair later
5681 }
5682 }
c07f9fc5 5683 if (r == -EIO) {
11fdf7f2 5684 r = rep_repair_primary_object(soid, ctx);
c07f9fc5
FG
5685 }
5686 if (r >= 0)
5687 op.extent.length = r;
a8e16298 5688 else if (r == -EAGAIN) {
11fdf7f2 5689 result = -EAGAIN;
a8e16298 5690 } else {
c07f9fc5
FG
5691 result = r;
5692 op.extent.length = 0;
5693 }
5694 dout(10) << " read got " << r << " / " << op.extent.length
5695 << " bytes from obj " << soid << dendl;
c07f9fc5 5696 }
11fdf7f2
TL
5697 if (result >= 0) {
5698 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5699 ctx->delta_stats.num_rd++;
5700 }
c07f9fc5
FG
5701 return result;
5702}
5703
/**
 * Handle CEPH_OSD_OP_SPARSE_READ: return the set of allocated extents of an
 * object together with the data for those extents.
 *
 * @param ctx     per-op context; delta_stats, pending_async_reads and
 *                op_finishers may be updated
 * @param osd_op  the sub-op being executed; outdata receives the encoded
 *                extent map followed by the extent data
 * @return 0 on success (async path returns 0 after queueing the read),
 *         -EINVAL if a truncate sequence was supplied, or a negative errno
 *         from the underlying fiemap/read/repair calls
 */
int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;

  // Sparse read has no truncate-seq handling; reject rather than silently
  // ignore the client-supplied sequence.
  if (op.extent.truncate_seq) {
    dout(0) << "sparse_read does not support truncation sequence " << dendl;
    return -EINVAL;
  }

  ++ctx->num_read;
  if (pool.info.is_erasure()) {
    // translate sparse read to a normal one if not supported
    uint64_t offset = op.extent.offset;
    uint64_t length = op.extent.length;
    // Clamp the requested range to the object's current size.
    if (offset > oi.size) {
      length = 0;
    } else if (offset + length > oi.size) {
      length = oi.size - offset;
    }

    if (length > 0) {
      // Queue an asynchronous full read of the clamped range;
      // ToSparseReadResult converts the flat read result into the
      // sparse-read wire format (extent map + data) on completion.
      ctx->pending_async_reads.push_back(
	make_pair(
	  boost::make_tuple(offset, length, op.flags),
	  make_pair(
	    &osd_op.outdata,
	    new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
				   &op.extent.length))));
      dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;

      // Re-entry of this sub-op after the async read completes is routed
      // through the ReadFinisher instead of re-running this function.
      ctx->op_finishers[ctx->current_osd_subop_num].reset(
	new ReadFinisher(osd_op));
    } else {
      // Nothing to read: reply with an empty extent map.
      dout(10) << " sparse read ended up empty for " << soid << dendl;
      map<uint64_t, uint64_t> extents;
      encode(extents, osd_op.outdata);
    }
  } else {
    // Replicated pool: synchronous path.
    // First ask the object store which extents are allocated...
    map<uint64_t, uint64_t> m;
    int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
					      info.pgid.shard),
			       op.extent.offset, op.extent.length, m);
    if (r < 0) {
      return r;
    }

    // ...then read exactly those extents.
    // NOTE(review): m is passed as an rvalue but is used again below; the
    // backend appears to borrow (and possibly trim) the map rather than
    // consume it — confirmed by the "re-encode" comment further down.
    bufferlist data_bl;
    r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
    if (r == -EIO) {
      // Read error on the primary: try to repair from a replica.
      r = rep_repair_primary_object(soid, ctx);
    }
    if (r < 0) {
      return r;
    }

    // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
    // and over time more and more objects become fully written, so verifying
    // the data digest on full-object sparse reads is worthwhile.
    if ((uint64_t)r == oi.size && oi.is_data_digest()) {
      uint32_t crc = data_bl.crc32c(-1);
      if (oi.data_digest != crc) {
	osd->clog->error() << info.pgid << std::hex
			   << " full-object read crc 0x" << crc
			   << " != expected 0x" << oi.data_digest
			   << std::dec << " on " << soid;
	r = rep_repair_primary_object(soid, ctx);
	if (r < 0) {
	  return r;
	}
      }
    }

    // Report the number of bytes actually read back to the client.
    op.extent.length = r;

    encode(m, osd_op.outdata); // re-encode since it might be modified
    ::encode_destructively(data_bl, osd_op.outdata);

    dout(10) << " sparse_read got " << r << " bytes from object "
	     << soid << dendl;
  }

  // Account the read in PG stats (kb rounded up).
  ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
  ctx->delta_stats.num_rd++;
  return 0;
}
5793
7c673cae
FG
5794int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5795{
5796 int result = 0;
5797 SnapSetContext *ssc = ctx->obc->ssc;
5798 ObjectState& obs = ctx->new_obs;
5799 object_info_t& oi = obs.oi;
5800 const hobject_t& soid = oi.soid;
11fdf7f2
TL
5801 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5802 osd->osd_skip_data_digest;
7c673cae 5803
7c673cae
FG
5804 PGTransaction* t = ctx->op_t.get();
5805
5806 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
f67539c2
TL
5807#ifdef HAVE_JAEGER
5808 if (ctx->op->osd_parent_span) {
5809 auto do_osd_op_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
5810 }
5811#endif
7c673cae 5812
c07f9fc5 5813 ctx->current_osd_subop_num = 0;
b32b8144 5814 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
7c673cae
FG
5815 OSDOp& osd_op = *p;
5816 ceph_osd_op& op = osd_op.op;
5817
c07f9fc5
FG
5818 OpFinisher* op_finisher = nullptr;
5819 {
5820 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5821 if (op_finisher_it != ctx->op_finishers.end()) {
5822 op_finisher = op_finisher_it->second.get();
5823 }
5824 }
5825
9f95a23c 5826 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
7c673cae
FG
5827 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5828 // but the code in this function seems to treat them as native-endian. What should the
5829 // tracepoints do?
5830 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5831
5832 dout(10) << "do_osd_op " << osd_op << dendl;
5833
11fdf7f2 5834 auto bp = osd_op.indata.cbegin();
7c673cae
FG
5835
5836 // user-visible modifcation?
5837 switch (op.op) {
5838 // non user-visible modifications
5839 case CEPH_OSD_OP_WATCH:
5840 case CEPH_OSD_OP_CACHE_EVICT:
5841 case CEPH_OSD_OP_CACHE_FLUSH:
5842 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5843 case CEPH_OSD_OP_UNDIRTY:
5844 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
9f95a23c 5845 case CEPH_OSD_OP_COPY_FROM2:
7c673cae
FG
5846 case CEPH_OSD_OP_CACHE_PIN:
5847 case CEPH_OSD_OP_CACHE_UNPIN:
31f18b77 5848 case CEPH_OSD_OP_SET_REDIRECT:
f67539c2 5849 case CEPH_OSD_OP_SET_CHUNK:
11fdf7f2 5850 case CEPH_OSD_OP_TIER_PROMOTE:
9f95a23c 5851 case CEPH_OSD_OP_TIER_FLUSH:
f67539c2 5852 case CEPH_OSD_OP_TIER_EVICT:
7c673cae
FG
5853 break;
5854 default:
5855 if (op.op & CEPH_OSD_OP_MODE_WR)
5856 ctx->user_modify = true;
5857 }
5858
5859 // munge -1 truncate to 0 truncate
5860 if (ceph_osd_op_uses_extent(op.op) &&
5861 op.extent.truncate_seq == 1 &&
5862 op.extent.truncate_size == (-1ULL)) {
5863 op.extent.truncate_size = 0;
5864 op.extent.truncate_seq = 0;
5865 }
5866
5867 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5868 if (op.op == CEPH_OSD_OP_ZERO &&
11fdf7f2
TL
5869 obs.exists &&
5870 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
5871 op.extent.length >= 1 &&
5872 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
7c673cae
FG
5873 op.extent.offset + op.extent.length >= oi.size) {
5874 if (op.extent.offset >= oi.size) {
5875 // no-op
5876 goto fail;
5877 }
5878 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5879 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5880 op.op = CEPH_OSD_OP_TRUNCATE;
5881 }
5882
5883 switch (op.op) {
f67539c2 5884
7c673cae
FG
5885 // --- READS ---
5886
5887 case CEPH_OSD_OP_CMPEXT:
5888 ++ctx->num_read;
c07f9fc5
FG
5889 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5890 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5891 op.extent.length, op.extent.truncate_size,
5892 op.extent.truncate_seq);
5893
5894 if (op_finisher == nullptr) {
5895 result = do_extent_cmp(ctx, osd_op);
5896 } else {
5897 result = op_finisher->execute();
5898 }
7c673cae
FG
5899 break;
5900
5901 case CEPH_OSD_OP_SYNC_READ:
11fdf7f2 5902 if (pool.info.is_erasure()) {
7c673cae
FG
5903 result = -EOPNOTSUPP;
5904 break;
5905 }
5906 // fall through
5907 case CEPH_OSD_OP_READ:
5908 ++ctx->num_read;
c07f9fc5
FG
5909 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5910 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5911 op.extent.length, op.extent.truncate_size,
5912 op.extent.truncate_seq);
5913 if (op_finisher == nullptr) {
5914 if (!ctx->data_off) {
7c673cae
FG
5915 ctx->data_off = op.extent.offset;
5916 }
c07f9fc5
FG
5917 result = do_read(ctx, osd_op);
5918 } else {
5919 result = op_finisher->execute();
7c673cae
FG
5920 }
5921 break;
5922
5923 case CEPH_OSD_OP_CHECKSUM:
5924 ++ctx->num_read;
5925 {
5926 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5927 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5928 op.checksum.offset, op.checksum.length,
5929 op.checksum.chunk_size);
5930
c07f9fc5
FG
5931 if (op_finisher == nullptr) {
5932 result = do_checksum(ctx, osd_op, &bp);
5933 } else {
5934 result = op_finisher->execute();
7c673cae
FG
5935 }
5936 }
5937 break;
5938
5939 /* map extents */
5940 case CEPH_OSD_OP_MAPEXT:
5941 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
11fdf7f2 5942 if (pool.info.is_erasure()) {
7c673cae
FG
5943 result = -EOPNOTSUPP;
5944 break;
5945 }
5946 ++ctx->num_read;
5947 {
5948 // read into a buffer
5949 bufferlist bl;
5950 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5951 info.pgid.shard),
5952 op.extent.offset, op.extent.length, bl);
f67539c2 5953 osd_op.outdata = std::move(bl);
7c673cae
FG
5954 if (r < 0)
5955 result = r;
5956 else
11fdf7f2 5957 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
7c673cae
FG
5958 ctx->delta_stats.num_rd++;
5959 dout(10) << " map_extents done on object " << soid << dendl;
5960 }
5961 break;
5962
5963 /* map extents */
5964 case CEPH_OSD_OP_SPARSE_READ:
c07f9fc5
FG
5965 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5966 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5967 op.extent.length, op.extent.truncate_size,
5968 op.extent.truncate_seq);
5969 if (op_finisher == nullptr) {
5970 result = do_sparse_read(ctx, osd_op);
7c673cae 5971 } else {
c07f9fc5 5972 result = op_finisher->execute();
7c673cae 5973 }
7c673cae
FG
5974 break;
5975
5976 case CEPH_OSD_OP_CALL:
5977 {
5978 string cname, mname;
5979 bufferlist indata;
5980 try {
5981 bp.copy(op.cls.class_len, cname);
5982 bp.copy(op.cls.method_len, mname);
5983 bp.copy(op.cls.indata_len, indata);
f67539c2 5984 } catch (ceph::buffer::error& e) {
7c673cae
FG
5985 dout(10) << "call unable to decode class + method + indata" << dendl;
5986 dout(30) << "in dump: ";
5987 osd_op.indata.hexdump(*_dout);
5988 *_dout << dendl;
5989 result = -EINVAL;
5990 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5991 break;
5992 }
5993 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5994
5995 ClassHandler::ClassData *cls;
9f95a23c 5996 result = ClassHandler::get_instance().open_class(cname, &cls);
11fdf7f2 5997 ceph_assert(result == 0); // init_op_flags() already verified this works.
7c673cae 5998
9f95a23c 5999 ClassHandler::ClassMethod *method = cls->get_method(mname);
7c673cae
FG
6000 if (!method) {
6001 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
6002 result = -EOPNOTSUPP;
6003 break;
6004 }
6005
6006 int flags = method->get_flags();
6007 if (flags & CLS_METHOD_WR)
6008 ctx->user_modify = true;
6009
6010 bufferlist outdata;
6011 dout(10) << "call method " << cname << "." << mname << dendl;
6012 int prev_rd = ctx->num_read;
6013 int prev_wr = ctx->num_write;
6014 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
6015
6016 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
6017 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
6018 result = -EIO;
6019 break;
6020 }
6021 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
6022 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
6023 result = -EIO;
6024 break;
6025 }
6026
6027 dout(10) << "method called response length=" << outdata.length() << dendl;
6028 op.extent.length = outdata.length();
6029 osd_op.outdata.claim_append(outdata);
6030 dout(30) << "out dump: ";
6031 osd_op.outdata.hexdump(*_dout);
6032 *_dout << dendl;
6033 }
6034 break;
6035
6036 case CEPH_OSD_OP_STAT:
6037 // note: stat does not require RD
6038 {
6039 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6040
6041 if (obs.exists && !oi.is_whiteout()) {
11fdf7f2
TL
6042 encode(oi.size, osd_op.outdata);
6043 encode(oi.mtime, osd_op.outdata);
7c673cae
FG
6044 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6045 } else {
6046 result = -ENOENT;
6047 dout(10) << "stat oi object does not exist" << dendl;
6048 }
6049
6050 ctx->delta_stats.num_rd++;
6051 }
6052 break;
6053
6054 case CEPH_OSD_OP_ISDIRTY:
6055 ++ctx->num_read;
6056 {
6057 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6058 bool is_dirty = obs.oi.is_dirty();
11fdf7f2 6059 encode(is_dirty, osd_op.outdata);
7c673cae
FG
6060 ctx->delta_stats.num_rd++;
6061 result = 0;
6062 }
6063 break;
6064
6065 case CEPH_OSD_OP_UNDIRTY:
6066 ++ctx->num_write;
9f95a23c 6067 result = 0;
7c673cae
FG
6068 {
6069 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6070 if (oi.is_dirty()) {
6071 ctx->undirty = true; // see make_writeable()
6072 ctx->modify = true;
6073 ctx->delta_stats.num_wr++;
6074 }
7c673cae
FG
6075 }
6076 break;
6077
6078 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6079 ++ctx->num_write;
9f95a23c 6080 result = 0;
7c673cae
FG
6081 {
6082 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
9f95a23c 6083 if (ctx->lock_type != RWState::RWNONE) {
7c673cae
FG
6084 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6085 result = -EINVAL;
6086 break;
6087 }
f67539c2 6088 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
7c673cae
FG
6089 result = -EINVAL;
6090 break;
6091 }
6092 if (!obs.exists) {
6093 result = 0;
6094 break;
6095 }
6096 if (oi.is_cache_pinned()) {
6097 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
6098 result = -EPERM;
6099 break;
6100 }
6101 if (oi.is_dirty()) {
9f95a23c 6102 result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
7c673cae
FG
6103 if (result == -EINPROGRESS)
6104 result = -EAGAIN;
6105 } else {
6106 result = 0;
6107 }
6108 }
6109 break;
6110
6111 case CEPH_OSD_OP_CACHE_FLUSH:
6112 ++ctx->num_write;
9f95a23c 6113 result = 0;
7c673cae
FG
6114 {
6115 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
9f95a23c 6116 if (ctx->lock_type == RWState::RWNONE) {
7c673cae
FG
6117 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6118 result = -EINVAL;
6119 break;
6120 }
f67539c2 6121 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
7c673cae
FG
6122 result = -EINVAL;
6123 break;
6124 }
6125 if (!obs.exists) {
6126 result = 0;
6127 break;
6128 }
6129 if (oi.is_cache_pinned()) {
6130 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
6131 result = -EPERM;
6132 break;
6133 }
6134 hobject_t missing;
6135 if (oi.is_dirty()) {
9f95a23c 6136 result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
7c673cae
FG
6137 if (result == -EINPROGRESS)
6138 result = -EAGAIN;
6139 } else {
6140 result = 0;
6141 }
6142 // Check special return value which has set missing_return
6143 if (result == -ENOENT) {
6144 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
11fdf7f2 6145 ceph_assert(!missing.is_min());
7c673cae
FG
6146 wait_for_unreadable_object(missing, ctx->op);
6147 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6148 result = -EAGAIN;
6149 }
6150 }
6151 break;
6152
6153 case CEPH_OSD_OP_CACHE_EVICT:
6154 ++ctx->num_write;
9f95a23c 6155 result = 0;
7c673cae
FG
6156 {
6157 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
f67539c2 6158 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
7c673cae
FG
6159 result = -EINVAL;
6160 break;
6161 }
6162 if (!obs.exists) {
6163 result = 0;
6164 break;
6165 }
6166 if (oi.is_cache_pinned()) {
6167 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
6168 result = -EPERM;
6169 break;
6170 }
6171 if (oi.is_dirty()) {
6172 result = -EBUSY;
6173 break;
6174 }
6175 if (!oi.watchers.empty()) {
6176 result = -EBUSY;
6177 break;
6178 }
6179 if (soid.snap == CEPH_NOSNAP) {
6180 result = _verify_no_head_clones(soid, ssc->snapset);
6181 if (result < 0)
6182 break;
6183 }
6184 result = _delete_oid(ctx, true, false);
6185 if (result >= 0) {
6186 // mark that this is a cache eviction to avoid triggering normal
11fdf7f2 6187 // make_writeable() clone creation in finish_ctx()
f67539c2 6188 ctx->cache_operation = true;
7c673cae
FG
6189 }
6190 osd->logger->inc(l_osd_tier_evict);
6191 }
6192 break;
6193
6194 case CEPH_OSD_OP_GETXATTR:
6195 ++ctx->num_read;
6196 {
6197 string aname;
6198 bp.copy(op.xattr.name_len, aname);
6199 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6200 string name = "_" + aname;
6201 int r = getattr_maybe_cache(
6202 ctx->obc,
6203 name,
6204 &(osd_op.outdata));
6205 if (r >= 0) {
6206 op.xattr.value_len = osd_op.outdata.length();
6207 result = 0;
11fdf7f2 6208 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
6209 } else
6210 result = r;
6211
6212 ctx->delta_stats.num_rd++;
6213 }
6214 break;
6215
6216 case CEPH_OSD_OP_GETXATTRS:
6217 ++ctx->num_read;
6218 {
6219 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6220 map<string, bufferlist> out;
6221 result = getattrs_maybe_cache(
6222 ctx->obc,
b32b8144 6223 &out);
f67539c2 6224
7c673cae 6225 bufferlist bl;
11fdf7f2
TL
6226 encode(out, bl);
6227 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
7c673cae
FG
6228 ctx->delta_stats.num_rd++;
6229 osd_op.outdata.claim_append(bl);
6230 }
6231 break;
f67539c2 6232
7c673cae
FG
6233 case CEPH_OSD_OP_CMPXATTR:
6234 ++ctx->num_read;
6235 {
6236 string aname;
6237 bp.copy(op.xattr.name_len, aname);
6238 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6239 string name = "_" + aname;
6240 name[op.xattr.name_len + 1] = 0;
f67539c2 6241
7c673cae
FG
6242 bufferlist xattr;
6243 result = getattr_maybe_cache(
6244 ctx->obc,
6245 name,
6246 &xattr);
6247 if (result < 0 && result != -EEXIST && result != -ENODATA)
6248 break;
f67539c2 6249
7c673cae 6250 ctx->delta_stats.num_rd++;
11fdf7f2 6251 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
7c673cae
FG
6252
6253 switch (op.xattr.cmp_mode) {
6254 case CEPH_OSD_CMPXATTR_MODE_STRING:
6255 {
6256 string val;
6257 bp.copy(op.xattr.value_len, val);
6258 val[op.xattr.value_len] = 0;
6259 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6260 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6261 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6262 }
6263 break;
6264
6265 case CEPH_OSD_CMPXATTR_MODE_U64:
6266 {
6267 uint64_t u64val;
6268 try {
11fdf7f2 6269 decode(u64val, bp);
7c673cae 6270 }
f67539c2 6271 catch (ceph::buffer::error& e) {
7c673cae
FG
6272 result = -EINVAL;
6273 goto fail;
6274 }
6275 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6276 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6277 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6278 }
6279 break;
6280
6281 default:
6282 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6283 result = -EINVAL;
6284 }
6285
6286 if (!result) {
6287 dout(10) << "comparison returned false" << dendl;
6288 result = -ECANCELED;
6289 break;
6290 }
6291 if (result < 0) {
6292 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6293 break;
6294 }
6295
6296 dout(10) << "comparison returned true" << dendl;
6297 }
6298 break;
6299
6300 case CEPH_OSD_OP_ASSERT_VER:
6301 ++ctx->num_read;
6302 {
6303 uint64_t ver = op.assert_ver.ver;
6304 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6305 if (!ver)
6306 result = -EINVAL;
6307 else if (ver < oi.user_version)
6308 result = -ERANGE;
6309 else if (ver > oi.user_version)
6310 result = -EOVERFLOW;
6311 }
6312 break;
6313
6314 case CEPH_OSD_OP_LIST_WATCHERS:
6315 ++ctx->num_read;
6316 {
6317 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6318 obj_list_watch_response_t resp;
6319
6320 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6321 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6322 ++oi_iter) {
6323 dout(20) << "key cookie=" << oi_iter->first.first
6324 << " entity=" << oi_iter->first.second << " "
6325 << oi_iter->second << dendl;
11fdf7f2
TL
6326 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6327 ceph_assert(oi_iter->first.second.is_client());
7c673cae
FG
6328
6329 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6330 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6331 resp.entries.push_back(wi);
6332 }
6333
6334 resp.encode(osd_op.outdata, ctx->get_features());
6335 result = 0;
6336
6337 ctx->delta_stats.num_rd++;
6338 break;
6339 }
6340
6341 case CEPH_OSD_OP_LIST_SNAPS:
6342 ++ctx->num_read;
6343 {
6344 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6345 obj_list_snap_response_t resp;
6346
6347 if (!ssc) {
6348 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6349 }
11fdf7f2
TL
6350 ceph_assert(ssc);
6351 dout(20) << " snapset " << ssc->snapset << dendl;
7c673cae
FG
6352
6353 int clonecount = ssc->snapset.clones.size();
11fdf7f2 6354 clonecount++; // for head
7c673cae
FG
6355 resp.clones.reserve(clonecount);
6356 for (auto clone_iter = ssc->snapset.clones.begin();
6357 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6358 clone_info ci;
6359 ci.cloneid = *clone_iter;
6360
6361 hobject_t clone_oid = soid;
6362 clone_oid.snap = *clone_iter;
6363
11fdf7f2
TL
6364 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6365 if (p == ssc->snapset.clone_snaps.end()) {
6366 osd->clog->error() << "osd." << osd->whoami
6367 << ": inconsistent clone_snaps found for oid "
6368 << soid << " clone " << *clone_iter
6369 << " snapset " << ssc->snapset;
6370 result = -EINVAL;
6371 break;
6372 }
6373 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6374 ci.snaps.push_back(*q);
7c673cae
FG
6375 }
6376
6377 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6378
6379 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6380 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6381 if (coi == ssc->snapset.clone_overlap.end()) {
6382 osd->clog->error() << "osd." << osd->whoami
6383 << ": inconsistent clone_overlap found for oid "
6384 << soid << " clone " << *clone_iter;
6385 result = -EINVAL;
6386 break;
6387 }
6388 const interval_set<uint64_t> &o = coi->second;
6389 ci.overlap.reserve(o.num_intervals());
6390 for (interval_set<uint64_t>::const_iterator r = o.begin();
6391 r != o.end(); ++r) {
6392 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6393 r.get_len()));
6394 }
6395
6396 map<snapid_t, uint64_t>::const_iterator si;
6397 si = ssc->snapset.clone_size.find(ci.cloneid);
6398 if (si == ssc->snapset.clone_size.end()) {
6399 osd->clog->error() << "osd." << osd->whoami
6400 << ": inconsistent clone_size found for oid "
6401 << soid << " clone " << *clone_iter;
6402 result = -EINVAL;
6403 break;
6404 }
6405 ci.size = si->second;
6406
6407 resp.clones.push_back(ci);
6408 }
6409 if (result < 0) {
6410 break;
f67539c2 6411 }
11fdf7f2
TL
6412 if (!ctx->obc->obs.oi.is_whiteout()) {
6413 ceph_assert(obs.exists);
7c673cae
FG
6414 clone_info ci;
6415 ci.cloneid = CEPH_NOSNAP;
6416
6417 //Size for HEAD is oi.size
6418 ci.size = oi.size;
6419
6420 resp.clones.push_back(ci);
6421 }
6422 resp.seq = ssc->snapset.seq;
6423
6424 resp.encode(osd_op.outdata);
6425 result = 0;
6426
6427 ctx->delta_stats.num_rd++;
6428 break;
6429 }
6430
6431 case CEPH_OSD_OP_NOTIFY:
6432 ++ctx->num_read;
6433 {
6434 uint32_t timeout;
6435 bufferlist bl;
6436
6437 try {
6438 uint32_t ver; // obsolete
11fdf7f2
TL
6439 decode(ver, bp);
6440 decode(timeout, bp);
6441 decode(bl, bp);
f67539c2 6442 } catch (const ceph::buffer::error &e) {
7c673cae
FG
6443 timeout = 0;
6444 }
6445 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6446 if (!timeout)
6447 timeout = cct->_conf->osd_default_notify_timeout;
6448
6449 notify_info_t n;
6450 n.timeout = timeout;
11fdf7f2 6451 n.notify_id = osd->get_next_id(get_osdmap_epoch());
9f95a23c 6452 n.cookie = op.notify.cookie;
7c673cae
FG
6453 n.bl = bl;
6454 ctx->notifies.push_back(n);
6455
6456 // return our unique notify id to the client
11fdf7f2 6457 encode(n.notify_id, osd_op.outdata);
7c673cae
FG
6458 }
6459 break;
6460
6461 case CEPH_OSD_OP_NOTIFY_ACK:
6462 ++ctx->num_read;
6463 {
6464 try {
6465 uint64_t notify_id = 0;
6466 uint64_t watch_cookie = 0;
11fdf7f2
TL
6467 decode(notify_id, bp);
6468 decode(watch_cookie, bp);
7c673cae
FG
6469 bufferlist reply_bl;
6470 if (!bp.end()) {
11fdf7f2 6471 decode(reply_bl, bp);
7c673cae
FG
6472 }
6473 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6474 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6475 ctx->notify_acks.push_back(ack);
f67539c2 6476 } catch (const ceph::buffer::error &e) {
7c673cae
FG
6477 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6478 OpContext::NotifyAck ack(
6479 // op.watch.cookie is actually the notify_id for historical reasons
6480 op.watch.cookie
6481 );
6482 ctx->notify_acks.push_back(ack);
6483 }
6484 }
6485 break;
6486
6487 case CEPH_OSD_OP_SETALLOCHINT:
6488 ++ctx->num_write;
9f95a23c 6489 result = 0;
7c673cae
FG
6490 {
6491 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6492 maybe_create_new_object(ctx);
6493 oi.expected_object_size = op.alloc_hint.expected_object_size;
6494 oi.expected_write_size = op.alloc_hint.expected_write_size;
6495 oi.alloc_hint_flags = op.alloc_hint.flags;
6496 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6497 op.alloc_hint.expected_write_size,
6498 op.alloc_hint.flags);
7c673cae
FG
6499 }
6500 break;
6501
6502
6503 // --- WRITES ---
6504
6505 // -- object data --
6506
6507 case CEPH_OSD_OP_WRITE:
6508 ++ctx->num_write;
9f95a23c 6509 result = 0;
7c673cae
FG
6510 { // write
6511 __u32 seq = oi.truncate_seq;
6512 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6513 if (op.extent.length != osd_op.indata.length()) {
6514 result = -EINVAL;
6515 break;
6516 }
6517
6518 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6519 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6520
6521 if (pool.info.requires_aligned_append() &&
6522 (op.extent.offset % pool.info.required_alignment() != 0)) {
6523 result = -EOPNOTSUPP;
6524 break;
6525 }
6526
6527 if (!obs.exists) {
6528 if (pool.info.requires_aligned_append() && op.extent.offset) {
6529 result = -EOPNOTSUPP;
6530 break;
6531 }
6532 } else if (op.extent.offset != oi.size &&
6533 pool.info.requires_aligned_append()) {
6534 result = -EOPNOTSUPP;
6535 break;
6536 }
6537
6538 if (seq && (seq > op.extent.truncate_seq) &&
6539 (op.extent.offset + op.extent.length > oi.size)) {
6540 // old write, arrived after trimtrunc
6541 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6542 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6543 << ", adjusting write length to " << op.extent.length << dendl;
6544 bufferlist t;
6545 t.substr_of(osd_op.indata, 0, op.extent.length);
6546 osd_op.indata.swap(t);
6547 }
6548 if (op.extent.truncate_seq > seq) {
6549 // write arrives before trimtrunc
6550 if (obs.exists && !oi.is_whiteout()) {
6551 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6552 << ", truncating to " << op.extent.truncate_size << dendl;
6553 t->truncate(soid, op.extent.truncate_size);
6554 oi.truncate_seq = op.extent.truncate_seq;
6555 oi.truncate_size = op.extent.truncate_size;
11fdf7f2
TL
6556 if (oi.size > op.extent.truncate_size) {
6557 interval_set<uint64_t> trim;
6558 trim.insert(op.extent.truncate_size,
6559 oi.size - op.extent.truncate_size);
6560 ctx->modified_ranges.union_of(trim);
9f95a23c 6561 ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
b3b6e05e 6562 oi.clear_data_digest();
11fdf7f2 6563 }
7c673cae 6564 if (op.extent.truncate_size != oi.size) {
11fdf7f2
TL
6565 truncate_update_size_and_usage(ctx->delta_stats,
6566 oi,
6567 op.extent.truncate_size);
7c673cae
FG
6568 }
6569 } else {
6570 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6571 << ", but object is new" << dendl;
6572 oi.truncate_seq = op.extent.truncate_seq;
6573 oi.truncate_size = op.extent.truncate_size;
6574 }
6575 }
11fdf7f2
TL
6576 result = check_offset_and_length(
6577 op.extent.offset, op.extent.length,
6578 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
7c673cae
FG
6579 if (result < 0)
6580 break;
6581
6582 maybe_create_new_object(ctx);
6583
6584 if (op.extent.length == 0) {
6585 if (op.extent.offset > oi.size) {
6586 t->truncate(
6587 soid, op.extent.offset);
eafe8130
TL
6588 truncate_update_size_and_usage(ctx->delta_stats, oi,
6589 op.extent.offset);
7c673cae
FG
6590 } else {
6591 t->nop(soid);
6592 }
6593 } else {
6594 t->write(
6595 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6596 }
6597
28e407b8
AA
6598 if (op.extent.offset == 0 && op.extent.length >= oi.size
6599 && !skip_data_digest) {
7c673cae 6600 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
28e407b8
AA
6601 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6602 if (skip_data_digest) {
6603 obs.oi.clear_data_digest();
6604 } else {
6605 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6606 }
6607 } else {
7c673cae 6608 obs.oi.clear_data_digest();
28e407b8 6609 }
7c673cae
FG
6610 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6611 op.extent.offset, op.extent.length);
9f95a23c
TL
6612 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6613 dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
7c673cae
FG
6614 }
6615 break;
f67539c2 6616
7c673cae
FG
6617 case CEPH_OSD_OP_WRITEFULL:
6618 ++ctx->num_write;
9f95a23c 6619 result = 0;
7c673cae
FG
6620 { // write full object
6621 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6622
6623 if (op.extent.length != osd_op.indata.length()) {
6624 result = -EINVAL;
6625 break;
6626 }
11fdf7f2
TL
6627 result = check_offset_and_length(
6628 0, op.extent.length,
6629 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
7c673cae
FG
6630 if (result < 0)
6631 break;
6632
6633 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6634 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6635
6636 maybe_create_new_object(ctx);
11fdf7f2 6637 if (pool.info.is_erasure()) {
7c673cae
FG
6638 t->truncate(soid, 0);
6639 } else if (obs.exists && op.extent.length < oi.size) {
6640 t->truncate(soid, op.extent.length);
6641 }
6642 if (op.extent.length) {
6643 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6644 }
28e407b8
AA
6645 if (!skip_data_digest) {
6646 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6647 } else {
6648 obs.oi.clear_data_digest();
6649 }
9f95a23c
TL
6650 ctx->clean_regions.mark_data_region_dirty(0,
6651 std::max((uint64_t)op.extent.length, oi.size));
7c673cae
FG
6652 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6653 0, op.extent.length, true);
6654 }
6655 break;
6656
6657 case CEPH_OSD_OP_WRITESAME:
6658 ++ctx->num_write;
6659 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6660 result = do_writesame(ctx, osd_op);
6661 break;
6662
6663 case CEPH_OSD_OP_ROLLBACK :
6664 ++ctx->num_write;
6665 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6666 result = _rollback_to(ctx, op);
6667 break;
6668
6669 case CEPH_OSD_OP_ZERO:
6670 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6671 if (pool.info.requires_aligned_append()) {
6672 result = -EOPNOTSUPP;
6673 break;
6674 }
6675 ++ctx->num_write;
6676 { // zero
11fdf7f2
TL
6677 result = check_offset_and_length(
6678 op.extent.offset, op.extent.length,
6679 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
7c673cae
FG
6680 if (result < 0)
6681 break;
f67539c2 6682
11fdf7f2 6683 ceph_assert(op.extent.length);
7c673cae
FG
6684 if (obs.exists && !oi.is_whiteout()) {
6685 t->zero(soid, op.extent.offset, op.extent.length);
6686 interval_set<uint64_t> ch;
6687 ch.insert(op.extent.offset, op.extent.length);
6688 ctx->modified_ranges.union_of(ch);
9f95a23c 6689 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
7c673cae
FG
6690 ctx->delta_stats.num_wr++;
6691 oi.clear_data_digest();
6692 } else {
6693 // no-op
6694 }
6695 }
6696 break;
6697 case CEPH_OSD_OP_CREATE:
6698 ++ctx->num_write;
9f95a23c 6699 result = 0;
7c673cae
FG
6700 {
6701 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
7c673cae 6702 if (obs.exists && !oi.is_whiteout() &&
9f95a23c 6703 (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
7c673cae
FG
6704 result = -EEXIST; /* this is an exclusive create */
6705 } else {
6706 if (osd_op.indata.length()) {
11fdf7f2 6707 auto p = osd_op.indata.cbegin();
7c673cae
FG
6708 string category;
6709 try {
11fdf7f2 6710 decode(category, p);
7c673cae 6711 }
f67539c2 6712 catch (ceph::buffer::error& e) {
7c673cae
FG
6713 result = -EINVAL;
6714 goto fail;
6715 }
6716 // category is no longer implemented.
6717 }
9f95a23c
TL
6718 maybe_create_new_object(ctx);
6719 t->nop(soid);
7c673cae
FG
6720 }
6721 }
6722 break;
6723
6724 case CEPH_OSD_OP_TRIMTRUNC:
6725 op.extent.offset = op.extent.truncate_size;
6726 // falling through
6727
6728 case CEPH_OSD_OP_TRUNCATE:
6729 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6730 if (pool.info.requires_aligned_append()) {
6731 result = -EOPNOTSUPP;
6732 break;
6733 }
6734 ++ctx->num_write;
9f95a23c 6735 result = 0;
7c673cae
FG
6736 {
6737 // truncate
6738 if (!obs.exists || oi.is_whiteout()) {
6739 dout(10) << " object dne, truncate is a no-op" << dendl;
6740 break;
6741 }
6742
11fdf7f2
TL
6743 result = check_offset_and_length(
6744 op.extent.offset, op.extent.length,
6745 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6746 if (result < 0)
7c673cae 6747 break;
7c673cae
FG
6748
6749 if (op.extent.truncate_seq) {
11fdf7f2 6750 ceph_assert(op.extent.offset == op.extent.truncate_size);
7c673cae
FG
6751 if (op.extent.truncate_seq <= oi.truncate_seq) {
6752 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6753 << ", no-op" << dendl;
6754 break; // old
6755 }
6756 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6757 << ", truncating" << dendl;
6758 oi.truncate_seq = op.extent.truncate_seq;
6759 oi.truncate_size = op.extent.truncate_size;
6760 }
6761
6762 maybe_create_new_object(ctx);
6763 t->truncate(soid, op.extent.offset);
6764 if (oi.size > op.extent.offset) {
6765 interval_set<uint64_t> trim;
6766 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6767 ctx->modified_ranges.union_of(trim);
9f95a23c
TL
6768 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6769 } else if (oi.size < op.extent.offset) {
6770 ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6771 }
7c673cae 6772 if (op.extent.offset != oi.size) {
11fdf7f2
TL
6773 truncate_update_size_and_usage(ctx->delta_stats,
6774 oi,
6775 op.extent.offset);
7c673cae
FG
6776 }
6777 ctx->delta_stats.num_wr++;
6778 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6779
6780 oi.clear_data_digest();
6781 }
6782 break;
f67539c2 6783
7c673cae
FG
6784 case CEPH_OSD_OP_DELETE:
6785 ++ctx->num_write;
9f95a23c 6786 result = 0;
7c673cae
FG
6787 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6788 {
6789 result = _delete_oid(ctx, false, ctx->ignore_cache);
6790 }
6791 break;
6792
6793 case CEPH_OSD_OP_WATCH:
6794 ++ctx->num_write;
9f95a23c 6795 result = 0;
7c673cae
FG
6796 {
6797 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6798 op.watch.cookie, op.watch.op);
6799 if (!obs.exists) {
6800 result = -ENOENT;
6801 break;
6802 }
9f95a23c 6803 result = 0;
7c673cae
FG
6804 uint64_t cookie = op.watch.cookie;
6805 entity_name_t entity = ctx->reqid.name;
6806 ObjectContextRef obc = ctx->obc;
6807
6808 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6809 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6810 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6811 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6812 dout(10) << "watch: peer_addr="
6813 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6814
6815 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6816 if (op.watch.timeout != 0) {
6817 timeout = op.watch.timeout;
6818 }
6819
6820 watch_info_t w(cookie, timeout,
6821 ctx->op->get_req()->get_connection()->get_peer_addr());
6822 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6823 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6824 if (oi.watchers.count(make_pair(cookie, entity))) {
6825 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6826 } else {
6827 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6828 oi.watchers[make_pair(cookie, entity)] = w;
6829 t->nop(soid); // make sure update the object_info on disk!
6830 }
6831 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6832 ctx->watch_connects.push_back(make_pair(w, will_ping));
6833 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6834 if (!oi.watchers.count(make_pair(cookie, entity))) {
6835 result = -ENOTCONN;
6836 break;
6837 }
6838 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6839 ctx->watch_connects.push_back(make_pair(w, true));
6840 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6841 /* Note: WATCH with PING doesn't cause may_write() to return true,
6842 * so if there is nothing else in the transaction, this is going
6843 * to run do_osd_op_effects, but not write out a log entry */
6844 if (!oi.watchers.count(make_pair(cookie, entity))) {
6845 result = -ENOTCONN;
6846 break;
6847 }
6848 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6849 obc->watchers.find(make_pair(cookie, entity));
6850 if (p == obc->watchers.end() ||
6851 !p->second->is_connected()) {
6852 // client needs to reconnect
6853 result = -ETIMEDOUT;
6854 break;
6855 }
6856 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6857 p->second->got_ping(ceph_clock_now());
6858 result = 0;
6859 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6860 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6861 oi.watchers.find(make_pair(cookie, entity));
6862 if (oi_iter != oi.watchers.end()) {
6863 dout(10) << " removed watch " << oi_iter->second << " by "
6864 << entity << dendl;
6865 oi.watchers.erase(oi_iter);
6866 t->nop(soid); // update oi on disk
6867 ctx->watch_disconnects.push_back(
6868 watch_disconnect_t(cookie, entity, false));
6869 } else {
6870 dout(10) << " can't remove: no watch by " << entity << dendl;
6871 }
6872 }
6873 }
6874 break;
6875
6876 case CEPH_OSD_OP_CACHE_PIN:
6877 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6878 if ((!pool.info.is_tier() ||
6879 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6880 result = -EINVAL;
6881 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6882 break;
6883 }
6884 ++ctx->num_write;
9f95a23c 6885 result = 0;
7c673cae
FG
6886 {
6887 if (!obs.exists || oi.is_whiteout()) {
6888 result = -ENOENT;
6889 break;
6890 }
6891
6892 if (!oi.is_cache_pinned()) {
6893 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6894 ctx->modify = true;
6895 ctx->delta_stats.num_objects_pinned++;
6896 ctx->delta_stats.num_wr++;
6897 }
7c673cae
FG
6898 }
6899 break;
6900
6901 case CEPH_OSD_OP_CACHE_UNPIN:
6902 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6903 if ((!pool.info.is_tier() ||
6904 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6905 result = -EINVAL;
6906 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6907 break;
6908 }
6909 ++ctx->num_write;
9f95a23c 6910 result = 0;
7c673cae
FG
6911 {
6912 if (!obs.exists || oi.is_whiteout()) {
6913 result = -ENOENT;
6914 break;
6915 }
6916
6917 if (oi.is_cache_pinned()) {
6918 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6919 ctx->modify = true;
6920 ctx->delta_stats.num_objects_pinned--;
6921 ctx->delta_stats.num_wr++;
6922 }
7c673cae
FG
6923 }
6924 break;
6925
31f18b77
FG
6926 case CEPH_OSD_OP_SET_REDIRECT:
6927 ++ctx->num_write;
9f95a23c 6928 result = 0;
31f18b77
FG
6929 {
6930 if (pool.info.is_tier()) {
6931 result = -EINVAL;
6932 break;
6933 }
6934 if (!obs.exists) {
6935 result = -ENOENT;
6936 break;
6937 }
9f95a23c 6938 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
31f18b77
FG
6939 result = -EOPNOTSUPP;
6940 break;
6941 }
6942
6943 object_t target_name;
6944 object_locator_t target_oloc;
6945 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6946 version_t target_version = op.copy_from.src_version;
6947 try {
11fdf7f2
TL
6948 decode(target_name, bp);
6949 decode(target_oloc, bp);
6950 }
f67539c2 6951 catch (ceph::buffer::error& e) {
11fdf7f2
TL
6952 result = -EINVAL;
6953 goto fail;
6954 }
6955 pg_t raw_pg;
6956 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6957 hobject_t target(target_name, target_oloc.key, target_snapid,
6958 raw_pg.ps(), raw_pg.pool(),
6959 target_oloc.nspace);
6960 if (target == soid) {
6961 dout(20) << " set-redirect self is invalid" << dendl;
6962 result = -EINVAL;
6963 break;
6964 }
6965
6966 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6967 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6968 if (has_reference) {
6969 result = -EINVAL;
6970 dout(5) << " the object is already a manifest " << dendl;
6971 break;
6972 }
6973 if (op_finisher == nullptr && need_reference) {
6974 // start
6975 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6976 new SetManifestFinisher(osd_op));
f67539c2
TL
6977 ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
6978 C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, soid);
6979 ceph_tid_t tid = refcount_manifest(soid, target,
6980 refcount_t::INCREMENT_REF, fin, std::nullopt);
6981 mop->objecter_tid = tid;
6982 manifest_ops[soid] = mop;
6983 ctx->obc->start_block();
11fdf7f2
TL
6984 result = -EINPROGRESS;
6985 } else {
6986 // finish
6987 if (op_finisher) {
6988 result = op_finisher->execute();
6989 ceph_assert(result == 0);
6990 }
6991
6992 if (!oi.has_manifest() && !oi.manifest.is_redirect())
6993 ctx->delta_stats.num_objects_manifest++;
6994
6995 oi.set_flag(object_info_t::FLAG_MANIFEST);
6996 oi.manifest.redirect_target = target;
6997 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6998 t->truncate(soid, 0);
9f95a23c 6999 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
11fdf7f2
TL
7000 if (oi.is_omap() && pool.info.supports_omap()) {
7001 t->omap_clear(soid);
7002 obs.oi.clear_omap_digest();
7003 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9f95a23c 7004 ctx->clean_regions.mark_omap_dirty();
11fdf7f2 7005 }
9f95a23c
TL
7006 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
7007 0, oi.size, false);
11fdf7f2
TL
7008 ctx->delta_stats.num_bytes -= oi.size;
7009 oi.size = 0;
7010 oi.new_object();
7011 oi.user_version = target_version;
7012 ctx->user_at_version = target_version;
7013 /* rm_attrs */
7014 map<string,bufferlist> rmattrs;
7015 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
7016 if (result < 0) {
eafe8130 7017 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
11fdf7f2
TL
7018 return result;
7019 }
7020 map<string, bufferlist>::iterator iter;
7021 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
7022 const string& name = iter->first;
7023 t->rmattr(soid, name);
7024 }
7025 if (!has_reference && need_reference) {
7026 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7027 }
7028 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
7029 if (op_finisher) {
7030 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7031 }
7032 }
7033 }
7034
7035 break;
7036
7037 case CEPH_OSD_OP_SET_CHUNK:
7038 ++ctx->num_write;
9f95a23c 7039 result = 0;
11fdf7f2
TL
7040 {
7041 if (pool.info.is_tier()) {
7042 result = -EINVAL;
7043 break;
7044 }
7045 if (!obs.exists) {
7046 result = -ENOENT;
7047 break;
7048 }
9f95a23c 7049 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
7050 result = -EOPNOTSUPP;
7051 break;
7052 }
f67539c2
TL
7053 if (oi.manifest.is_redirect()) {
7054 result = -EINVAL;
7055 goto fail;
7056 }
11fdf7f2
TL
7057
7058 object_locator_t tgt_oloc;
7059 uint64_t src_offset, src_length, tgt_offset;
7060 object_t tgt_name;
7061 try {
7062 decode(src_offset, bp);
7063 decode(src_length, bp);
7064 decode(tgt_oloc, bp);
7065 decode(tgt_name, bp);
7066 decode(tgt_offset, bp);
31f18b77 7067 }
f67539c2 7068 catch (ceph::buffer::error& e) {
31f18b77 7069 result = -EINVAL;
11fdf7f2
TL
7070 goto fail;
7071 }
f67539c2 7072
11fdf7f2
TL
7073 if (!src_length) {
7074 result = -EINVAL;
7075 goto fail;
7076 }
f67539c2
TL
7077 if (src_offset + src_length > oi.size) {
7078 result = -ERANGE;
7079 goto fail;
7080 }
7081 if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
7082 result = -EOPNOTSUPP;
7083 break;
7084 }
7085 if (pool.info.is_erasure()) {
7086 result = -EOPNOTSUPP;
7087 break;
7088 }
11fdf7f2
TL
7089
7090 for (auto &p : oi.manifest.chunk_map) {
f67539c2
TL
7091 interval_set<uint64_t> chunk;
7092 chunk.insert(p.first, p.second.length);
7093 if (chunk.intersects(src_offset, src_length)) {
11fdf7f2
TL
7094 dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length
7095 << " chunk_info: " << p << dendl;
7096 result = -EOPNOTSUPP;
7097 goto fail;
7098 }
7099 }
7100
11fdf7f2
TL
7101 pg_t raw_pg;
7102 chunk_info_t chunk_info;
7103 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7104 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7105 raw_pg.ps(), raw_pg.pool(),
7106 tgt_oloc.nspace);
11fdf7f2 7107 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
f67539c2 7108 (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
11fdf7f2
TL
7109 if (has_reference) {
7110 result = -EINVAL;
7111 dout(5) << " the object is already a manifest " << dendl;
7112 break;
7113 }
f67539c2
TL
7114 chunk_info.oid = target;
7115 chunk_info.offset = tgt_offset;
7116 chunk_info.length = src_length;
7117 if (op_finisher == nullptr) {
11fdf7f2
TL
7118 // start
7119 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7120 new SetManifestFinisher(osd_op));
f67539c2
TL
7121 object_manifest_t set_chunk;
7122 bool need_inc_ref = false;
7123 set_chunk.chunk_map[src_offset] = chunk_info;
7124 need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
7125 if (need_inc_ref) {
7126 result = -EINPROGRESS;
7127 break;
11fdf7f2 7128 }
f67539c2
TL
7129 }
7130 if (op_finisher) {
7131 result = op_finisher->execute();
7132 ceph_assert(result == 0);
7133 }
11fdf7f2 7134
f67539c2
TL
7135 oi.manifest.chunk_map[src_offset] = chunk_info;
7136 if (!oi.has_manifest() && !oi.manifest.is_chunked())
7137 ctx->delta_stats.num_objects_manifest++;
7138 oi.set_flag(object_info_t::FLAG_MANIFEST);
7139 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7140 if (!has_reference) {
7141 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7142 }
7143 ctx->modify = true;
7144 ctx->cache_operation = true;
11fdf7f2 7145
f67539c2
TL
7146 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7147 << " chunk_info: " << chunk_info << dendl;
7148 if (op_finisher) {
7149 ctx->op_finishers.erase(ctx->current_osd_subop_num);
11fdf7f2
TL
7150 }
7151 }
7152
7153 break;
7154
7155 case CEPH_OSD_OP_TIER_PROMOTE:
7156 ++ctx->num_write;
9f95a23c 7157 result = 0;
11fdf7f2
TL
7158 {
7159 if (pool.info.is_tier()) {
7160 result = -EINVAL;
7161 break;
7162 }
7163 if (!obs.exists) {
7164 result = -ENOENT;
7165 break;
7166 }
9f95a23c 7167 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
7168 result = -EOPNOTSUPP;
7169 break;
7170 }
7171 if (!obs.oi.has_manifest()) {
7172 result = 0;
7173 break;
7174 }
7175
7176 if (op_finisher == nullptr) {
7177 PromoteManifestCallback *cb;
7178 object_locator_t my_oloc;
7179 hobject_t src_hoid;
7180
7181 if (obs.oi.manifest.is_chunked()) {
7182 src_hoid = obs.oi.soid;
11fdf7f2
TL
7183 } else if (obs.oi.manifest.is_redirect()) {
7184 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7185 my_oloc = src_oloc;
7186 src_hoid = obs.oi.manifest.redirect_target;
11fdf7f2
TL
7187 } else {
7188 ceph_abort_msg("unrecognized manifest type");
7189 }
f67539c2 7190 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
11fdf7f2
TL
7191 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7192 new PromoteFinisher(cb));
7193 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7194 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7195 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7196 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7197 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7198 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7199 obs.oi.soid.snap == CEPH_NOSNAP,
7200 src_fadvise_flags, 0);
7201
7202 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7203 result = -EINPROGRESS;
7204 } else {
7205 result = op_finisher->execute();
7206 ceph_assert(result == 0);
7207 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7208 }
7209 }
7210
7211 break;
7212
9f95a23c
TL
7213 case CEPH_OSD_OP_TIER_FLUSH:
7214 ++ctx->num_write;
7215 result = 0;
7216 {
7217 if (pool.info.is_tier()) {
7218 result = -EINVAL;
7219 break;
7220 }
7221 if (!obs.exists) {
7222 result = -ENOENT;
7223 break;
7224 }
7225 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7226 result = -EOPNOTSUPP;
7227 break;
7228 }
7229 if (!obs.oi.has_manifest()) {
7230 result = 0;
7231 break;
7232 }
7233
f67539c2 7234 if (oi.is_dirty()) {
9f95a23c
TL
7235 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7236 if (result == -EINPROGRESS)
7237 result = -EAGAIN;
7238 } else {
7239 result = 0;
7240 }
7241 }
7242
7243 break;
7244
f67539c2
TL
7245 case CEPH_OSD_OP_TIER_EVICT:
7246 ++ctx->num_write;
7247 result = 0;
7248 {
7249 if (pool.info.is_tier()) {
7250 result = -EINVAL;
7251 break;
7252 }
7253 if (!obs.exists) {
7254 result = -ENOENT;
7255 break;
7256 }
7257 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7258 result = -EOPNOTSUPP;
7259 break;
7260 }
7261 if (!obs.oi.has_manifest()) {
7262 result = -EINVAL;
7263 break;
7264 }
7265
7266 // The chunks already has a reference, so it is just enough to invoke truncate if necessary
7267 uint64_t chunk_length = 0;
7268 for (auto p : obs.oi.manifest.chunk_map) {
7269 chunk_length += p.second.length;
7270 }
7271 if (chunk_length == obs.oi.size) {
7272 for (auto &p : obs.oi.manifest.chunk_map) {
7273 p.second.set_flag(chunk_info_t::FLAG_MISSING);
7274 }
7275 // punch hole
7276 t->zero(soid, 0, oi.size);
7277 oi.clear_data_digest();
7278 ctx->delta_stats.num_wr++;
7279 ctx->cache_operation = true;
7280 }
7281 osd->logger->inc(l_osd_tier_evict);
7282 }
7283
7284 break;
7285
11fdf7f2
TL
7286 case CEPH_OSD_OP_UNSET_MANIFEST:
7287 ++ctx->num_write;
9f95a23c 7288 result = 0;
11fdf7f2
TL
7289 {
7290 if (pool.info.is_tier()) {
7291 result = -EINVAL;
7292 break;
31f18b77 7293 }
11fdf7f2
TL
7294 if (!obs.exists) {
7295 result = -ENOENT;
31f18b77
FG
7296 break;
7297 }
11fdf7f2
TL
7298 if (!oi.has_manifest()) {
7299 result = -EOPNOTSUPP;
7300 break;
31f18b77 7301 }
9f95a23c 7302 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
7303 result = -EOPNOTSUPP;
7304 break;
31f18b77 7305 }
11fdf7f2 7306
f67539c2 7307 dec_all_refcount_manifest(oi, ctx);
11fdf7f2
TL
7308
7309 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7310 oi.manifest = object_manifest_t();
7311 ctx->delta_stats.num_objects_manifest--;
7312 ctx->delta_stats.num_wr++;
7313 ctx->modify = true;
31f18b77
FG
7314 }
7315
7316 break;
7c673cae
FG
7317
7318 // -- object attrs --
f67539c2 7319
7c673cae
FG
7320 case CEPH_OSD_OP_SETXATTR:
7321 ++ctx->num_write;
9f95a23c 7322 result = 0;
7c673cae
FG
7323 {
7324 if (cct->_conf->osd_max_attr_size > 0 &&
7325 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7326 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7327 result = -EFBIG;
7328 break;
7329 }
11fdf7f2
TL
7330 unsigned max_name_len =
7331 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7332 cct->_conf->osd_max_attr_name_len);
7c673cae
FG
7333 if (op.xattr.name_len > max_name_len) {
7334 result = -ENAMETOOLONG;
7335 break;
7336 }
7337 maybe_create_new_object(ctx);
7338 string aname;
7339 bp.copy(op.xattr.name_len, aname);
7340 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7341 string name = "_" + aname;
7342 bufferlist bl;
7343 bp.copy(op.xattr.value_len, bl);
7344 t->setattr(soid, name, bl);
7345 ctx->delta_stats.num_wr++;
7346 }
7347 break;
7348
7349 case CEPH_OSD_OP_RMXATTR:
7350 ++ctx->num_write;
9f95a23c 7351 result = 0;
7c673cae
FG
7352 {
7353 string aname;
7354 bp.copy(op.xattr.name_len, aname);
7355 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7356 if (!obs.exists || oi.is_whiteout()) {
7357 result = -ENOENT;
7358 break;
7359 }
7360 string name = "_" + aname;
7361 t->rmattr(soid, name);
7362 ctx->delta_stats.num_wr++;
7363 }
7364 break;
f67539c2 7365
7c673cae
FG
7366
7367 // -- fancy writers --
7368 case CEPH_OSD_OP_APPEND:
7369 {
7370 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7371 // just do it inline; this works because we are happy to execute
7372 // fancy op on replicas as well.
7373 vector<OSDOp> nops(1);
7374 OSDOp& newop = nops[0];
7375 newop.op.op = CEPH_OSD_OP_WRITE;
7376 newop.op.extent.offset = oi.size;
7377 newop.op.extent.length = op.extent.length;
7378 newop.op.extent.truncate_seq = oi.truncate_seq;
7379 newop.indata = osd_op.indata;
7380 result = do_osd_ops(ctx, nops);
f67539c2 7381 osd_op.outdata = std::move(newop.outdata);
7c673cae
FG
7382 }
7383 break;
7384
7385 case CEPH_OSD_OP_STARTSYNC:
9f95a23c 7386 result = 0;
7c673cae
FG
7387 t->nop(soid);
7388 break;
7389
7c673cae
FG
7390 // -- trivial map --
7391 case CEPH_OSD_OP_TMAPGET:
7392 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7393 if (pool.info.is_erasure()) {
7c673cae
FG
7394 result = -EOPNOTSUPP;
7395 break;
7396 }
7397 {
7398 vector<OSDOp> nops(1);
7399 OSDOp& newop = nops[0];
7400 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7401 newop.op.extent.offset = 0;
7402 newop.op.extent.length = 0;
9f95a23c 7403 result = do_osd_ops(ctx, nops);
f67539c2 7404 osd_op.outdata = std::move(newop.outdata);
7c673cae
FG
7405 }
7406 break;
7407
7408 case CEPH_OSD_OP_TMAPPUT:
7409 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7410 if (pool.info.is_erasure()) {
7c673cae
FG
7411 result = -EOPNOTSUPP;
7412 break;
7413 }
7414 {
7415 //_dout_lock.Lock();
7416 //osd_op.data.hexdump(*_dout);
7417 //_dout_lock.Unlock();
7418
7419 // verify sort order
7420 bool unsorted = false;
7421 if (true) {
7422 bufferlist header;
11fdf7f2 7423 decode(header, bp);
7c673cae 7424 uint32_t n;
11fdf7f2 7425 decode(n, bp);
7c673cae
FG
7426 string last_key;
7427 while (n--) {
7428 string key;
11fdf7f2 7429 decode(key, bp);
7c673cae
FG
7430 dout(10) << "tmapput key " << key << dendl;
7431 bufferlist val;
11fdf7f2 7432 decode(val, bp);
7c673cae
FG
7433 if (key < last_key) {
7434 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7435 unsorted = true;
7436 break;
7437 }
7438 last_key = key;
7439 }
7440 }
7441
7442 // write it
7443 vector<OSDOp> nops(1);
7444 OSDOp& newop = nops[0];
7445 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7446 newop.op.extent.offset = 0;
7447 newop.op.extent.length = osd_op.indata.length();
7448 newop.indata = osd_op.indata;
7449
7450 if (unsorted) {
7451 bp = osd_op.indata.begin();
7452 bufferlist header;
7453 map<string, bufferlist> m;
11fdf7f2
TL
7454 decode(header, bp);
7455 decode(m, bp);
7456 ceph_assert(bp.end());
7c673cae 7457 bufferlist newbl;
11fdf7f2
TL
7458 encode(header, newbl);
7459 encode(m, newbl);
7c673cae
FG
7460 newop.indata = newbl;
7461 }
7462 result = do_osd_ops(ctx, nops);
11fdf7f2 7463 ceph_assert(result == 0);
7c673cae
FG
7464 }
7465 break;
7466
7467 case CEPH_OSD_OP_TMAPUP:
7468 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7469 if (pool.info.is_erasure()) {
7c673cae
FG
7470 result = -EOPNOTSUPP;
7471 break;
7472 }
7473 ++ctx->num_write;
7474 result = do_tmapup(ctx, bp, osd_op);
7475 break;
7476
7477 case CEPH_OSD_OP_TMAP2OMAP:
7478 ++ctx->num_write;
7479 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7480 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7481 break;
7482
7483 // OMAP Read ops
7484 case CEPH_OSD_OP_OMAPGETKEYS:
7485 ++ctx->num_read;
7486 {
7487 string start_after;
7488 uint64_t max_return;
7489 try {
11fdf7f2
TL
7490 decode(start_after, bp);
7491 decode(max_return, bp);
7c673cae 7492 }
f67539c2 7493 catch (ceph::buffer::error& e) {
7c673cae
FG
7494 result = -EINVAL;
7495 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7496 goto fail;
7497 }
7498 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7499 max_return = cct->_conf->osd_max_omap_entries_per_request;
7500 }
7501 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7502
7503 bufferlist bl;
7504 uint32_t num = 0;
7505 bool truncated = false;
7506 if (oi.is_omap()) {
7507 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
11fdf7f2 7508 ch, ghobject_t(soid)
7c673cae 7509 );
11fdf7f2 7510 ceph_assert(iter);
7c673cae 7511 iter->upper_bound(start_after);
11fdf7f2 7512 for (num = 0; iter->valid(); ++num, iter->next()) {
7c673cae
FG
7513 if (num >= max_return ||
7514 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7515 truncated = true;
7516 break;
7517 }
11fdf7f2 7518 encode(iter->key(), bl);
7c673cae
FG
7519 }
7520 } // else return empty out_set
11fdf7f2 7521 encode(num, osd_op.outdata);
7c673cae 7522 osd_op.outdata.claim_append(bl);
11fdf7f2
TL
7523 encode(truncated, osd_op.outdata);
7524 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7525 ctx->delta_stats.num_rd++;
7526 }
7527 break;
7528
7529 case CEPH_OSD_OP_OMAPGETVALS:
7530 ++ctx->num_read;
7531 {
7532 string start_after;
7533 uint64_t max_return;
7534 string filter_prefix;
7535 try {
11fdf7f2
TL
7536 decode(start_after, bp);
7537 decode(max_return, bp);
7538 decode(filter_prefix, bp);
7c673cae 7539 }
f67539c2 7540 catch (ceph::buffer::error& e) {
7c673cae
FG
7541 result = -EINVAL;
7542 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7543 goto fail;
7544 }
7545 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7546 max_return = cct->_conf->osd_max_omap_entries_per_request;
7547 }
7548 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7549
7550 uint32_t num = 0;
7551 bool truncated = false;
7552 bufferlist bl;
7553 if (oi.is_omap()) {
7554 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
11fdf7f2 7555 ch, ghobject_t(soid)
7c673cae
FG
7556 );
7557 if (!iter) {
7558 result = -ENOENT;
7559 goto fail;
7560 }
7561 iter->upper_bound(start_after);
7562 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7563 for (num = 0;
7564 iter->valid() &&
7565 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
11fdf7f2 7566 ++num, iter->next()) {
7c673cae
FG
7567 dout(20) << "Found key " << iter->key() << dendl;
7568 if (num >= max_return ||
7569 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7570 truncated = true;
7571 break;
7572 }
11fdf7f2
TL
7573 encode(iter->key(), bl);
7574 encode(iter->value(), bl);
7c673cae
FG
7575 }
7576 } // else return empty out_set
11fdf7f2 7577 encode(num, osd_op.outdata);
7c673cae 7578 osd_op.outdata.claim_append(bl);
11fdf7f2
TL
7579 encode(truncated, osd_op.outdata);
7580 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7581 ctx->delta_stats.num_rd++;
7582 }
7583 break;
7584
7585 case CEPH_OSD_OP_OMAPGETHEADER:
7586 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7587 if (!oi.is_omap()) {
7588 // return empty header
7589 break;
7590 }
7591 ++ctx->num_read;
7592 {
7593 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
11fdf7f2 7594 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7595 ctx->delta_stats.num_rd++;
7596 }
7597 break;
7598
7599 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7600 ++ctx->num_read;
7601 {
7602 set<string> keys_to_get;
7603 try {
11fdf7f2 7604 decode(keys_to_get, bp);
7c673cae 7605 }
f67539c2 7606 catch (ceph::buffer::error& e) {
7c673cae
FG
7607 result = -EINVAL;
7608 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7609 goto fail;
7610 }
7611 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7612 map<string, bufferlist> out;
7613 if (oi.is_omap()) {
7614 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7615 } // else return empty omap entries
11fdf7f2
TL
7616 encode(out, osd_op.outdata);
7617 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7618 ctx->delta_stats.num_rd++;
7619 }
7620 break;
7621
7622 case CEPH_OSD_OP_OMAP_CMP:
7623 ++ctx->num_read;
7624 {
7625 if (!obs.exists || oi.is_whiteout()) {
7626 result = -ENOENT;
7627 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7628 break;
7629 }
7630 map<string, pair<bufferlist, int> > assertions;
7631 try {
11fdf7f2 7632 decode(assertions, bp);
7c673cae 7633 }
f67539c2 7634 catch (ceph::buffer::error& e) {
7c673cae
FG
7635 result = -EINVAL;
7636 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7637 goto fail;
7638 }
7639 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
f67539c2 7640
7c673cae
FG
7641 map<string, bufferlist> out;
7642
7643 if (oi.is_omap()) {
7644 set<string> to_get;
7645 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7646 i != assertions.end();
7647 ++i)
7648 to_get.insert(i->first);
7649 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7650 to_get, &out);
7651 if (r < 0) {
7652 result = r;
7653 break;
7654 }
7655 } // else leave out empty
7656
7657 //Should set num_rd_kb based on encode length of map
7658 ctx->delta_stats.num_rd++;
7659
7660 int r = 0;
7661 bufferlist empty;
7662 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7663 i != assertions.end();
7664 ++i) {
7665 auto out_entry = out.find(i->first);
7666 bufferlist &bl = (out_entry != out.end()) ?
7667 out_entry->second : empty;
7668 switch (i->second.second) {
7669 case CEPH_OSD_CMPXATTR_OP_EQ:
7670 if (!(bl == i->second.first)) {
7671 r = -ECANCELED;
7672 }
7673 break;
7674 case CEPH_OSD_CMPXATTR_OP_LT:
7675 if (!(bl < i->second.first)) {
7676 r = -ECANCELED;
7677 }
7678 break;
7679 case CEPH_OSD_CMPXATTR_OP_GT:
7680 if (!(bl > i->second.first)) {
7681 r = -ECANCELED;
7682 }
7683 break;
7684 default:
7685 r = -EINVAL;
7686 break;
7687 }
7688 if (r < 0)
7689 break;
7690 }
7691 if (r < 0) {
7692 result = r;
7693 }
7694 }
7695 break;
7696
7697 // OMAP Write ops
7698 case CEPH_OSD_OP_OMAPSETVALS:
7699 if (!pool.info.supports_omap()) {
7700 result = -EOPNOTSUPP;
7701 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7702 break;
7703 }
7704 ++ctx->num_write;
9f95a23c 7705 result = 0;
7c673cae
FG
7706 {
7707 maybe_create_new_object(ctx);
7708 bufferlist to_set_bl;
7709 try {
7710 decode_str_str_map_to_bl(bp, &to_set_bl);
7711 }
f67539c2 7712 catch (ceph::buffer::error& e) {
7c673cae
FG
7713 result = -EINVAL;
7714 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7715 goto fail;
7716 }
7717 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7718 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7c673cae
FG
7719 dout(20) << "setting vals: " << dendl;
7720 map<string,bufferlist> to_set;
11fdf7f2
TL
7721 bufferlist::const_iterator pt = to_set_bl.begin();
7722 decode(to_set, pt);
7c673cae
FG
7723 for (map<string, bufferlist>::iterator i = to_set.begin();
7724 i != to_set.end();
7725 ++i) {
7726 dout(20) << "\t" << i->first << dendl;
7727 }
7728 }
7729 t->omap_setkeys(soid, to_set_bl);
9f95a23c 7730 ctx->clean_regions.mark_omap_dirty();
7c673cae 7731 ctx->delta_stats.num_wr++;
11fdf7f2 7732 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7c673cae
FG
7733 }
7734 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7735 obs.oi.clear_omap_digest();
7736 break;
7737
7738 case CEPH_OSD_OP_OMAPSETHEADER:
7739 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7740 if (!pool.info.supports_omap()) {
7741 result = -EOPNOTSUPP;
7742 break;
7743 }
7744 ++ctx->num_write;
9f95a23c 7745 result = 0;
7c673cae
FG
7746 {
7747 maybe_create_new_object(ctx);
7748 t->omap_setheader(soid, osd_op.indata);
9f95a23c 7749 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
7750 ctx->delta_stats.num_wr++;
7751 }
7752 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7753 obs.oi.clear_omap_digest();
7754 break;
7755
7756 case CEPH_OSD_OP_OMAPCLEAR:
7757 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7758 if (!pool.info.supports_omap()) {
7759 result = -EOPNOTSUPP;
7760 break;
7761 }
7762 ++ctx->num_write;
9f95a23c 7763 result = 0;
7c673cae
FG
7764 {
7765 if (!obs.exists || oi.is_whiteout()) {
7766 result = -ENOENT;
7767 break;
7768 }
7769 if (oi.is_omap()) {
7770 t->omap_clear(soid);
9f95a23c 7771 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
7772 ctx->delta_stats.num_wr++;
7773 obs.oi.clear_omap_digest();
7774 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7775 }
7776 }
7777 break;
7778
7779 case CEPH_OSD_OP_OMAPRMKEYS:
7780 if (!pool.info.supports_omap()) {
7781 result = -EOPNOTSUPP;
7782 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7783 break;
7784 }
7785 ++ctx->num_write;
9f95a23c 7786 result = 0;
7c673cae
FG
7787 {
7788 if (!obs.exists || oi.is_whiteout()) {
7789 result = -ENOENT;
7790 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7791 break;
7792 }
7793 bufferlist to_rm_bl;
7794 try {
7795 decode_str_set_to_bl(bp, &to_rm_bl);
7796 }
f67539c2 7797 catch (ceph::buffer::error& e) {
7c673cae
FG
7798 result = -EINVAL;
7799 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7800 goto fail;
7801 }
7802 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7803 t->omap_rmkeys(soid, to_rm_bl);
9f95a23c
TL
7804 ctx->clean_regions.mark_omap_dirty();
7805 ctx->delta_stats.num_wr++;
7806 }
7807 obs.oi.clear_omap_digest();
7808 break;
7809
7810 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7811 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7812 if (!pool.info.supports_omap()) {
7813 result = -EOPNOTSUPP;
7814 break;
7815 }
7816 ++ctx->num_write;
7817 result = 0;
7818 {
7819 if (!obs.exists || oi.is_whiteout()) {
7820 result = -ENOENT;
7821 break;
7822 }
7823 std::string key_begin, key_end;
7824 try {
7825 decode(key_begin, bp);
7826 decode(key_end, bp);
f67539c2 7827 } catch (ceph::buffer::error& e) {
9f95a23c
TL
7828 result = -EINVAL;
7829 goto fail;
7830 }
7831 t->omap_rmkeyrange(soid, key_begin, key_end);
7c673cae
FG
7832 ctx->delta_stats.num_wr++;
7833 }
7834 obs.oi.clear_omap_digest();
7835 break;
7836
7837 case CEPH_OSD_OP_COPY_GET:
7838 ++ctx->num_read;
c07f9fc5
FG
7839 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7840 soid.snap.val);
7841 if (op_finisher == nullptr) {
7842 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7843 } else {
7844 result = op_finisher->execute();
7845 }
7c673cae
FG
7846 break;
7847
7848 case CEPH_OSD_OP_COPY_FROM:
9f95a23c 7849 case CEPH_OSD_OP_COPY_FROM2:
7c673cae 7850 ++ctx->num_write;
9f95a23c 7851 result = 0;
7c673cae
FG
7852 {
7853 object_t src_name;
7854 object_locator_t src_oloc;
9f95a23c
TL
7855 uint32_t truncate_seq = 0;
7856 uint64_t truncate_size = 0;
7857 bool have_truncate = false;
7c673cae
FG
7858 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7859 version_t src_version = op.copy_from.src_version;
9f95a23c
TL
7860
7861 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7862 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7863 dout(20) << "invalid copy-from2 flags 0x"
7864 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
7865 result = -EINVAL;
7866 break;
7867 }
7c673cae 7868 try {
11fdf7f2
TL
7869 decode(src_name, bp);
7870 decode(src_oloc, bp);
9f95a23c
TL
7871 // check if client sent us truncate_seq and truncate_size
7872 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7873 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
7874 decode(truncate_seq, bp);
7875 decode(truncate_size, bp);
7876 have_truncate = true;
7877 }
7c673cae 7878 }
f67539c2 7879 catch (ceph::buffer::error& e) {
7c673cae
FG
7880 result = -EINVAL;
7881 tracepoint(osd,
7882 do_osd_op_pre_copy_from,
7883 soid.oid.name.c_str(),
7884 soid.snap.val,
7885 "???",
7886 0,
7887 "???",
7888 "???",
7889 0,
7890 src_snapid,
7891 src_version);
7892 goto fail;
7893 }
7894 tracepoint(osd,
7895 do_osd_op_pre_copy_from,
7896 soid.oid.name.c_str(),
7897 soid.snap.val,
7898 src_name.name.c_str(),
7899 src_oloc.pool,
7900 src_oloc.key.c_str(),
7901 src_oloc.nspace.c_str(),
7902 src_oloc.hash,
7903 src_snapid,
7904 src_version);
c07f9fc5 7905 if (op_finisher == nullptr) {
7c673cae
FG
7906 // start
7907 pg_t raw_pg;
7908 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7909 hobject_t src(src_name, src_oloc.key, src_snapid,
7910 raw_pg.ps(), raw_pg.pool(),
7911 src_oloc.nspace);
7912 if (src == soid) {
7913 dout(20) << " copy from self is invalid" << dendl;
7914 result = -EINVAL;
7915 break;
7916 }
c07f9fc5 7917 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
9f95a23c
TL
7918 if (have_truncate)
7919 cb->set_truncate(truncate_seq, truncate_size);
c07f9fc5
FG
7920 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7921 new CopyFromFinisher(cb));
7c673cae
FG
7922 start_copy(cb, ctx->obc, src, src_oloc, src_version,
7923 op.copy_from.flags,
7924 false,
7925 op.copy_from.src_fadvise_flags,
7926 op.flags);
7927 result = -EINPROGRESS;
7928 } else {
7929 // finish
c07f9fc5 7930 result = op_finisher->execute();
11fdf7f2 7931 ceph_assert(result == 0);
c07f9fc5
FG
7932
7933 // COPY_FROM cannot be executed multiple times -- it must restart
7934 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7c673cae
FG
7935 }
7936 }
7937 break;
7938
7939 default:
7940 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7941 dout(1) << "unrecognized osd op " << op.op
7942 << " " << ceph_osd_op_name(op.op)
7943 << dendl;
7944 result = -EOPNOTSUPP;
7945 }
7946
7947 fail:
7948 osd_op.rval = result;
7949 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
11fdf7f2
TL
7950 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7951 result != -EAGAIN && result != -EINPROGRESS)
7c673cae
FG
7952 result = 0;
7953
7954 if (result < 0)
7955 break;
7956 }
eafe8130
TL
7957 if (result < 0) {
7958 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7959 }
7c673cae
FG
7960 return result;
7961}
7962
// Read and decode the legacy TMAP payload of the object named by ctx.
//
// Issues an internal CEPH_OSD_OP_TMAPGET through do_osd_ops() and splits
// the returned blob into the TMAP header and the still-encoded key/value
// region (handed back verbatim in *vals).
//
// @param ctx     op context; ctx->new_obs.oi names the object to read
// @param header  out: decoded TMAP header
// @param vals    out: raw remainder of the TMAP blob (entries left encoded)
// @return 0 on success, -ENODATA for a zero-sized object,
//         -EINVAL if the blob cannot be decoded
int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
{
  // a zero-length object cannot contain a TMAP header at all
  if (ctx->new_obs.oi.size == 0) {
    dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
    return -ENODATA;
  }
  // build a single-op vector and run it through the normal op machinery
  vector<OSDOp> nops(1);
  OSDOp &newop = nops[0];
  newop.op.op = CEPH_OSD_OP_TMAPGET;
  // NOTE(review): the do_osd_ops() return value is ignored here; the decode
  // below is the only failure check — confirm this is intentional
  do_osd_ops(ctx, nops);
  try {
    bufferlist::const_iterator i = newop.outdata.begin();
    decode(*header, i);
    // everything after the header stays encoded; return it as a sub-buffer
    (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
  } catch (...) {
    dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
	     << dendl;
    return -EINVAL;
  }
  dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
	   << dendl;
  return 0;
}
7986
7987int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7988 const SnapSet& ss)
7989{
7990 // verify that all clones have been evicted
7991 dout(20) << __func__ << " verifying clones are absent "
7992 << ss << dendl;
7993 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7994 p != ss.clones.end();
7995 ++p) {
7996 hobject_t clone_oid = soid;
7997 clone_oid.snap = *p;
7998 if (is_missing_object(clone_oid))
7999 return -EBUSY;
8000 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
8001 if (clone_obc && clone_obc->obs.exists) {
8002 dout(10) << __func__ << " cannot evict head before clone "
8003 << clone_oid << dendl;
8004 return -EBUSY;
8005 }
8006 if (copy_ops.count(clone_oid)) {
8007 dout(10) << __func__ << " cannot evict head, pending promote on clone "
8008 << clone_oid << dendl;
8009 return -EBUSY;
8010 }
8011 }
8012 return 0;
8013}
8014
// Delete (or whiteout) the head object described by ctx->new_obs.
//
// In cache-tier pools a delete may be recorded as a whiteout instead of a
// true removal; the no_whiteout/try_no_whiteout flags steer that decision.
// Updates delta_stats, clean_regions, and queues watcher disconnects.
//
// @param ctx             op context (transaction, stats, obc)
// @param no_whiteout     never whiteout, no matter what
// @param try_no_whiteout prefer a real delete if the pool/state allows it
// @return 0 on success, -ENOENT if there is nothing to delete
inline int PrimaryLogPG::_delete_oid(
  OpContext *ctx,
  bool no_whiteout, // no whiteouts, no matter what.
  bool try_no_whiteout) // try not to whiteout
{
  SnapSet& snapset = ctx->new_snapset;
  ObjectState& obs = ctx->new_obs;
  object_info_t& oi = obs.oi;
  const hobject_t& soid = oi.soid;
  PGTransaction* t = ctx->op_t.get();

  // cache: cache: set whiteout on delete?
  bool whiteout = false;
  if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
      && !no_whiteout
      && !try_no_whiteout) {
    whiteout = true;
  }

  // in luminous or later, we can't delete the head if there are
  // clones. we trust the caller passing no_whiteout has already
  // verified they don't exist.
  if (!snapset.clones.empty() ||
      (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
    if (no_whiteout) {
      dout(20) << __func__ << " has or will have clones but no_whiteout=1"
	       << dendl;
    } else {
      dout(20) << __func__ << " has or will have clones; will whiteout"
	       << dendl;
      whiteout = true;
    }
  }
  dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
	   << " no_whiteout=" << (int)no_whiteout
	   << " try_no_whiteout=" << (int)try_no_whiteout
	   << dendl;
  // deleting a whiteout with another whiteout is a no-op; report ENOENT
  if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
    return -ENOENT;

  t->remove(soid);

  // the whole former extent of the object counts as modified
  if (oi.size > 0) {
    interval_set<uint64_t> ch;
    ch.insert(0, oi.size);
    ctx->modified_ranges.union_of(ch);
    ctx->clean_regions.mark_data_region_dirty(0, oi.size);
  }

  ctx->clean_regions.mark_omap_dirty();
  ctx->delta_stats.num_wr++;
  if (soid.is_snap()) {
    // clones account only their non-overlapping bytes
    ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
    ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
  } else {
    ctx->delta_stats.num_bytes -= oi.size;
  }
  oi.size = 0;
  oi.new_object();

  // disconnect all watchers
  for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
	 oi.watchers.begin();
       p != oi.watchers.end();
       ++p) {
    dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
    ctx->watch_disconnects.push_back(
      watch_disconnect_t(p->first.first, p->first.second, true));
  }
  oi.watchers.clear();

  // a manifest object must release its refcounts on the backing chunks
  if (oi.has_manifest()) {
    ctx->delta_stats.num_objects_manifest--;
    dec_all_refcount_manifest(oi, ctx);
  }

  if (whiteout) {
    // re-create the object as an empty whiteout marker
    dout(20) << __func__ << " setting whiteout on " << soid << dendl;
    oi.set_flag(object_info_t::FLAG_WHITEOUT);
    ctx->delta_stats.num_whiteouts++;
    t->create(soid);
    osd->logger->inc(l_osd_tier_whiteout);
    return 0;
  }

  // delete the head
  ctx->delta_stats.num_objects--;
  if (soid.is_snap())
    ctx->delta_stats.num_object_clones--;
  if (oi.is_whiteout()) {
    dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
    ctx->delta_stats.num_whiteouts--;
    oi.clear_flag(object_info_t::FLAG_WHITEOUT);
  }
  if (oi.is_cache_pinned()) {
    ctx->delta_stats.num_objects_pinned--;
  }
  obs.exists = false;
  return 0;
}
8115
// Roll the head object back to the state of snapshot op.snap.snapid.
//
// Locates the clone holding the requested snap, routes through the cache/
// manifest tiering machinery (which may block the op), and then either
// deletes the head (no snap / whiteout), clones the head in place, or
// clones the old snap over the head and re-derives sizes/digests/overlaps.
//
// @return 0 on success, -EAGAIN if blocked on recovery/promote/full cache,
//         -EBUSY if the head has watchers and must not be deleted
int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
{
  SnapSet& snapset = ctx->new_snapset;
  ObjectState& obs = ctx->new_obs;
  object_info_t& oi = obs.oi;
  const hobject_t& soid = oi.soid;
  PGTransaction* t = ctx->op_t.get();
  snapid_t snapid = (uint64_t)op.snap.snapid;
  hobject_t missing_oid;

  dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;

  ObjectContextRef rollback_to;

  int ret = find_object_context(
    hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
	      soid.get_namespace()),
    &rollback_to, false, false, &missing_oid);
  if (ret == -EAGAIN) {
    /* clone must be missing */
    ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
    dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
	     << missing_oid << " (requested snapid: ) " << snapid << dendl;
    // park the write until the clone has been recovered
    block_write_on_degraded_snap(missing_oid, ctx->op);
    return ret;
  }
  {
    // let the tiering layer decide whether the rollback may proceed now
    ObjectContextRef promote_obc;
    cache_result_t tier_mode_result;
    if (obs.exists && obs.oi.has_manifest()) {
      tier_mode_result =
	maybe_handle_manifest_detail(
	  ctx->op,
	  true,
	  rollback_to);
    } else {
      tier_mode_result =
	maybe_handle_cache_detail(
	  ctx->op,
	  true,
	  rollback_to,
	  ret,
	  missing_oid,
	  true,
	  false,
	  &promote_obc);
    }
    switch (tier_mode_result) {
    case cache_result_t::NOOP:
      break;
    case cache_result_t::BLOCKED_PROMOTE:
      ceph_assert(promote_obc);
      block_write_on_snap_rollback(soid, promote_obc, ctx->op);
      return -EAGAIN;
    case cache_result_t::BLOCKED_FULL:
      block_write_on_full_cache(soid, ctx->op);
      return -EAGAIN;
    case cache_result_t::REPLIED_WITH_EAGAIN:
      ceph_abort_msg("this can't happen, no rollback on replica");
    default:
      ceph_abort_msg("must promote was set, other values are not valid");
      return -EAGAIN;
    }
  }

  if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
    // there's no snapshot here, or there's no object.
    // if there's no snapshot, we delete the object; otherwise, do nothing.
    dout(20) << "_rollback_to deleting head on " << soid.oid
	     << " because got ENOENT|whiteout on find_object_context" << dendl;
    if (ctx->obc->obs.oi.watchers.size()) {
      // Cannot delete an object with watchers
      ret = -EBUSY;
    } else {
      _delete_oid(ctx, false, false);
      ret = 0;
    }
  } else if (ret) {
    // ummm....huh? It *can't* return anything else at time of writing.
    ceph_abort_msg("unexpected error code in _rollback_to");
  } else { //we got our context, let's use it to do the rollback!
    hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
    if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
	is_degraded_on_async_recovery_target(rollback_to_sobject)) {
      dout(20) << "_rollback_to attempted to roll back to a degraded object "
	       << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
      block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
      ret = -EAGAIN;
    } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
      // rolling back to the head; we just need to clone it.
      ctx->modify = true;
    } else {
      /* 1) Delete current head
       * 2) Clone correct snapshot into head
       * 3) Calculate clone_overlaps by following overlaps
       *    forward from rollback snapshot */
      dout(10) << "_rollback_to deleting " << soid.oid
	       << " and rolling back to old snap" << dendl;

      if (obs.exists) {
	t->remove(soid);
      }
      t->clone(soid, rollback_to_sobject);
      t->add_obc(rollback_to);

      // intersect the overlaps of every clone at or after the rollback
      // snap: what survives is the range shared with the current head
      map<snapid_t, interval_set<uint64_t> >::iterator iter =
	snapset.clone_overlap.lower_bound(snapid);
      ceph_assert(iter != snapset.clone_overlap.end());
      interval_set<uint64_t> overlaps = iter->second;
      for ( ;
	    iter != snapset.clone_overlap.end();
	    ++iter)
	overlaps.intersection_of(iter->second);

      // bytes of the old head not covered by the shared overlap changed
      if (obs.oi.size > 0) {
	interval_set<uint64_t> modified;
	modified.insert(0, obs.oi.size);
	overlaps.intersection_of(modified);
	modified.subtract(overlaps);
	ctx->modified_ranges.union_of(modified);
      }

      // Adjust the cached objectcontext
      maybe_create_new_object(ctx, true);
      ctx->delta_stats.num_bytes -= obs.oi.size;
      ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
      ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
      ctx->clean_regions.mark_omap_dirty();
      obs.oi.size = rollback_to->obs.oi.size;
      // carry over (or clear) the digests from the rollback source
      if (rollback_to->obs.oi.is_data_digest())
	obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
      else
	obs.oi.clear_data_digest();
      if (rollback_to->obs.oi.is_omap_digest())
	obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
      else
	obs.oi.clear_omap_digest();

      if (rollback_to->obs.oi.is_omap()) {
	dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
	obs.oi.set_flag(object_info_t::FLAG_OMAP);
      } else {
	dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
	obs.oi.clear_flag(object_info_t::FLAG_OMAP);
      }
    }
  }
  return ret;
}
8265
8266void PrimaryLogPG::_make_clone(
8267 OpContext *ctx,
8268 PGTransaction* t,
8269 ObjectContextRef obc,
8270 const hobject_t& head, const hobject_t& coid,
8271 object_info_t *poi)
8272{
8273 bufferlist bv;
11fdf7f2 8274 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7c673cae
FG
8275
8276 t->clone(coid, head);
11fdf7f2
TL
8277 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8278 rmattr_maybe_cache(obc, t, SS_ATTR);
7c673cae
FG
8279}
8280
// Prepare the head object for a write under the op's snap context.
//
// Maintains the dirty/omap object counters, and — when the head existed,
// there are snaps newer than the snapset's seq, and this is not a cache
// operation — creates a clone of the head (CoW for snapshots): builds the
// clone obc on the primary, records the clone in the snapset (clones,
// clone_size, clone_snaps, clone_overlap), and appends a CLONE log entry.
// Finally trims the newest clone_overlap by the ranges this op modifies
// and advances the snapset seq to the op's snap context.
void PrimaryLogPG::make_writeable(OpContext *ctx)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  SnapContext& snapc = ctx->snapc;

  // clone?
  ceph_assert(soid.snap == CEPH_NOSNAP);
  dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
	   << " snapc=" << snapc << dendl;

  // ---- dirty-flag accounting (cache tiering) ----
  bool was_dirty = ctx->obc->obs.oi.is_dirty();
  if (ctx->new_obs.exists) {
    // we will mark the object dirty
    if (ctx->undirty && was_dirty) {
      dout(20) << " clearing DIRTY flag" << dendl;
      ceph_assert(ctx->new_obs.oi.is_dirty());
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
      --ctx->delta_stats.num_objects_dirty;
      osd->logger->inc(l_osd_tier_clean);
    } else if (!was_dirty && !ctx->undirty) {
      dout(20) << " setting DIRTY flag" << dendl;
      ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
      ++ctx->delta_stats.num_objects_dirty;
      osd->logger->inc(l_osd_tier_dirty);
    }
  } else {
    if (was_dirty) {
      dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
      --ctx->delta_stats.num_objects_dirty;
    }
  }

  // ---- omap-object counter: track transitions in either direction ----
  if ((ctx->new_obs.exists &&
       ctx->new_obs.oi.is_omap()) &&
      (!ctx->obc->obs.exists ||
       !ctx->obc->obs.oi.is_omap())) {
    ++ctx->delta_stats.num_objects_omap;
  }
  if ((!ctx->new_obs.exists ||
       !ctx->new_obs.oi.is_omap()) &&
      (ctx->obc->obs.exists &&
       ctx->obc->obs.oi.is_omap())) {
    --ctx->delta_stats.num_objects_omap;
  }

  if (ctx->new_snapset.seq > snapc.seq) {
    dout(10) << " op snapset is old" << dendl;
  }

  if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
      snapc.snaps.size() &&                 // there are snaps
      !ctx->cache_operation &&
      snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
    // clone
    hobject_t coid = soid;
    coid.snap = snapc.seq;

    // count the snaps newer than the snapset's seq; those belong to the clone
    unsigned l;
    for (l = 1;
	 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
	 l++) ;

    vector<snapid_t> snaps(l);
    for (unsigned i=0; i<l; i++)
      snaps[i] = snapc.snaps[i];

    // prepare clone
    object_info_t static_snap_oi(coid);
    object_info_t *snap_oi;
    if (is_primary()) {
      // primary keeps a live object context for the clone
      ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
      ctx->clone_obc->destructor_callback =
	new C_PG_ObjectContext(this, ctx->clone_obc.get());
      ctx->clone_obc->obs.oi = static_snap_oi;
      ctx->clone_obc->obs.exists = true;
      ctx->clone_obc->ssc = ctx->obc->ssc;
      ctx->clone_obc->ssc->ref++;
      if (pool.info.is_erasure())
	ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
      snap_oi = &ctx->clone_obc->obs.oi;
      // the clone inherits the head's manifest (redirect or chunked)
      if (ctx->obc->obs.oi.has_manifest()) {
	if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
	    ctx->obc->obs.oi.manifest.is_redirect()) {
	  snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
	  snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
	  snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
	} else if (ctx->obc->obs.oi.manifest.is_chunked()) {
	  snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
	  snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
	  snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
	} else {
	  ceph_abort_msg("unrecognized manifest type");
	}
      }
      bool got = ctx->lock_manager.get_write_greedy(
	coid,
	ctx->clone_obc,
	ctx->op);
      ceph_assert(got);
      dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
    } else {
      snap_oi = &static_snap_oi;
    }
    snap_oi->version = ctx->at_version;
    snap_oi->prior_version = ctx->obs->oi.version;
    snap_oi->copy_user_bits(ctx->obs->oi);

    _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);

    // stats for the new clone object
    ctx->delta_stats.num_objects++;
    if (snap_oi->is_dirty()) {
      ctx->delta_stats.num_objects_dirty++;
      osd->logger->inc(l_osd_tier_dirty);
    }
    if (snap_oi->is_omap())
      ctx->delta_stats.num_objects_omap++;
    if (snap_oi->is_cache_pinned())
      ctx->delta_stats.num_objects_pinned++;
    if (snap_oi->has_manifest())
      ctx->delta_stats.num_objects_manifest++;
    ctx->delta_stats.num_object_clones++;
    ctx->new_snapset.clones.push_back(coid.snap);
    ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
    ctx->new_snapset.clone_snaps[coid.snap] = snaps;

    // clone_overlap should contain an entry for each clone
    // (an empty interval_set if there is no overlap)
    ctx->new_snapset.clone_overlap[coid.snap];
    if (ctx->obs->oi.size)
      ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);

    // log clone
    dout(10) << " cloning v " << ctx->obs->oi.version
	     << " to " << coid << " v " << ctx->at_version
	     << " snaps=" << snaps
	     << " snapset=" << ctx->new_snapset << dendl;
    ctx->log.push_back(pg_log_entry_t(
			 pg_log_entry_t::CLONE, coid, ctx->at_version,
			 ctx->obs->oi.version,
			 ctx->obs->oi.user_version,
			 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
    encode(snaps, ctx->log.back().snaps);

    ctx->at_version.version++;
  }

  // update most recent clone_overlap and usage stats
  if (ctx->new_snapset.clones.size() > 0) {
    // the clone_overlap is difference of range between head and clones.
    // we need to check whether the most recent clone exists, if it's
    // been evicted, it's not included in the stats, but the clone_overlap
    // is still exist in the snapset, so we should update the
    // clone_overlap to make it sense.
    hobject_t last_clone_oid = soid;
    last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
    interval_set<uint64_t> &newest_overlap =
      ctx->new_snapset.clone_overlap.rbegin()->second;
    ctx->modified_ranges.intersection_of(newest_overlap);
    if (is_present_clone(last_clone_oid)) {
      // modified_ranges is still in use by the clone
      ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
    }
    newest_overlap.subtract(ctx->modified_ranges);
  }

  if (snapc.seq > ctx->new_snapset.seq) {
    // update snapset with latest snap context
    ctx->new_snapset.seq = snapc.seq;
    if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
      ctx->new_snapset.snaps = snapc.snaps;
    } else {
      // octopus and later keep per-clone snaps only (clone_snaps)
      ctx->new_snapset.snaps.clear();
    }
  }
  dout(20) << "make_writeable " << soid
	   << " done, snapset=" << ctx->new_snapset << dendl;
}
8459
8460
8461void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8462 interval_set<uint64_t>& modified, uint64_t offset,
8463 uint64_t length, bool write_full)
8464{
8465 interval_set<uint64_t> ch;
8466 if (write_full) {
8467 if (oi.size)
8468 ch.insert(0, oi.size);
8469 } else if (length)
8470 ch.insert(offset, length);
8471 modified.union_of(ch);
11fdf7f2
TL
8472 if (write_full ||
8473 (offset + length > oi.size && length)) {
7c673cae
FG
8474 uint64_t new_size = offset + length;
8475 delta_stats.num_bytes -= oi.size;
8476 delta_stats.num_bytes += new_size;
8477 oi.size = new_size;
8478 }
f67539c2 8479
7c673cae 8480 delta_stats.num_wr++;
11fdf7f2 8481 delta_stats.num_wr_kb += shift_round_up(length, 10);
7c673cae
FG
8482}
8483
11fdf7f2
TL
8484void PrimaryLogPG::truncate_update_size_and_usage(
8485 object_stat_sum_t& delta_stats,
8486 object_info_t& oi,
8487 uint64_t truncate_size)
7c673cae 8488{
11fdf7f2
TL
8489 if (oi.size != truncate_size) {
8490 delta_stats.num_bytes -= oi.size;
8491 delta_stats.num_bytes += truncate_size;
8492 oi.size = truncate_size;
7c673cae
FG
8493 }
8494}
8495
8496void PrimaryLogPG::complete_disconnect_watches(
8497 ObjectContextRef obc,
8498 const list<watch_disconnect_t> &to_disconnect)
8499{
8500 for (list<watch_disconnect_t>::const_iterator i =
8501 to_disconnect.begin();
8502 i != to_disconnect.end();
8503 ++i) {
8504 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8505 auto watchers_entry = obc->watchers.find(watcher);
8506 if (watchers_entry != obc->watchers.end()) {
8507 WatchRef watch = watchers_entry->second;
8508 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8509 obc->watchers.erase(watcher);
8510 watch->remove(i->send_disconnect);
8511 } else {
8512 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8513 << watcher << dendl;
8514 }
8515 }
8516}
8517
// Apply the side effects queued on an op context after the op itself ran:
// watcher disconnects, new/renewed watches, notifies, and notify-acks.
//
// @param ctx   completed op context carrying the queued effects
// @param conn  client connection the effects apply to (must be non-null;
//              effects other than disconnects are skipped if it has no
//              attached session)
void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
{
  entity_name_t entity = ctx->reqid.name;
  dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;

  // disconnects first
  complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);

  ceph_assert(conn);

  // no session on the connection -> nothing else to apply
  auto session = conn->get_priv();
  if (!session)
    return;

  // connect (or reconnect) each requested watch for this entity
  for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
       i != ctx->watch_connects.end();
       ++i) {
    pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
    dout(15) << "do_osd_op_effects applying watch connect on session "
	     << session.get() << " watcher " << watcher << dendl;
    WatchRef watch;
    if (ctx->obc->watchers.count(watcher)) {
      dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
	       << dendl;
      watch = ctx->obc->watchers[watcher];
    } else {
      dout(15) << "do_osd_op_effects new watcher " << watcher
	       << dendl;
      watch = Watch::makeWatchRef(
	this, osd, ctx->obc, i->first.timeout_seconds,
	i->first.cookie, entity, conn->get_peer_addr());
      ctx->obc->watchers.insert(
	make_pair(
	  watcher,
	  watch));
    }
    watch->connect(conn, i->second);
  }

  // fan each queued notify out to every watcher of the object
  for (list<notify_info_t>::iterator p = ctx->notifies.begin();
       p != ctx->notifies.end();
       ++p) {
    dout(10) << "do_osd_op_effects, notify " << *p << dendl;
    // NOTE: this local 'conn' intentionally shadows the parameter with the
    // connection taken from the originating request
    ConnectionRef conn(ctx->op->get_req()->get_connection());
    NotifyRef notif(
      Notify::makeNotifyRef(
	conn,
	ctx->reqid.name.num(),
	p->bl,
	p->timeout,
	p->cookie,
	p->notify_id,
	ctx->obc->obs.oi.user_version,
	osd));
    for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
	   ctx->obc->watchers.begin();
	 i != ctx->obc->watchers.end();
	 ++i) {
      dout(10) << "starting notify on watch " << i->first << dendl;
      i->second->start_notify(notif);
    }
    notif->init();
  }

  // deliver notify-acks to the matching watches owned by this entity
  for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
       p != ctx->notify_acks.end();
       ++p) {
    if (p->watch_cookie)
      dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
    else
      dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
    for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
	   ctx->obc->watchers.begin();
	 i != ctx->obc->watchers.end();
	 ++i) {
      if (i->first.second != entity) continue;
      // absent cookie means "ack on all of this entity's watches"
      if (p->watch_cookie &&
	  *(p->watch_cookie) != i->first.first) continue;
      dout(10) << "acking notify on watch " << i->first << dendl;
      i->second->notify_ack(p->notify_id, p->reply_bl);
    }
  }
}
8601
8602hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8603{
8604 ostringstream ss;
8605 ss << "temp_" << info.pgid << "_" << get_role()
8606 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8607 hobject_t hoid = target.make_temp_hobject(ss.str());
8608 dout(20) << __func__ << " " << hoid << dendl;
8609 return hoid;
8610}
8611
8612hobject_t PrimaryLogPG::get_temp_recovery_object(
8613 const hobject_t& target,
8614 eversion_t version)
8615{
8616 ostringstream ss;
8617 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8618 << "_" << version
8619 << "_" << info.history.same_interval_since
8620 << "_" << target.snap;
8621 // pgid + version + interval + snapid is unique, and short
8622 hobject_t hoid = target.make_temp_hobject(ss.str());
8623 dout(20) << __func__ << " " << hoid << dendl;
8624 return hoid;
8625}
8626
/*
 * Prepare the transaction for an op context: execute the osd ops,
 * short-circuit pure reads / write no-ops, enforce pool-full policy,
 * clone the head if a write requires it, and finalize the ctx.
 *
 * Returns 0 on success; -EINVAL for a bad snap context; the error from
 * do_osd_ops() on failure; -EDQUOT/-ENOSPC for a FULL_TRY op on a full
 * pool; -EAGAIN to silently drop the request of a misbehaving client.
 */
int PrimaryLogPG::prepare_transaction(OpContext *ctx)
{
  ceph_assert(!ctx->ops->empty());

  // valid snap context?
  if (!ctx->snapc.is_valid()) {
    dout(10) << " invalid snapc " << ctx->snapc << dendl;
    return -EINVAL;
  }

  // prepare the actual mutation
  int result = do_osd_ops(ctx, *ctx->ops);
  if (result < 0) {
    if (ctx->op->may_write() &&
	get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
      // need to save the error code in the pg log, to detect dup ops,
      // but do nothing else
      ctx->update_log_only = true;
    }
    return result;
  }

  // read-op? write-op noop? done?
  if (ctx->op_t->empty() && !ctx->modify) {
    if (ctx->pending_async_reads.empty())
      unstable_stats.add(ctx->delta_stats);
    if (ctx->op->may_write() &&
	get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
      // still log the (no-op) write so dup-op detection keeps working
      ctx->update_log_only = true;
    }
    return result;
  }

  // check for full
  if ((ctx->delta_stats.num_bytes > 0 ||
       ctx->delta_stats.num_objects > 0) && // FIXME: keys?
      pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
    auto m = ctx->op->get_req<MOSDOp>();
    if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
	m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
      dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
	       << dendl;
    } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
      // they tried, they failed.
      dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
      return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
    } else {
      // drop request
      dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
      return -EAGAIN;
    }
  }

  const hobject_t& soid = ctx->obs->oi.soid;
  // clone, if necessary
  if (soid.snap == CEPH_NOSNAP)
    make_writeable(ctx);

  // log entry type depends on whether the object survives this txn
  finish_ctx(ctx,
	     ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
	     pg_log_entry_t::DELETE,
	     result);

  return result;
}
8692
/*
 * Finalize an op context after the ops have been prepared: bump
 * user_version for user-visible modifies, persist the updated
 * object_info_t (and SnapSet, on head objects) as object attrs,
 * append the pg log entry (with clone snaps, per-op return values and
 * clean_regions attached), and install the new object state into the
 * obc / snapset context.
 */
void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(20) << __func__ << " " << soid << " " << ctx
	   << " op " << pg_log_entry_t::get_op_name(log_op_type)
	   << dendl;
  utime_t now = ceph_clock_now();

#ifdef HAVE_JAEGER
  if (ctx->op->osd_parent_span) {
    auto finish_ctx_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
  }
#endif
  // Drop the reference if deduped chunk is modified
  if (ctx->new_obs.oi.is_dirty() &&
      (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
      // If a clone is creating, ignore dropping the reference for manifest object
      !ctx->delta_stats.num_object_clones &&
      ctx->new_obs.oi.size != 0 && // missing, redirect and delete
      !ctx->cache_operation &&
      log_op_type != pg_log_entry_t::PROMOTE) {
    dec_refcount_by_dirty(ctx);
  }

  // finish and log the op.
  if (ctx->user_modify) {
    // update the user_version for any modify ops, except for the watch op
    ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
    /* In order for new clients and old clients to interoperate properly
     * when exchanging versions, we need to lower bound the user_version
     * (which our new clients pay proper attention to)
     * by the at_version (which is all the old clients can ever see). */
    if (ctx->at_version.version > ctx->user_at_version)
      ctx->user_at_version = ctx->at_version.version;
    ctx->new_obs.oi.user_version = ctx->user_at_version;
  }
  ctx->bytes_written = ctx->op_t->get_bytes_written();

  if (ctx->new_obs.exists) {
    // stamp the new object_info with this txn's versions/reqid/mtime
    ctx->new_obs.oi.version = ctx->at_version;
    ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
    ctx->new_obs.oi.last_reqid = ctx->reqid;
    if (ctx->mtime != utime_t()) {
      ctx->new_obs.oi.mtime = ctx->mtime;
      dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
      ctx->new_obs.oi.local_mtime = now;
    } else {
      dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
    }

    // object_info_t
    map <string, bufferlist> attrs;
    bufferlist bv(sizeof(ctx->new_obs.oi));
    encode(ctx->new_obs.oi, bv,
	   get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    attrs[OI_ATTR] = std::move(bv);

    // snapset
    if (soid.snap == CEPH_NOSNAP) {
      dout(10) << " final snapset " << ctx->new_snapset
	       << " in " << soid << dendl;
      bufferlist bss;
      encode(ctx->new_snapset, bss);
      attrs[SS_ATTR] = std::move(bss);
    } else {
      dout(10) << " no snapset (this is a clone)" << dendl;
    }
    ctx->op_t->setattrs(soid, attrs);
  } else {
    // reset cached oi
    ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
  }

  // append to log
  ctx->log.push_back(
    pg_log_entry_t(log_op_type, soid, ctx->at_version,
		   ctx->obs->oi.version,
		   ctx->user_at_version, ctx->reqid,
		   ctx->mtime,
		   (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
  if (ctx->op && ctx->op->allows_returnvec()) {
    // also the per-op values
    ctx->log.back().set_op_returns(*ctx->ops);
    dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
	     << dendl;
  }

  ctx->log.back().clean_regions = ctx->clean_regions;
  dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;

  // for clone-modifying entries, record the clone's snaps in the entry
  if (soid.snap < CEPH_NOSNAP) {
    switch (log_op_type) {
    case pg_log_entry_t::MODIFY:
    case pg_log_entry_t::PROMOTE:
    case pg_log_entry_t::CLEAN:
      dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
	       << dendl;
      encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
      break;
    default:
      break;
    }
  }

  if (!ctx->extra_reqids.empty()) {
    dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
	     << ctx->extra_reqid_return_codes << dendl;
    ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
    ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
  }

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  if (soid.is_head() && !ctx->obc->obs.exists) {
    ctx->obc->ssc->exists = false;
    ctx->obc->ssc->snapset = SnapSet();
  } else {
    ctx->obc->ssc->exists = true;
    ctx->obc->ssc->snapset = ctx->new_snapset;
  }
}
8815
8816void PrimaryLogPG::apply_stats(
8817 const hobject_t &soid,
8818 const object_stat_sum_t &delta_stats) {
8819
9f95a23c
TL
8820 recovery_state.apply_op_stats(soid, delta_stats);
8821 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
8822 i != get_backfill_targets().end();
7c673cae
FG
8823 ++i) {
8824 pg_shard_t bt = *i;
9f95a23c
TL
8825 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
8826 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
7c673cae 8827 pending_backfill_updates[soid].stats.add(delta_stats);
9f95a23c 8828 }
7c673cae
FG
8829 }
8830
f67539c2 8831 m_scrubber->stats_of_handled_objects(delta_stats, soid);
7c673cae
FG
8832}
8833
/*
 * Complete a read op: collapse per-op rvals into the overall result,
 * fill in the reply's version fields, send the MOSDOpReply back to the
 * client, and close the op context.  Takes ownership of ctx->reply.
 */
void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
{
  auto m = ctx->op->get_req<MOSDOp>();
  ceph_assert(ctx->async_reads_complete());

  for (auto p = ctx->ops->begin();
    p != ctx->ops->end() && result >= 0; ++p) {
    // first failing sub-op (without FAILOK) becomes the overall result
    if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
      result = p->rval;
      break;
    }
    ctx->bytes_read += p->outdata.length();
  }
  ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);

  // take ownership of the reply out of the ctx before sending
  MOSDOpReply *reply = ctx->reply;
  ctx->reply = nullptr;

  if (result >= 0) {
    if (!ctx->ignore_log_op_stats) {
      log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);

      publish_stats_to_osd();
    }

    // on read, return the current object version
    if (ctx->obs) {
      reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
    } else {
      reply->set_reply_versions(eversion_t(), ctx->user_at_version);
    }
  } else if (result == -ENOENT) {
    // on ENOENT, set a floor for what the next user version will be.
    reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
  }

  reply->set_result(result);
  reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
  osd->send_message_osd_client(reply, m->get_connection());
  close_op_ctx(ctx);
}
8875
8876// ========================================================================
8877// copyfrom
8878
/// Completion for a copy-from chunk read issued by _copy_some():
/// retakes the PG lock and feeds the result into process_copy_chunk().
struct C_Copyfrom : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;                 // destination object of the copy
  epoch_t last_peering_reset;    // epoch at submission, to detect staleness
  ceph_tid_t tid;                // objecter tid; filled in after submission
  PrimaryLogPG::CopyOpRef cop;	// used for keeping the cop alive
  C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
	     const PrimaryLogPG::CopyOpRef& c)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), cop(c)
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    std::scoped_lock l{*pg};
    // drop the completion (and the cop reference) if the PG has gone
    // through a peering reset since this copy was started
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->process_copy_chunk(oid, tid, r);
      cop.reset();
    }
  }
};
8900
/// Async-read completion used by do_copy_get() on erasure-coded pools:
/// trims the read buffer down to the requested length and encodes the
/// finished object_copy_data_t reply into the op's outdata.
struct C_CopyFrom_AsyncReadCb : public Context {
  OSDOp *osd_op;
  object_copy_data_t reply_obj;  // reply under construction
  uint64_t features;             // client features, for encoding compat
  size_t len;                    // bytes requested from the async read
  C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
    osd_op(osd_op), features(features), len(0) {}
  void finish(int r) override {
    osd_op->rval = r;
    if (r < 0) {
      return;
    }

    ceph_assert(len > 0);
    ceph_assert(len <= reply_obj.data.length());
    // keep only the first len bytes of what was actually read
    bufferlist bl;
    bl.substr_of(reply_obj.data, 0, len);
    reply_obj.data.swap(bl);
    encode(reply_obj, osd_op->outdata, features);
  }
};
8922
/// Completion for one chunk read of a chunked-manifest copy: retakes
/// the PG lock and feeds the result into process_copy_chunk_manifest().
struct C_CopyChunk : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;                 // destination object of the copy
  epoch_t last_peering_reset;    // epoch at submission, to detect staleness
  ceph_tid_t tid;                // objecter tid; filled in after submission
  PrimaryLogPG::CopyOpRef cop;   // used for keeping the cop alive
  uint64_t offset = 0;           // chunk offset this completion is for
  C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
	      const PrimaryLogPG::CopyOpRef& c)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), cop(c)
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    std::scoped_lock l{*pg};
    // drop the completion if the PG has gone through a peering reset
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->process_copy_chunk_manifest(oid, tid, r, offset);
      cop.reset();
    }
  }
};
8945
/*
 * Implement the COPY_GET op: fill an object_copy_data_t with as much of
 * the object (attrs first, then data, then omap) as fits in the
 * client-supplied out_max budget, advancing the client's cursor.
 * Request ids are appended only on the final chunk.
 *
 * On erasure-coded pools the data read is asynchronous: the encoded
 * reply is produced later by C_CopyFrom_AsyncReadCb and -EINPROGRESS
 * is returned here.  Returns 0 on success, -EINVAL on a malformed
 * cursor, or the error from the attr/data read.
 */
int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
			      OSDOp& osd_op, ObjectContextRef &obc)
{
  object_info_t& oi = obc->obs.oi;
  hobject_t& soid = oi.soid;
  int result = 0;
  object_copy_cursor_t cursor;
  uint64_t out_max;
  try {
    decode(cursor, bp);
    decode(out_max, bp);
  }
  catch (ceph::buffer::error& e) {
    result = -EINVAL;
    return result;
  }

  const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
  uint64_t features = op->get_features();

  bool async_read_started = false;
  object_copy_data_t _reply_obj;
  C_CopyFrom_AsyncReadCb *cb = nullptr;
  if (pool.info.is_erasure()) {
    // EC data reads are async; the cb owns the reply and encodes it
    // once the read completes
    cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
  }
  object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
  // size, mtime
  reply_obj.size = oi.size;
  reply_obj.mtime = oi.mtime;
  ceph_assert(obc->ssc);
  if (soid.snap < CEPH_NOSNAP) {
    // clones report their snaps; heads report the snapset seq
    auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
    ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
    reply_obj.snaps = p->second;
  } else {
    reply_obj.snap_seq = obc->ssc->snapset.seq;
  }
  if (oi.is_data_digest()) {
    reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
    reply_obj.data_digest = oi.data_digest;
  }
  if (oi.is_omap_digest()) {
    reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
    reply_obj.omap_digest = oi.omap_digest;
  }
  reply_obj.truncate_seq = oi.truncate_seq;
  reply_obj.truncate_size = oi.truncate_size;

  // attrs
  map<string,bufferlist>& out_attrs = reply_obj.attrs;
  if (!cursor.attr_complete) {
    result = getattrs_maybe_cache(
      ctx->obc,
      &out_attrs);
    if (result < 0) {
      if (cb) {
	delete cb;
      }
      return result;
    }
    cursor.attr_complete = true;
    dout(20) << " got attrs" << dendl;
  }

  // remaining budget after what is already encoded in outdata
  int64_t left = out_max - osd_op.outdata.length();

  // data
  bufferlist& bl = reply_obj.data;
  if (left > 0 && !cursor.data_complete) {
    if (cursor.data_offset < oi.size) {
      uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
      if (cb) {
	async_read_started = true;
	ctx->pending_async_reads.push_back(
	  make_pair(
	    boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
	    make_pair(&bl, cb)));
	cb->len = max_read;

	ctx->op_finishers[ctx->current_osd_subop_num].reset(
	  new ReadFinisher(osd_op));
	result = -EINPROGRESS;

	dout(10) << __func__ << ": async_read noted for " << soid << dendl;
      } else {
	result = pgbackend->objects_read_sync(
	  oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
	if (result < 0)
	  return result;
      }
      left -= max_read;
      cursor.data_offset += max_read;
    }
    if (cursor.data_offset == oi.size) {
      cursor.data_complete = true;
      dout(20) << " got data" << dendl;
    }
    ceph_assert(cursor.data_offset <= oi.size);
  }

  // omap
  uint32_t omap_keys = 0;
  if (!pool.info.supports_omap() || !oi.is_omap()) {
    cursor.omap_complete = true;
  } else {
    if (left > 0 && !cursor.omap_complete) {
      ceph_assert(cursor.data_complete);
      // header goes in the first omap chunk only
      if (cursor.omap_offset.empty()) {
	osd->store->omap_get_header(ch, ghobject_t(oi.soid),
				    &reply_obj.omap_header);
      }
      bufferlist omap_data;
      ObjectMap::ObjectMapIterator iter =
	osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
      ceph_assert(iter);
      iter->upper_bound(cursor.omap_offset);
      for (; iter->valid(); iter->next()) {
	++omap_keys;
	encode(iter->key(), omap_data);
	encode(iter->value(), omap_data);
	// 4-byte length prefixes accompany each encoded key and value
	left -= iter->key().length() + 4 + iter->value().length() + 4;
	if (left <= 0)
	  break;
      }
      if (omap_keys) {
	encode(omap_keys, reply_obj.omap_data);
	reply_obj.omap_data.claim_append(omap_data);
      }
      if (iter->valid()) {
	// more omap remains: remember where to resume
	cursor.omap_offset = iter->key();
      } else {
	cursor.omap_complete = true;
	dout(20) << " got omap" << dendl;
      }
    }
  }

  if (cursor.is_complete()) {
    // include reqids only in the final step. this is a bit fragile
    // but it works...
    recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
							    &reply_obj.reqids,
							    &reply_obj.reqid_return_codes);
    dout(20) << " got reqids" << dendl;
  }

  dout(20) << " cursor.is_complete=" << cursor.is_complete()
	   << " " << out_attrs.size() << " attrs"
	   << " " << bl.length() << " bytes"
	   << " " << reply_obj.omap_header.length() << " omap header bytes"
	   << " " << reply_obj.omap_data.length() << " omap data bytes in "
	   << omap_keys << " keys"
	   << " " << reply_obj.reqids.size() << " reqids"
	   << dendl;
  reply_obj.cursor = cursor;
  if (!async_read_started) {
    // sync path: encode the reply now (async path encodes in the cb)
    encode(reply_obj, osd_op.outdata, features);
  }
  if (cb && !async_read_started) {
    delete cb;
  }

  if (result > 0) {
    result = 0;
  }
  return result;
}
9114
9115void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
9116 OSDOp& osd_op)
9117{
9f95a23c 9118 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
7c673cae
FG
9119 uint64_t features = m->get_features();
9120 object_copy_data_t reply_obj;
9121
9f95a23c 9122 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
11fdf7f2 9123 &reply_obj.reqid_return_codes);
7c673cae 9124 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
11fdf7f2 9125 encode(reply_obj, osd_op.outdata, features);
7c673cae 9126 osd_op.rval = -ENOENT;
11fdf7f2 9127 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
7c673cae
FG
9128 reply->set_result(-ENOENT);
9129 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9130 osd->send_message_osd_client(reply, m->get_connection());
9131}
9132
/*
 * Begin copying src@version into the object referenced by obc.  Any
 * in-progress copy to the same destination is cancelled first.  The
 * chunked fetch is driven by _copy_some(), or _copy_some_manifest()
 * when the destination is a chunked-manifest object; cb completes when
 * the copy finishes or fails.
 */
void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
			      hobject_t src, object_locator_t oloc,
			      version_t version, unsigned flags,
			      bool mirror_snapset,
			      unsigned src_obj_fadvise_flags,
			      unsigned dest_obj_fadvise_flags)
{
  const hobject_t& dest = obc->obs.oi.soid;
  dout(10) << __func__ << " " << dest
	   << " from " << src << " " << oloc << " v" << version
	   << " flags " << flags
	   << (mirror_snapset ? " mirror_snapset" : "")
	   << dendl;

  // mirroring the snapset only makes sense from a head (CEPH_NOSNAP) source
  ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);

  // cancel a previous in-progress copy?
  if (copy_ops.count(dest)) {
    // FIXME: if the src etc match, we could avoid restarting from the
    // beginning.
    CopyOpRef cop = copy_ops[dest];
    vector<ceph_tid_t> tids;
    cancel_copy(cop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }

  CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
					 mirror_snapset, src_obj_fadvise_flags,
					 dest_obj_fadvise_flags));
  copy_ops[dest] = cop;
  // block access to the destination object while the copy is in flight
  obc->start_block();

  if (!obc->obs.oi.has_manifest()) {
    _copy_some(obc, cop);
  } else {
    if (obc->obs.oi.manifest.is_redirect()) {
      _copy_some(obc, cop);
    } else if (obc->obs.oi.manifest.is_chunked()) {
      // start from the first chunk in the manifest
      auto p = obc->obs.oi.manifest.chunk_map.begin();
      _copy_some_manifest(obc, cop, p->first);
    } else {
      ceph_abort_msg("unrecognized manifest type");
    }
  }
}
9178
/*
 * Issue the next COPY_GET chunk read for an in-progress copy.  On the
 * initial chunk of a snapset-mirroring copy, a list-snaps read is
 * gathered in as well; on subsequent chunks the previously-learned
 * source version is asserted so a changed source fails the copy.
 * Completion lands in process_copy_chunk() via C_Copyfrom.
 */
void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
{
  dout(10) << __func__ << " " << *obc << " " << cop << dendl;

  // translate copy-from flags into osd op flags
  unsigned flags = 0;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
    flags |= CEPH_OSD_FLAG_FLUSH;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
    flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
    flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
    flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
  if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
    flags |= CEPH_OSD_FLAG_RWORDERED;

  C_GatherBuilder gather(cct);

  if (cop->cursor.is_initial() && cop->mirror_snapset) {
    // list snaps too.
    ceph_assert(cop->src.snap == CEPH_NOSNAP);
    ObjectOperation op;
    op.list_snaps(&cop->results.snapset, NULL);
    ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
					 CEPH_SNAPDIR, NULL,
					 flags, gather.new_sub(), NULL);
    cop->objecter_tid2 = tid;
  }

  ObjectOperation op;
  if (cop->results.user_version) {
    op.assert_version(cop->results.user_version);
  } else {
    // we should learn the version after the first chunk, if we didn't know
    // it already!
    ceph_assert(cop->cursor.is_initial());
  }
  op.copy_get(&cop->cursor, get_copy_chunk_size(),
	      &cop->results.object_size, &cop->results.mtime,
	      &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
	      &cop->results.snaps, &cop->results.snap_seq,
	      &cop->results.flags,
	      &cop->results.source_data_digest,
	      &cop->results.source_omap_digest,
	      &cop->results.reqids,
	      &cop->results.reqid_return_codes,
	      &cop->results.truncate_seq,
	      &cop->results.truncate_size,
	      &cop->rval);
  op.set_last_op_flags(cop->src_obj_fadvise_flags);

  C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
				   get_last_peering_reset(), cop);
  gather.set_finisher(new C_OnFinisher(fin,
				       osd->get_objecter_finisher(get_pg_shard())));

  ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
				       cop->src.snap, NULL,
				       flags,
				       gather.new_sub(),
				       // discover the object version if we don't know it yet
				       cop->results.user_version ? NULL : &cop->results.user_version);
  fin->tid = tid;
  cop->objecter_tid = tid;
  gather.activate();
}
9245
11fdf7f2
TL
9246void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9247{
9248 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9249
9250 unsigned flags = 0;
9251 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9252 flags |= CEPH_OSD_FLAG_FLUSH;
9253 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9254 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9255 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9256 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9257 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9258 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9259 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9260 flags |= CEPH_OSD_FLAG_RWORDERED;
9261
9262 int num_chunks = 0;
9263 uint64_t last_offset = 0, chunks_size = 0;
9264 object_manifest_t *manifest = &obc->obs.oi.manifest;
f67539c2 9265 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
11fdf7f2
TL
9266 for (;iter != manifest->chunk_map.end(); ++iter) {
9267 num_chunks++;
9268 chunks_size += iter->second.length;
9269 last_offset = iter->first;
9270 if (get_copy_chunk_size() < chunks_size) {
9271 break;
9272 }
9273 }
9274
9275 cop->num_chunk = num_chunks;
9276 cop->start_offset = start_offset;
9277 cop->last_offset = last_offset;
9278 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
f67539c2 9279 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
11fdf7f2
TL
9280 << " last_offset: " << last_offset << dendl;
9281
9282 iter = manifest->chunk_map.find(start_offset);
9283 for (;iter != manifest->chunk_map.end(); ++iter) {
9284 uint64_t obj_offset = iter->first;
9285 uint64_t length = manifest->chunk_map[iter->first].length;
9286 hobject_t soid = manifest->chunk_map[iter->first].oid;
9287 object_locator_t oloc(soid);
9288 CopyCallback * cb = NULL;
9289 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9290 cop->results.user_version, cop->flags, cop->mirror_snapset,
9291 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9292 sub_cop->cursor.data_offset = obj_offset;
9293 cop->chunk_cops[obj_offset] = sub_cop;
9294
9295 int s = sub_cop->chunk_ops.size();
9296 sub_cop->chunk_ops.resize(s+1);
9297 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9298 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9299 sub_cop->chunk_ops[s].op.extent.length = length;
9300
9301 ObjectOperation op;
9302 op.dup(sub_cop->chunk_ops);
9303
11fdf7f2
TL
9304 if (cop->results.user_version) {
9305 op.assert_version(cop->results.user_version);
9306 } else {
9307 // we should learn the version after the first chunk, if we didn't know
9308 // it already!
9309 ceph_assert(cop->cursor.is_initial());
9310 }
9311 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9312
9313 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9314 get_last_peering_reset(), cop);
9315 fin->offset = obj_offset;
9f95a23c
TL
9316
9317 ceph_tid_t tid = osd->objecter->read(
9318 soid.oid, oloc, op,
9319 sub_cop->src.snap, NULL,
9320 flags,
9321 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9322 // discover the object version if we don't know it yet
9323 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
11fdf7f2
TL
9324 fin->tid = tid;
9325 sub_cop->objecter_tid = tid;
f67539c2
TL
9326
9327 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9328 << manifest->chunk_map[iter->first].offset
9329 << " length: " << length << " pool id: " << oloc.pool
9330 << " tid: " << tid << dendl;
9331
11fdf7f2
TL
9332 if (last_offset < iter->first) {
9333 break;
9334 }
9335 }
9336}
9337
/*
 * Handle completion of one COPY_GET chunk for the copy to oid.
 * Sanitizes the source's clone snaps against removed snaps, folds the
 * chunk into the running data/omap digests and attr set, and either
 * writes the partial data to a temp object and requests the next chunk,
 * or — on the final chunk — verifies digests, builds the
 * fill_in_final_tx closure, and completes the copy callback.  Stale
 * completions (unknown oid or mismatched tid) are ignored.
 */
void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
  if (p == copy_ops.end()) {
    dout(10) << __func__ << " no copy_op found" << dendl;
    return;
  }
  CopyOpRef cop = p->second;
  if (tid != cop->objecter_tid) {
    // completion for a cancelled/replaced read; ignore
    dout(10) << __func__ << " tid " << tid << " != cop " << cop
	     << " tid " << cop->objecter_tid << dendl;
    return;
  }

  if (cop->omap_data.length() || cop->omap_header.length())
    cop->results.has_omap = true;

  // refuse to copy omap content into a pool that cannot store it
  if (r >= 0 && !pool.info.supports_omap() &&
      (cop->omap_data.length() || cop->omap_header.length())) {
    r = -EOPNOTSUPP;
  }
  cop->objecter_tid = 0;
  cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
  ObjectContextRef& cobc = cop->obc;

  if (r < 0)
    goto out;

  ceph_assert(cop->rval >= 0);

  if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
    // verify snap hasn't been deleted
    vector<snapid_t>::iterator p = cop->results.snaps.begin();
    while (p != cop->results.snaps.end()) {
      // make best effort to sanitize snaps/clones.
      if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
	dout(10) << __func__ << " clone snap " << *p << " has been deleted"
		 << dendl;
	// shift the remaining snaps left by one to drop *p
	for (vector<snapid_t>::iterator q = p + 1;
	     q != cop->results.snaps.end();
	     ++q)
	  *(q - 1) = *q;
	cop->results.snaps.resize(cop->results.snaps.size() - 1);
      } else {
	++p;
      }
    }
    if (cop->results.snaps.empty()) {
      dout(10) << __func__ << " no more snaps for " << oid << dendl;
      r = -ENOENT;
      goto out;
    }
  }

  ceph_assert(cop->rval >= 0);

  // fold this chunk into the running digests
  if (!cop->temp_cursor.data_complete) {
    cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
  }
  if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
    if (cop->omap_header.length()) {
      cop->results.omap_digest =
	cop->omap_header.crc32c(cop->results.omap_digest);
    }
    if (cop->omap_data.length()) {
      // skip the leading 4-byte key count when digesting the omap keys
      bufferlist keys;
      keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
      cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
    }
  }

  if (!cop->temp_cursor.attr_complete) {
    // store attrs under the user-visible "_" xattr prefix
    for (map<string,bufferlist>::iterator p = cop->attrs.begin();
	 p != cop->attrs.end();
	 ++p) {
      cop->results.attrs[string("_") + p->first] = p->second;
    }
    cop->attrs.clear();
  }

  if (!cop->cursor.is_complete()) {
    // write out what we have so far
    if (cop->temp_cursor.is_initial()) {
      ceph_assert(!cop->results.started_temp_obj);
      cop->results.started_temp_obj = true;
      cop->results.temp_oid = generate_temp_object(oid);
      dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
    }
    ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    if (cop->temp_cursor.is_initial()) {
      ctx->new_temp_oid = cop->results.temp_oid;
    }
    _write_copy_chunk(cop, ctx->op_t.get());
    simple_opc_submit(std::move(ctx));
    dout(10) << __func__ << " fetching more" << dendl;
    _copy_some(cobc, cop);
    return;
  }

  // verify digests?
  if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
    dout(20) << __func__ << std::hex
      << " got digest: rx data 0x" << cop->results.data_digest
      << " omap 0x" << cop->results.omap_digest
      << ", source: data 0x" << cop->results.source_data_digest
      << " omap 0x" <<  cop->results.source_omap_digest
      << std::dec
      << " flags " << cop->results.flags
      << dendl;
  }
  if (cop->results.is_data_digest() &&
      cop->results.data_digest != cop->results.source_data_digest) {
    derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
	 << " != source 0x" << cop->results.source_data_digest << std::dec
	 << dendl;
    osd->clog->error() << info.pgid << " copy from " << cop->src
		       << " to " << cop->obc->obs.oi.soid << std::hex
		       << " data digest 0x" << cop->results.data_digest
		       << " != source 0x" << cop->results.source_data_digest
		       << std::dec;
    r = -EIO;
    goto out;
  }
  if (cop->results.is_omap_digest() &&
      cop->results.omap_digest != cop->results.source_omap_digest) {
    derr << __func__ << std::hex
	 << " omap digest 0x" << cop->results.omap_digest
	 << " != source 0x" << cop->results.source_omap_digest
	 << std::dec << dendl;
    osd->clog->error() << info.pgid << " copy from " << cop->src
		       << " to " << cop->obc->obs.oi.soid << std::hex
		       << " omap digest 0x" << cop->results.omap_digest
		       << " != source 0x" << cop->results.source_omap_digest
		       << std::dec;
    r = -EIO;
    goto out;
  }
  if (cct->_conf->osd_debug_inject_copyfrom_error) {
    derr << __func__ << " injecting copyfrom failure" << dendl;
    r = -EIO;
    goto out;
  }

  cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
    [this, &cop /* avoid ref cycle */](PGTransaction *t) {
      ObjectState& obs = cop->obc->obs;
      if (cop->temp_cursor.is_initial()) {
	dout(20) << "fill_in_final_tx: writing "
		 << "directly to final object" << dendl;
	// write directly to final object
	cop->results.temp_oid = obs.oi.soid;
	_write_copy_chunk(cop, t);
      } else {
	// finish writing to temp object, then move into place
	dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
	if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
	  /* In redirect manifest case, the object exists in the upper tier.
	   * So, to avoid a conflict when rename() is called, remove existing
	   * object first
	   */
	  t->remove(obs.oi.soid);
	}
	_write_copy_chunk(cop, t);
	t->rename(obs.oi.soid, cop->results.temp_oid);
      }
      t->setattrs(obs.oi.soid, cop->results.attrs);
    });

  dout(20) << __func__ << " success; committing" << dendl;

 out:
  dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
  CopyCallbackResults results(r, &cop->results);
  cop->cb->complete(results);

  copy_ops.erase(cobc->obs.oi.soid);
  cobc->stop_block();

  // clean up the partially-written temp object on failure
  if (r < 0 && cop->results.started_temp_obj) {
    dout(10) << __func__ << " deleting partial temp object "
	     << cop->results.temp_oid << dendl;
    ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
    OpContextUPtr ctx = simple_opc_create(tempobc);
    ctx->op_t->remove(cop->results.temp_oid);
    ctx->discard_temp_oid = cop->results.temp_oid;
    simple_opc_submit(std::move(ctx));
  }

  // cancel and requeue proxy ops on this object
  if (!r) {
    cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
  }

  kick_object_context_blocked(cobc);
}
9536
// Completion handler for a single chunk read issued by a chunked-manifest
// copy (promotion of a manifest object).  Each chunk read is tracked as a
// child CopyOp hanging off the parent CopyOp (obj_cop); once the final
// chunk completes, all chunk payloads are written into the target object
// in one transaction, and either the next batch of chunk copies is started
// (_copy_some_manifest) or the overall copy callback is completed.
9537void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9538{
9539 dout(10) << __func__ << " " << oid << " tid " << tid
9540 << " " << cpp_strerror(r) << dendl;
9541 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9542 if (p == copy_ops.end()) {
9543 dout(10) << __func__ << " no copy_op found" << dendl;
9544 return;
9545 }
9546 CopyOpRef obj_cop = p->second;
9547 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9548
// Stale completion: a newer objecter op has superseded this tid.
9549 if (tid != chunk_cop->objecter_tid) {
9550 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9551 << " tid " << chunk_cop->objecter_tid << dendl;
9552 return;
9553 }
9554
// Chunked-manifest copies carry data only; omap payloads are unsupported.
9555 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9556 r = -EOPNOTSUPP;
9557 }
9558
9559 chunk_cop->objecter_tid = 0;
9560 chunk_cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
9561 ObjectContextRef& cobc = obj_cop->obc;
9562 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9563
9564 if (r < 0) {
9565 obj_cop->failed = true;
9566 goto out;
f67539c2 9567 }
11fdf7f2
TL
9568
// Another chunk already failed and completed the parent op; nothing to do.
9569 if (obj_cop->failed) {
9570 return;
f67539c2 9571 }
11fdf7f2
TL
9572 if (!chunk_data.outdata.length()) {
9573 r = -EIO;
9574 obj_cop->failed = true;
9575 goto out;
9576 }
9577
9578 obj_cop->num_chunk--;
9579
9580 /* check all of the copyop are completed */
9581 if (obj_cop->num_chunk) {
9582 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9583 return;
9584 }
9585
9586 {
9587 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9588 if (!ctx->lock_manager.take_write_lock(
9589 obj_cop->obc->obs.oi.soid,
9590 obj_cop->obc)) {
f67539c2
TL
9591 // recovery op can take read lock.
9592 // so need to wait for recovery completion
11fdf7f2
TL
9593 r = -EAGAIN;
9594 obj_cop->failed = true;
9595 close_op_ctx(ctx.release());
9596 goto out;
7c673cae 9597 }
11fdf7f2
TL
9598 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9599
// All chunks arrived: write every chunk payload into the object at its
// cursor offset, clearing FLAG_MISSING and marking the regions dirty.
9600 PGTransaction *t = ctx->op_t.get();
9601 ObjectState& obs = ctx->new_obs;
9602 for (auto p : obj_cop->chunk_cops) {
9603 OSDOp &sub_chunk = p.second->chunk_ops[0];
9604 t->write(cobc->obs.oi.soid,
9605 p.second->cursor.data_offset,
9606 sub_chunk.outdata.length(),
9607 sub_chunk.outdata,
9608 p.second->dest_obj_fadvise_flags);
f67539c2 9609 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
11fdf7f2
TL
9610 << " length: " << sub_chunk.outdata.length() << dendl;
9611 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9612 p.second->cursor.data_offset, sub_chunk.outdata.length());
f67539c2 9613 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9f95a23c 9614 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
11fdf7f2
TL
9615 sub_chunk.outdata.clear();
9616 }
9617 obs.oi.clear_data_digest();
f67539c2 9618 ctx->at_version = get_next_version();
11fdf7f2
TL
9619 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9620 simple_opc_submit(std::move(ctx));
9621
// If the manifest extends past what we have copied so far, start the next
// batch of chunk reads beginning at the first offset beyond last_offset.
9622 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9623 /* check remaining work */
9624 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9625 if (obj_cop->last_offset >= p->first + p->second.length) {
9626 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9627 if (obj_cop->last_offset < en.first) {
9628 _copy_some_manifest(cobc, obj_cop, en.first);
9629 return;
9630 }
9631 }
7c673cae
FG
9632 }
9633 }
11fdf7f2
TL
9634 }
9635
9636 out:
9637 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9638 CopyCallbackResults results(r, &obj_cop->results);
9639 obj_cop->cb->complete(results);
9640
9641 copy_ops.erase(cobc->obs.oi.soid);
9642 cobc->stop_block();
9643
9644 // cancel and requeue proxy ops on this object
9645 if (!r) {
9646 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
7c673cae
FG
9647 }
9648
9649 kick_object_context_blocked(cobc);
9650}
9651
94b18763
FG
9652void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9653 vector<ceph_tid_t> tids;
9654 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9655 it != proxyread_ops.end();) {
9656 if (it->second->soid == oid) {
9657 cancel_proxy_read((it++)->second, &tids);
9658 } else {
9659 ++it;
9660 }
9661 }
9662 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9663 it != proxywrite_ops.end();) {
9664 if (it->second->soid == oid) {
9665 cancel_proxy_write((it++)->second, &tids);
9666 } else {
9667 ++it;
9668 }
9669 }
9670 osd->objecter->op_cancel(tids, -ECANCELED);
9671 kick_proxy_ops_blocked(oid);
9672}
9673
7c673cae
FG
// Persist the data accumulated so far in 'cop' (attrs, data, omap header
// and keys) to the temp object inside transaction 't', honoring the pool's
// required write alignment, then advance temp_cursor to record what has
// been written.
9674void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9675{
9676 dout(20) << __func__ << " " << cop
9677 << " " << cop->attrs.size() << " attrs"
9678 << " " << cop->data.length() << " bytes"
9679 << " " << cop->omap_header.length() << " omap header bytes"
9680 << " " << cop->omap_data.length() << " omap data bytes"
9681 << dendl;
// First pass (no attrs written yet): the temp object must be created.
9682 if (!cop->temp_cursor.attr_complete) {
9683 t->create(cop->results.temp_oid);
9684 }
9685 if (!cop->temp_cursor.data_complete) {
11fdf7f2 9686 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
7c673cae 9687 cop->cursor.data_offset);
11fdf7f2 9688 if (pool.info.required_alignment() &&
7c673cae
FG
9689 !cop->cursor.data_complete) {
9690 /**
9691 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9692 * to pick it up on the next pass.
9693 */
11fdf7f2 9694 ceph_assert(cop->temp_cursor.data_offset %
7c673cae
FG
9695 pool.info.required_alignment() == 0);
9696 if (cop->data.length() % pool.info.required_alignment() != 0) {
9697 uint64_t to_trim =
9698 cop->data.length() % pool.info.required_alignment();
9699 bufferlist bl;
9700 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9701 cop->data.swap(bl);
// Rewind the source cursor so the trimmed tail is re-fetched next pass.
9702 cop->cursor.data_offset -= to_trim;
11fdf7f2 9703 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
7c673cae
FG
9704 cop->cursor.data_offset);
9705 }
9706 }
9707 if (cop->data.length()) {
9708 t->write(
9709 cop->results.temp_oid,
9710 cop->temp_cursor.data_offset,
9711 cop->data.length(),
9712 cop->data,
9713 cop->dest_obj_fadvise_flags);
9714 }
9715 cop->data.clear();
9716 }
9717 if (pool.info.supports_omap()) {
9718 if (!cop->temp_cursor.omap_complete) {
9719 if (cop->omap_header.length()) {
9720 t->omap_setheader(
9721 cop->results.temp_oid,
9722 cop->omap_header);
9723 cop->omap_header.clear();
9724 }
9725 if (cop->omap_data.length()) {
9726 map<string,bufferlist> omap;
11fdf7f2
TL
9727 bufferlist::const_iterator p = cop->omap_data.begin();
9728 decode(omap, p);
7c673cae
FG
9729 t->omap_setkeys(cop->results.temp_oid, omap);
9730 cop->omap_data.clear();
9731 }
9732 }
9733 } else {
// Pools without omap support must never have received omap payloads.
11fdf7f2
TL
9734 ceph_assert(cop->omap_header.length() == 0);
9735 ceph_assert(cop->omap_data.length() == 0);
7c673cae
FG
9736 }
// Record that everything fetched so far has now been persisted.
9737 cop->temp_cursor = cop->cursor;
9738}
9739
// Apply the results of a completed copy-from to the in-progress OpContext:
// replace any pre-existing object, splice in the final transaction built by
// the copy machinery (fill_in_final_tx), and carry digests, truncate info,
// omap flag, size and usage stats over from the copy results.
c07f9fc5 9740void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
7c673cae 9741{
c07f9fc5 9742 OpContext *ctx = cb->ctx;
7c673cae 9743 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
7c673cae 9744
c07f9fc5 9745 ObjectState& obs = ctx->new_obs;
// The copied data fully replaces any existing object content.
7c673cae
FG
9746 if (obs.exists) {
9747 dout(20) << __func__ << ": exists, removing" << dendl;
9748 ctx->op_t->remove(obs.oi.soid);
9749 } else {
9750 ctx->delta_stats.num_objects++;
9751 obs.exists = true;
9752 }
9753 if (cb->is_temp_obj_used()) {
9754 ctx->discard_temp_oid = cb->results->temp_oid;
9755 }
9756 cb->results->fill_in_final_tx(ctx->op_t.get());
9757
9758 // CopyFromCallback fills this in for us
9759 obs.oi.user_version = ctx->user_at_version;
9760
28e407b8
AA
9761 if (cb->results->is_data_digest()) {
9762 obs.oi.set_data_digest(cb->results->data_digest);
9763 } else {
9764 obs.oi.clear_data_digest();
9765 }
9766 if (cb->results->is_omap_digest()) {
9767 obs.oi.set_omap_digest(cb->results->omap_digest);
9768 } else {
9769 obs.oi.clear_omap_digest();
9770 }
7c673cae 9771
9f95a23c
TL
9772 obs.oi.truncate_seq = cb->truncate_seq;
9773 obs.oi.truncate_size = cb->truncate_size;
9774
9775 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9776 ctx->mtime = utime_t();
7c673cae
FG
9777
9778 ctx->extra_reqids = cb->results->reqids;
11fdf7f2 9779 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
7c673cae
FG
9780
9781 // cache: clear whiteout?
9782 if (obs.oi.is_whiteout()) {
9783 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9784 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9785 --ctx->delta_stats.num_whiteouts;
9786 }
9787
9788 if (cb->results->has_omap) {
9789 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9790 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9f95a23c 9791 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
9792 } else {
9793 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9794 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9795 }
9796
// The entire object (old extents and new) is considered modified.
9797 interval_set<uint64_t> ch;
9798 if (obs.oi.size > 0)
9799 ch.insert(0, obs.oi.size);
9800 ctx->modified_ranges.union_of(ch);
9f95a23c 9801 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
7c673cae
FG
9802
9803 if (cb->get_data_size() != obs.oi.size) {
9804 ctx->delta_stats.num_bytes -= obs.oi.size;
9805 obs.oi.size = cb->get_data_size();
9806 ctx->delta_stats.num_bytes += obs.oi.size;
9807 }
9808 ctx->delta_stats.num_wr++;
11fdf7f2 9809 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
7c673cae
FG
9810
9811 osd->logger->inc(l_osd_copyfrom);
9812}
9813
// Completion of a cache-tier promotion copy for 'obc'.  Handles the error
// and clone-trimmed special cases, then builds and submits a local
// transaction that installs the copied content (or a whiteout for ENOENT
// on a head object) and updates object_info/snapset/stats accordingly.
9814void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9815 ObjectContextRef obc)
9816{
9817 const hobject_t& soid = obc->obs.oi.soid;
9818 dout(10) << __func__ << " " << soid << " r=" << r
9819 << " uv" << results->user_version << dendl;
9820
9821 if (r == -ECANCELED) {
9822 return;
9823 }
9824
9825 if (r != -ENOENT && soid.is_snap()) {
9826 if (results->snaps.empty()) {
9f95a23c
TL
9827 // we must have read "snap" content from the head object in the
9828 // base pool. use snap_seq to construct what snaps should be
9829 // for this clone (what it was before we evicted the clean clone
9830 // from this pool, and what it will be when we flush and the
9831 // clone eventually happens in the base pool). we want to use
9832 // snaps in (results->snap_seq,soid.snap]
7c673cae 9833 SnapSet& snapset = obc->ssc->snapset;
9f95a23c
TL
9834 for (auto p = snapset.clone_snaps.rbegin();
9835 p != snapset.clone_snaps.rend();
9836 ++p) {
9837 for (auto snap : p->second) {
9838 if (snap > soid.snap) {
9839 continue;
9840 }
9841 if (snap <= results->snap_seq) {
9842 break;
9843 }
9844 results->snaps.push_back(snap);
9845 }
7c673cae
FG
9846 }
9847 }
9848
9849 dout(20) << __func__ << " snaps " << results->snaps << dendl;
9850 filter_snapc(results->snaps);
9851
9852 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9853 if (results->snaps.empty()) {
9854 dout(20) << __func__
9855 << " snaps are empty, clone is invalid,"
9856 << " setting r to ENOENT" << dendl;
9857 r = -ENOENT;
9858 }
9859 }
9860
// On failure, tear down any partially written temp object first.
9861 if (r < 0 && results->started_temp_obj) {
9862 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9863 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
11fdf7f2 9864 ceph_assert(tempobc);
7c673cae
FG
9865 OpContextUPtr ctx = simple_opc_create(tempobc);
9866 ctx->op_t->remove(results->temp_oid);
9867 simple_opc_submit(std::move(ctx));
9868 results->started_temp_obj = false;
9869 }
9870
// Clone vanished from the base tier (trimmed): drop it from the head's
// snapset with a local PROMOTE log entry and bail out.
9871 if (r == -ENOENT && soid.is_snap()) {
9872 dout(10) << __func__
9873 << ": enoent while trying to promote clone, " << soid
9874 << " must have been trimmed, removing from snapset"
9875 << dendl;
9876 hobject_t head(soid.get_head());
9877 ObjectContextRef obc = get_object_context(head, false);
11fdf7f2 9878 ceph_assert(obc);
7c673cae
FG
9879
9880 OpContextUPtr tctx = simple_opc_create(obc);
9881 tctx->at_version = get_next_version();
9f95a23c
TL
9882 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
9883 filter_snapc(tctx->new_snapset.snaps);
9884 } else {
9885 tctx->new_snapset.snaps.clear();
9886 }
7c673cae
FG
9887 vector<snapid_t> new_clones;
9888 map<snapid_t, vector<snapid_t>> new_clone_snaps;
9889 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9890 i != tctx->new_snapset.clones.end();
9891 ++i) {
9892 if (*i != soid.snap) {
9893 new_clones.push_back(*i);
9894 auto p = tctx->new_snapset.clone_snaps.find(*i);
9895 if (p != tctx->new_snapset.clone_snaps.end()) {
9896 new_clone_snaps[*i] = p->second;
9897 }
9898 }
9899 }
9900 tctx->new_snapset.clones.swap(new_clones);
9901 tctx->new_snapset.clone_overlap.erase(soid.snap);
9902 tctx->new_snapset.clone_size.erase(soid.snap);
9903 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9904
9905 // take RWWRITE lock for duration of our local write. ignore starvation.
9906 if (!tctx->lock_manager.take_write_lock(
9907 head,
9908 obc)) {
11fdf7f2 9909 ceph_abort_msg("problem!");
7c673cae
FG
9910 }
9911 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9912
9913 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9914
9915 simple_opc_submit(std::move(tctx));
9916 return;
9917 }
9918
9919 bool whiteout = false;
9920 if (r == -ENOENT) {
11fdf7f2 9921 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
7c673cae
FG
9922 dout(10) << __func__ << " whiteout " << soid << dendl;
9923 whiteout = true;
9924 }
9925
9926 if (r < 0 && !whiteout) {
9927 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9928 // pass error to everyone blocked on this object
9929 // FIXME: this is pretty sloppy, but at this point we got
9930 // something unexpected and don't have many other options.
9931 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9932 waiting_for_blocked_object.find(soid);
9933 if (blocked_iter != waiting_for_blocked_object.end()) {
9934 while (!blocked_iter->second.empty()) {
9935 osd->reply_op_error(blocked_iter->second.front(), r);
9936 blocked_iter->second.pop_front();
9937 }
9938 waiting_for_blocked_object.erase(blocked_iter);
9939 }
9940 return;
9941 }
9942
9943 osd->promote_finish(results->object_size);
9944
// Success (or whiteout): build the local transaction that installs the
// promoted object and its metadata.
9945 OpContextUPtr tctx = simple_opc_create(obc);
9946 tctx->at_version = get_next_version();
9947
11fdf7f2
TL
9948 if (!obc->obs.oi.has_manifest()) {
9949 ++tctx->delta_stats.num_objects;
9950 }
7c673cae
FG
9951 if (soid.snap < CEPH_NOSNAP)
9952 ++tctx->delta_stats.num_object_clones;
9953 tctx->new_obs.exists = true;
9954
9955 tctx->extra_reqids = results->reqids;
11fdf7f2 9956 tctx->extra_reqid_return_codes = results->reqid_return_codes;
7c673cae 9957
// A promoted redirect manifest becomes a plain object; drop the manifest
// flags and release any references the redirect held.
f67539c2
TL
9958 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
9959 tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
9960 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
9961 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
9962 tctx->new_obs.oi.manifest.redirect_target = hobject_t();
9963 tctx->delta_stats.num_objects_manifest--;
9964 if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
9965 dec_all_refcount_manifest(obc->obs.oi, tctx.get());
9966 }
9967 }
9968
7c673cae
FG
9969 if (whiteout) {
9970 // create a whiteout
9971 tctx->op_t->create(soid);
9972 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9973 ++tctx->delta_stats.num_whiteouts;
9974 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9975 osd->logger->inc(l_osd_tier_whiteout);
9976 } else {
9977 if (results->has_omap) {
9978 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9979 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9980 ++tctx->delta_stats.num_objects_omap;
9981 }
9982
9983 results->fill_in_final_tx(tctx->op_t.get());
9984 if (results->started_temp_obj) {
9985 tctx->discard_temp_oid = results->temp_oid;
9986 }
9987 tctx->new_obs.oi.size = results->object_size;
9988 tctx->new_obs.oi.user_version = results->user_version;
9f95a23c
TL
9989 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
9990 tctx->mtime = utime_t();
28e407b8 9991 if (results->is_data_digest()) {
7c673cae 9992 tctx->new_obs.oi.set_data_digest(results->data_digest);
28e407b8
AA
9993 } else {
9994 tctx->new_obs.oi.clear_data_digest();
9995 }
9f95a23c
TL
9996 if (results->object_size)
9997 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
28e407b8 9998 if (results->is_omap_digest()) {
7c673cae 9999 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
28e407b8
AA
10000 } else {
10001 tctx->new_obs.oi.clear_omap_digest();
10002 }
9f95a23c
TL
10003 if (results->has_omap)
10004 tctx->clean_regions.mark_omap_dirty();
7c673cae
FG
10005 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
10006 tctx->new_obs.oi.truncate_size = results->truncate_size;
10007
10008 if (soid.snap != CEPH_NOSNAP) {
11fdf7f2
TL
10009 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
10010 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
10011 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
7c673cae 10012 results->object_size);
11fdf7f2 10013 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
7c673cae
FG
10014
10015 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
10016 } else {
10017 tctx->delta_stats.num_bytes += results->object_size;
10018 }
10019 }
10020
10021 if (results->mirror_snapset) {
11fdf7f2 10022 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
7c673cae
FG
10023 tctx->new_snapset.from_snap_set(
10024 results->snapset,
9f95a23c 10025 get_osdmap()->require_osd_release < ceph_release_t::luminous);
7c673cae 10026 }
7c673cae
FG
10027 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
10028
10029 // take RWWRITE lock for duration of our local write. ignore starvation.
10030 if (!tctx->lock_manager.take_write_lock(
10031 obc->obs.oi.soid,
10032 obc)) {
11fdf7f2 10033 ceph_abort_msg("problem!");
7c673cae
FG
10034 }
10035 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10036
10037 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10038
10039 simple_opc_submit(std::move(tctx));
10040
10041 osd->logger->inc(l_osd_tier_promote);
10042
10043 if (agent_state &&
10044 agent_state->is_idle())
10045 agent_choose_mode();
10046}
10047
11fdf7f2
TL
10048void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
10049 ObjectContextRef obc)
10050{
10051 const hobject_t& soid = obc->obs.oi.soid;
10052 dout(10) << __func__ << " " << soid << " r=" << r
10053 << " uv" << results->user_version << dendl;
10054
10055 if (r == -ECANCELED || r == -EAGAIN) {
10056 return;
10057 }
10058
10059 if (r < 0) {
10060 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10061 // pass error to everyone blocked on this object
10062 // FIXME: this is pretty sloppy, but at this point we got
10063 // something unexpected and don't have many other options.
10064 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10065 waiting_for_blocked_object.find(soid);
10066 if (blocked_iter != waiting_for_blocked_object.end()) {
10067 while (!blocked_iter->second.empty()) {
10068 osd->reply_op_error(blocked_iter->second.front(), r);
10069 blocked_iter->second.pop_front();
10070 }
10071 waiting_for_blocked_object.erase(blocked_iter);
10072 }
10073 return;
10074 }
f67539c2 10075
11fdf7f2
TL
10076 osd->promote_finish(results->object_size);
10077 osd->logger->inc(l_osd_tier_promote);
10078
10079 if (agent_state &&
10080 agent_state->is_idle())
10081 agent_choose_mode();
10082}
10083
94b18763
FG
10084void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
10085 vector<ceph_tid_t> *tids)
7c673cae
FG
10086{
10087 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
10088 << " from " << cop->src << " " << cop->oloc
10089 << " v" << cop->results.user_version << dendl;
10090
10091 // cancel objecter op, if we can
10092 if (cop->objecter_tid) {
94b18763 10093 tids->push_back(cop->objecter_tid);
7c673cae
FG
10094 cop->objecter_tid = 0;
10095 if (cop->objecter_tid2) {
94b18763 10096 tids->push_back(cop->objecter_tid2);
7c673cae
FG
10097 cop->objecter_tid2 = 0;
10098 }
10099 }
10100
10101 copy_ops.erase(cop->obc->obs.oi.soid);
10102 cop->obc->stop_block();
10103
10104 kick_object_context_blocked(cop->obc);
10105 cop->results.should_requeue = requeue;
10106 CopyCallbackResults result(-ECANCELED, &cop->results);
10107 cop->cb->complete(result);
10108
10109 // There may still be an objecter callback referencing this copy op.
10110 // That callback will not need the obc since it's been canceled, and
10111 // we need the obc reference to go away prior to flush.
10112 cop->obc = ObjectContextRef();
10113}
10114
94b18763 10115void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
10116{
10117 dout(10) << __func__ << dendl;
10118 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
10119 while (p != copy_ops.end()) {
10120 // requeue this op? can I queue up all of them?
94b18763 10121 cancel_copy((p++)->second, requeue, tids);
7c673cae
FG
10122 }
10123}
10124
10125
10126// ========================================================================
10127// flush
10128//
10129// Flush a dirty object in the cache tier by writing it back to the
10130// base tier. The sequence looks like:
10131//
10132// * send a copy-from operation to the base tier to copy the current
10133// version of the object
10134// * base tier will pull the object via (perhaps multiple) copy-get(s)
10135// * on completion, we check if the object has been modified. if so,
10136// just reply with -EAGAIN.
10137// * try to take a write lock so we can clear the dirty flag. if this
10138// fails, wait and retry
10139// * start a repop that clears the bit.
10140//
10141// If we have to wait, we will retry by coming back through the
10142// start_flush method. We check if a flush is already in progress
10143// and, if so, try to finish it by rechecking the version and trying
10144// to clear the dirty bit.
10145//
10146// In order for the cache-flush (a write op) to not block the copy-get
10147// from reading the object, the client *must* set the SKIPRWLOCKS
10148// flag.
10149//
10150// NOTE: normally writes are strictly ordered for the client, but
10151// flushes are special in that they can be reordered with respect to
10152// other writes. In particular, we can't have a flush request block
10153// an update to the cache pool object!
10154
// Completion context for a cache-tier flush: when the copy-from to the base
// tier finishes, finish_flush() is invoked under the PG lock, but only if
// the PG has not gone through a peering reset since the flush started.
10155struct C_Flush : public Context {
10156 PrimaryLogPGRef pg;
10157 hobject_t oid;
// last_peering_reset at the time the flush was started; used to detect
// that the PG was reset while the flush was in flight.
10158 epoch_t last_peering_reset;
10159 ceph_tid_t tid;
// start time, for the l_osd_tier_flush_lat latency counter.
10160 utime_t start;
10161 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
10162 : pg(p), oid(o), last_peering_reset(lpr),
10163 tid(0), start(ceph_clock_now())
10164 {}
10165 void finish(int r) override {
10166 if (r == -ECANCELED)
10167 return;
9f95a23c 10168 std::scoped_lock locker{*pg};
7c673cae
FG
10169 if (last_peering_reset == pg->get_last_peering_reset()) {
10170 pg->finish_flush(oid, tid, r);
10171 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
10172 }
7c673cae
FG
10173 }
10174};
10175
f67539c2
TL
// Kick off dedup for a dirty manifest object: chunk the object's data with
// the pool's CDC algorithm, then issue CREATE_OR_GET_REF refcount ops for
// every chunk not already referenced by an adjacent clone.  Returns
// -EINPROGRESS when refcount ops were issued (completion arrives via
// finish_set_dedup()), 0 when there is nothing to do, or a negative errno.
10176int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
10177{
10178 const object_info_t& oi = obc->obs.oi;
10179 const hobject_t& soid = oi.soid;
10180
10181 ceph_assert(obc->is_blocked());
10182 if (oi.size == 0) {
10183 // evicted
10184 return 0;
10185 }
10186 if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10187 dout(0) << " fingerprint algorithm is not set " << dendl;
10188 return -EINVAL;
10189 }
10190
10191 /*
10192 * The operations to make dedup chunks are tracked by a ManifestOp.
10193 * This op will be finished if all the operations are completed.
10194 */
10195 ManifestOpRef mop(std::make_shared<ManifestOp>(nullptr));
10196
10197 // cdc
10198 std::map<uint64_t, bufferlist> chunks;
10199 int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
10200 if (r < 0) {
10201 return r;
10202 }
10203 if (!chunks.size()) {
10204 return 0;
10205 }
10206
10207 // chunks issued here are different with chunk_map newly generated
10208 // because the same chunks in previous snap will not be issued
10209 // So, we need two data structures; the first is the issued chunk list to track
10210 // issued operations, and the second is the new chunk_map to update chunk_map after
10211 // all operations are finished
10212 object_ref_delta_t refs;
10213 ObjectContextRef obc_l, obc_g;
10214 get_adjacent_clones(obc, obc_l, obc_g);
10215 // skip if the same content exists in prev snap at same offset
10216 mop->new_manifest.calc_refs_to_inc_on_set(
10217 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10218 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10219 refs);
10220
// Issue one refcount op per chunk that actually needs a new reference.
10221 for (auto p : chunks) {
10222 hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
10223 if (refs.find(target) == refs.end()) {
10224 continue;
10225 }
10226 C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
10227 ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
10228 fin, move(chunks[p.first]));
10229 mop->chunks[target] = make_pair(p.first, p.second.length());
10230 mop->num_chunks++;
10231 mop->tids[p.first] = tid;
10232 fin->tid = tid;
10233 dout(10) << __func__ << " oid: " << soid << " tid: " << tid
10234 << " target: " << target << " offset: " << p.first
10235 << " length: " << p.second.length() << dendl;
10236 }
10237
10238 if (mop->tids.size()) {
10239 manifest_ops[soid] = mop;
10240 manifest_ops[soid]->op = op;
10241 } else {
10242 // size == 0
10243 return 0;
10244 }
10245
10246 return -EINPROGRESS;
10247}
10248
// Run content-defined chunking over the entire object: read it in full,
// split it with the pool's configured CDC algorithm and chunk size, and
// fill both chunk_map (offset -> chunk_info_t carrying the fingerprint
// oid) and chunks (offset -> raw chunk payload).  Returns the total
// chunked length on success or a negative errno.
10249int PrimaryLogPG::do_cdc(const object_info_t& oi,
10250 std::map<uint64_t, chunk_info_t>& chunk_map,
10251 std::map<uint64_t, bufferlist>& chunks)
10252{
10253 string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
10254 int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
10255 uint64_t total_length = 0;
10256
10257 std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
10258 if (!cdc) {
10259 dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
10260 return -EINVAL;
10261 }
10262
10263 bufferlist bl;
10264 /**
10265 * We disable EC pool as a base tier of distributed dedup.
10266 * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
10267 * Therefore, we should change the current implementation totally to make EC pool compatible.
10268 * As a result, we leave this as a future work.
10269 */
10270 int r = pgbackend->objects_read_sync(
10271 oi.soid, 0, oi.size, 0, &bl);
10272 if (r < 0) {
10273 dout(0) << __func__ << " read fail " << oi.soid
10274 << " len: " << oi.size << " r: " << r << dendl;
10275 return r;
10276 }
// A short read means the object changed under us (or is inconsistent).
10277 if (bl.length() != oi.size) {
10278 dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
10279 << oi.size << " during chunking " << dendl;
10280 return -EIO;
10281 }
10282
10283 dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
10284 << " oi.size: " << oi.size
10285 << " chunk_size: " << chunk_size << dendl;
10286
10287 vector<pair<uint64_t, uint64_t>> cdc_chunks;
10288 cdc->calc_chunks(bl, &cdc_chunks);
10289
10290 // get fingerprint
10291 for (auto p : cdc_chunks) {
10292 bufferlist chunk;
10293 chunk.substr_of(bl, p.first, p.second);
10294 hobject_t target = get_fpoid_from_chunk(oi.soid, chunk);
10295 chunks[p.first] = move(chunk);
10296 chunk_map[p.first] = chunk_info_t(0, p.second, target);
10297 total_length += p.second;
10298 }
10299 return total_length;
10300}
10301
10302hobject_t PrimaryLogPG::get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk)
10303{
10304 pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
10305 if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10306 return hobject_t();
10307 }
10308 object_t fp_oid = [&fp_algo, &chunk]() -> string {
10309 switch (fp_algo) {
10310 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
10311 return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
10312 case pg_pool_t::TYPE_FINGERPRINT_SHA256:
10313 return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
10314 case pg_pool_t::TYPE_FINGERPRINT_SHA512:
10315 return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
10316 default:
10317 assert(0 == "unrecognized fingerprint type");
10318 return {};
10319 }
10320 }();
10321
10322 pg_t raw_pg;
10323 object_locator_t oloc(soid);
10324 oloc.pool = pool.info.get_dedup_tier();
10325 get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
10326 hobject_t target(fp_oid, oloc.key, snapid_t(),
10327 raw_pg.ps(), raw_pg.pool(),
10328 oloc.nspace);
10329 return target;
10330}
10331
10332int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10333{
10334 dout(10) << __func__ << " " << oid << " tid " << tid
10335 << " " << cpp_strerror(r) << dendl;
10336 map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10337 if (p == manifest_ops.end()) {
10338 dout(10) << __func__ << " no manifest_op found" << dendl;
10339 return -EINVAL;
10340 }
10341 ManifestOpRef mop = p->second;
10342 mop->results[offset] = r;
10343 if (r < 0) {
10344 // if any failure occurs, put a mark on the results to recognize the failure
10345 mop->results[0] = r;
10346 }
10347 if (mop->num_chunks != mop->results.size()) {
10348 // there are on-going works
10349 return -EINPROGRESS;
10350 }
10351 ObjectContextRef obc = get_object_context(oid, false);
10352 if (!obc) {
10353 if (mop->op)
10354 osd->reply_op_error(mop->op, -EINVAL);
10355 return -EINVAL;
10356 }
10357 ceph_assert(obc->is_blocked());
10358 obc->stop_block();
10359 kick_object_context_blocked(obc);
10360 if (mop->results[0] < 0) {
10361 // check if the previous op returns fail
10362 ceph_assert(mop->num_chunks == mop->results.size());
10363 manifest_ops.erase(oid);
10364 osd->reply_op_error(mop->op, mop->results[0]);
10365 return -EIO;
10366 }
10367
10368 if (mop->chunks.size()) {
10369 OpContextUPtr ctx = simple_opc_create(obc);
10370 ceph_assert(ctx);
10371 if (ctx->lock_manager.get_lock_type(
10372 RWState::RWWRITE,
10373 oid,
10374 obc,
10375 mop->op)) {
10376 dout(20) << __func__ << " took write lock" << dendl;
10377 } else if (mop->op) {
10378 dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
10379 close_op_ctx(ctx.release());
10380 return -EAGAIN;
10381 }
10382
10383 ctx->at_version = get_next_version();
10384 ctx->new_obs = obc->obs;
10385 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10386
10387 /*
10388 * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10389 * head: [0, 2) aaa <-- tier_flush()
10390 * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10391 *
10392 * In this case, if the new chunk_map is as follows,
10393 * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10394 * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10395 * So, the precedure is
10396 * 1. calc_refs_to_drop_on_removal()
10397 * 2. register old references to drop after tier_flush() is committed
10398 * 3. update new chunk_map
10399 */
10400
10401 ObjectCleanRegions c_regions = ctx->clean_regions;
10402 ObjectContextRef cobc = get_prev_clone_obc(obc);
10403 c_regions.mark_fully_dirty();
10404 // CDC was done on entire range of manifest object,
10405 // so the first thing we should do here is to drop the reference to old chunks
10406 ObjectContextRef obc_l, obc_g;
10407 get_adjacent_clones(obc, obc_l, obc_g);
10408 // clear all old references
10409 object_ref_delta_t refs;
10410 ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
10411 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10412 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10413 refs);
10414 if (!refs.is_empty()) {
10415 ctx->register_on_commit(
10416 [oid, this, refs](){
10417 dec_refcount(oid, refs);
10418 });
10419 }
10420
10421 // set new references
10422 ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
10423
10424 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10425 simple_opc_submit(std::move(ctx));
10426 }
10427 if (mop->op)
10428 osd->reply_op_error(mop->op, r);
10429
10430 manifest_ops.erase(oid);
10431 return 0;
10432}
10433
7c673cae
FG
/**
 * Start flushing a (cache-tier or manifest) object to the base tier.
 *
 * @param op        client op that triggered the flush (may be null for
 *                  agent-initiated flushes)
 * @param obc       object context of the object to flush
 * @param blocking  if true, block writes to the object until flush completes
 * @param pmissing  out: set to the dirty/missing older clone blocking us
 * @param on_flush  callback invoked when the flush op is finished/cancelled
 *
 * @return -EINPROGRESS on successful start; -ENOENT/-EBUSY/-EAGAIN/
 *         -EOPNOTSUPP/-ECANCELED on the various "can't flush now" paths.
 */
int PrimaryLogPG::start_flush(
  OpRequestRef op, ObjectContextRef obc,
  bool blocking, hobject_t *pmissing,
  std::optional<std::function<void()>> &&on_flush)
{
  const object_info_t& oi = obc->obs.oi;
  const hobject_t& soid = oi.soid;
  dout(10) << __func__ << " " << soid
	   << " v" << oi.version
	   << " uv" << oi.user_version
	   << " " << (blocking ? "blocking" : "non-blocking/best-effort")
	   << dendl;

  bool preoctopus_compat =
    get_osdmap()->require_osd_release < ceph_release_t::octopus;
  SnapSet snapset;
  if (preoctopus_compat) {
    // for pre-octopus compatibility, filter SnapSet::snaps. not
    // certain we need this, but let's be conservative.
    snapset = obc->ssc->snapset.get_filtered(pool.info);
  } else {
    // NOTE: change this to a const ref when we remove this compat code
    snapset = obc->ssc->snapset;
  }

  if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
    // current dedup tier only supports blocking operation
    if (!blocking) {
      return -EOPNOTSUPP;
    }
  }

  // verify there are no (older) check for dirty clones
  {
    dout(20) << " snapset " << snapset << dendl;
    // walk clones from newest to oldest; skip ones at/after our snap
    vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
    while (p != snapset.clones.rend() && *p >= soid.snap)
      ++p;
    if (p != snapset.clones.rend()) {
      hobject_t next = soid;
      next.snap = *p;
      ceph_assert(next.snap < soid.snap);
      if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
	// can't flush over a missing clone; caller may wait for recovery
	dout(10) << __func__ << " missing clone is " << next << dendl;
	if (pmissing)
	  *pmissing = next;
	return -ENOENT;
      }
      ObjectContextRef older_obc = get_object_context(next, false);
      if (older_obc) {
	dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
		 << dendl;
	if (older_obc->obs.oi.is_dirty()) {
	  // clones must be flushed oldest-first to keep base-tier snaps sane
	  dout(10) << __func__ << " next oldest clone is dirty: "
		   << older_obc->obs.oi << dendl;
	  return -EBUSY;
	}
      } else {
	dout(20) << __func__ << " next oldest clone " << next
		 << " is not present; implicitly clean" << dendl;
      }
    } else {
      dout(20) << __func__ << " no older clones" << dendl;
    }
  }

  if (blocking)
    obc->start_block();

  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
  if (p != flush_ops.end()) {
    FlushOpRef fop = p->second;
    if (fop->op == op) {
      // we couldn't take the write lock on a cache-try-flush before;
      // now we are trying again for the lock.
      return try_flush_mark_clean(fop);
    }
    if (fop->flushed_version == obc->obs.oi.user_version &&
	(fop->blocking || !blocking)) {
      // nonblocking can join anything
      // blocking can only join a blocking flush
      dout(20) << __func__ << " piggybacking on existing flush " << dendl;
      if (op)
	fop->dup_ops.push_back(op);
      return -EAGAIN;   // clean up this ctx; op will retry later
    }

    // cancel current flush since it will fail anyway, or because we
    // are blocking and the existing flush is nonblocking.
    dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    while (!fop->dup_ops.empty()) {
      osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
      fop->dup_ops.pop_front();
    }
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }

  if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
    // manifest objects take the dedup path instead of copy-from
    int r = start_dedup(op, obc);
    if (r != -EINPROGRESS) {
      if (blocking)
	obc->stop_block();
    }
    return r;
  }

  /**
   * In general, we need to send a delete and a copyfrom.
   * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
   * where 4 is marked as clean.  To flush 10, we have to:
   * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
   * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
   *
   * There is a complicating case.  Supposed there had been a clone 7
   * for snaps [7, 6] which has been trimmed since they no longer exist.
   * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head.  When we submit
   * the delete, the snap will be promoted to 5, and the head will become
   * a whiteout.  When the copy-from goes through, we'll end up with
   * 8:[8,4,3,2]:[4(4,3,2)]+head.
   *
   * Another complication is the case where there is an interval change
   * after doing the delete and the flush but before marking the object
   * clean.  We'll happily delete head and then recreate it at the same
   * sequence number, which works out ok.
   */

  SnapContext snapc, dsnapc;
  if (snapset.seq != 0) {
    if (soid.snap == CEPH_NOSNAP) {
      snapc = snapset.get_ssc_as_of(snapset.seq);
    } else {
      snapid_t min_included_snap;
      auto p = snapset.clone_snaps.find(soid.snap);
      ceph_assert(p != snapset.clone_snaps.end());
      min_included_snap = p->second.back();
      snapc = snapset.get_ssc_as_of(min_included_snap - 1);
    }

    snapid_t prev_snapc = 0;
    for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
	 citer != snapset.clones.rend();
	 ++citer) {
      if (*citer < soid.snap) {
	prev_snapc = *citer;
	break;
      }
    }

    dsnapc = snapset.get_ssc_as_of(prev_snapc);
  }

  object_locator_t base_oloc(soid);
  base_oloc.pool = pool.info.tier_of;

  if (dsnapc.seq < snapc.seq) {
    // delete in the base pool first so the subsequent copy-from recreates
    // the object with the right snap context (see comment block above)
    ObjectOperation o;
    o.remove();
    osd->objecter->mutate(
      soid.oid,
      base_oloc,
      o,
      dsnapc,
      ceph::real_clock::from_ceph_timespec(oi.mtime),
      (CEPH_OSD_FLAG_IGNORE_OVERLAY |
       CEPH_OSD_FLAG_ENFORCE_SNAPC),
      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
  }

  FlushOpRef fop(std::make_shared<FlushOp>());
  fop->obc = obc;
  fop->flushed_version = oi.user_version;
  fop->blocking = blocking;
  fop->on_flush = std::move(on_flush);
  fop->op = op;

  ObjectOperation o;
  if (oi.is_whiteout()) {
    // flushing a whiteout is just a delete in the base tier
    fop->removal = true;
    o.remove();
  } else {
    object_locator_t oloc(soid);
    o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
		CEPH_OSD_COPY_FROM_FLAG_FLUSH |
		CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
		CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
		CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
		LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);

    //mean the base tier don't cache data after this
    if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
      o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
  }
  C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());

  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, base_oloc, o, snapc,
    ceph::real_clock::from_ceph_timespec(oi.mtime),
    CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
    new C_OnFinisher(fin,
		     osd->get_objecter_finisher(get_pg_shard())));
  /* we're under the pg lock and fin->finish() is grabbing that */
  fin->tid = tid;
  fop->objecter_tid = tid;

  flush_ops[soid] = fop;

  recovery_state.update_stats(
    [&oi](auto &history, auto &stats) {
      stats.stats.sum.num_flush++;
      stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
      return false;
    });
  return -EINPROGRESS;
}
10652
/**
 * Objecter completion callback for a flush (delete/copy-from) issued by
 * start_flush().  Looks up the pending FlushOp and either fails it or
 * proceeds to mark the object clean.
 *
 * @param oid  object that was being flushed
 * @param tid  objecter tid of the completed mutate
 * @param r    result of the base-tier operation
 */
void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
  if (p == flush_ops.end()) {
    dout(10) << __func__ << " no flush_op found" << dendl;
    return;
  }
  FlushOpRef fop = p->second;
  // manifest flushes may involve multiple objecter ops, so only enforce the
  // tid match for non-manifest objects; a stale tid means this completion
  // belongs to a flush that was already superseded
  if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
    dout(10) << __func__ << " tid " << tid << " != fop " << fop
	     << " tid " << fop->objecter_tid << dendl;
    return;
  }
  ObjectContextRef obc = fop->obc;
  fop->objecter_tid = 0;

  // -ENOENT is expected (and fine) when the flush was a removal
  if (r < 0 && !(r == -ENOENT && fop->removal)) {
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    if (fop->blocking) {
      obc->stop_block();
      kick_object_context_blocked(obc);
    }

    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    }
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    return;
  }

  // flush succeeded in the base tier; try to clear the dirty flag locally
  r = try_flush_mark_clean(fop);
  if (r == -EBUSY && fop->op) {
    osd->reply_op_error(fop->op, r);
  }
}
10696
/**
 * After a successful base-tier flush, try to mark the cached object clean:
 * take the write lock, clear FLAG_DIRTY, and (for manifest objects) update
 * the chunk map / punch out flushed data.
 *
 * @return 0 if the object was evicted instead, -EBUSY if the object changed
 *         under us, -EAGAIN if we must retry after getting the write lock,
 *         -ECANCELED if the flush was abandoned, -EINPROGRESS once the
 *         clean-marking transaction has been submitted.
 */
int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
{
  ObjectContextRef obc = fop->obc;
  const hobject_t& oid = obc->obs.oi.soid;

  if (fop->blocking) {
    obc->stop_block();
    kick_object_context_blocked(obc);
  }

  // if the object was rewritten (or deleted) while the flush was in
  // flight, the flushed data is stale -- fail the flush
  if (fop->flushed_version != obc->obs.oi.user_version ||
      !obc->obs.exists) {
    if (obc->obs.exists)
      dout(10) << __func__ << " flushed_version " << fop->flushed_version
	       << " != current " << obc->obs.oi.user_version
	       << dendl;
    else
      dout(10) << __func__ << " object no longer exists" << dendl;

    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    }
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    if (fop->blocking)
      osd->logger->inc(l_osd_tier_flush_fail);
    else
      osd->logger->inc(l_osd_tier_try_flush_fail);
    return -EBUSY;
  }

  if (!fop->blocking &&
      m_scrubber->write_blocked_by_scrub(oid)) {
    if (fop->op) {
      dout(10) << __func__ << " blocked by scrub" << dendl;
      requeue_op(fop->op);
      requeue_ops(fop->dup_ops);
      return -EAGAIN;    // will retry
    } else {
      // agent-initiated flush with no client op: just give up
      osd->logger->inc(l_osd_tier_try_flush_fail);
      vector<ceph_tid_t> tids;
      cancel_flush(fop, false, &tids);
      osd->objecter->op_cancel(tids, -ECANCELED);
      return -ECANCELED;
    }
  }

  // successfully flushed, can we evict this object?
  if (!obc->obs.oi.has_manifest() && !fop->op &&
      agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
      agent_maybe_evict(obc, true)) {
    osd->logger->inc(l_osd_tier_clean);
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    return 0;
  }

  dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
  OpContextUPtr ctx = simple_opc_create(fop->obc);

  // successfully flushed; can we clear the dirty bit?
  // try to take the lock manually, since we don't
  // have a ctx yet.
  if (ctx->lock_manager.get_lock_type(
	RWState::RWWRITE,
	oid,
	obc,
	fop->op)) {
    dout(20) << __func__ << " took write lock" << dendl;
  } else if (fop->op) {
    dout(10) << __func__ << " waiting on write lock " << fop->op << " "
	     << fop->dup_ops << dendl;
    // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
    for (auto op : fop->dup_ops) {
      bool locked = ctx->lock_manager.get_lock_type(
	RWState::RWWRITE,
	oid,
	obc,
	op);
      ceph_assert(!locked);
    }
    close_op_ctx(ctx.release());
    return -EAGAIN;    // will retry
  } else {
    dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
    close_op_ctx(ctx.release());
    osd->logger->inc(l_osd_tier_try_flush_fail);
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
    return -ECANCELED;
  }

  if (fop->on_flush) {
    ctx->register_on_finish(*(fop->on_flush));
    fop->on_flush = std::nullopt;
  }

  ctx->at_version = get_next_version();

  ctx->new_obs = obc->obs;
  ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
  --ctx->delta_stats.num_objects_dirty;
  if (fop->obc->obs.oi.has_manifest()) {
    // manifest object: flushed data lives in the dedup/base tier now,
    // so drop the local payload and fix up chunk flags accordingly
    ceph_assert(obc->obs.oi.manifest.is_chunked());
    PGTransaction* t = ctx->op_t.get();
    uint64_t chunks_size = 0;
    for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
      chunks_size += p.second.length;
    }
    if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
      t->omap_clear(oid);
      ctx->new_obs.oi.clear_omap_digest();
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
      ctx->clean_regions.mark_omap_dirty();
    }
    if (obc->obs.oi.size == chunks_size) {
      // entire object is covered by chunks: truncate local copy to zero
      // and mark every chunk MISSING (must be re-promoted on read)
      t->truncate(oid, 0);
      interval_set<uint64_t> trim;
      trim.insert(0, ctx->new_obs.oi.size);
      ctx->modified_ranges.union_of(trim);
      truncate_update_size_and_usage(ctx->delta_stats,
				     ctx->new_obs.oi,
				     0);
      ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
      ctx->new_obs.oi.new_object();
      for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
	p.second.set_flag(chunk_info_t::FLAG_MISSING);
      }
    } else {
      // partially chunked: keep local data; chunks are now CLEAN
      for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
	dout(20) << __func__ << " offset: " << p.second.offset
		 << " length: " << p.second.length << dendl;
	p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
      }
    }
  }

  finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);

  osd->logger->inc(l_osd_tier_clean);

  if (!fop->dup_ops.empty() || fop->op) {
    dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
    list<OpRequestRef> ls;
    if (fop->op)
      ls.push_back(fop->op);
    ls.splice(ls.end(), fop->dup_ops);
    requeue_ops(ls);
  }

  simple_opc_submit(std::move(ctx));

  flush_ops.erase(oid);

  if (fop->blocking)
    osd->logger->inc(l_osd_tier_flush);
  else
    osd->logger->inc(l_osd_tier_try_flush);

  return -EINPROGRESS;
}
10866
94b18763
FG
10867void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
10868 vector<ceph_tid_t> *tids)
7c673cae
FG
10869{
10870 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
10871 << fop->objecter_tid << dendl;
10872 if (fop->objecter_tid) {
94b18763 10873 tids->push_back(fop->objecter_tid);
7c673cae
FG
10874 fop->objecter_tid = 0;
10875 }
94b18763
FG
10876 if (fop->io_tids.size()) {
10877 for (auto &p : fop->io_tids) {
10878 tids->push_back(p.second);
10879 p.second = 0;
f67539c2 10880 }
94b18763
FG
10881 }
10882 if (fop->blocking && fop->obc->is_blocked()) {
7c673cae
FG
10883 fop->obc->stop_block();
10884 kick_object_context_blocked(fop->obc);
10885 }
10886 if (requeue) {
10887 if (fop->op)
10888 requeue_op(fop->op);
10889 requeue_ops(fop->dup_ops);
10890 }
10891 if (fop->on_flush) {
10892 (*(fop->on_flush))();
9f95a23c 10893 fop->on_flush = std::nullopt;
7c673cae
FG
10894 }
10895 flush_ops.erase(fop->obc->obs.oi.soid);
10896}
10897
94b18763 10898void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
10899{
10900 dout(10) << __func__ << dendl;
10901 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
10902 while (p != flush_ops.end()) {
94b18763 10903 cancel_flush((p++)->second, requeue, tids);
7c673cae
FG
10904 }
10905}
10906
10907bool PrimaryLogPG::is_present_clone(hobject_t coid)
10908{
10909 if (!pool.info.allow_incomplete_clones())
10910 return true;
10911 if (is_missing_object(coid))
10912 return true;
10913 ObjectContextRef obc = get_object_context(coid, false);
10914 return obc && obc->obs.exists;
10915}
10916
10917// ========================================================================
11fdf7f2 10918// rep op gather
7c673cae
FG
10919
/// Completion context handed to the PG backend: fires when all replicas
/// have committed the transaction, and forwards to repop_all_committed().
/// Holds refs on both the PG and the RepGather to keep them alive.
class C_OSD_RepopCommit : public Context {
  PrimaryLogPGRef pg;
  boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
public:
  C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
    : pg(pg), repop(repop) {}
  void finish(int) override {
    pg->repop_all_committed(repop.get());
  }
};
10930
10931void PrimaryLogPG::repop_all_committed(RepGather *repop)
10932{
10933 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
10934 << dendl;
10935 repop->all_committed = true;
7c673cae
FG
10936 if (!repop->rep_aborted) {
10937 if (repop->v != eversion_t()) {
9f95a23c 10938 recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
7c673cae
FG
10939 }
10940 eval_repop(repop);
10941 }
10942}
10943
/**
 * Record that a local write has been applied up to applied_version, and
 * nudge the scrubber if it was waiting for writes to land.
 */
void PrimaryLogPG::op_applied(const eversion_t &applied_version)
{
  dout(10) << "op_applied version " << applied_version << dendl;
  ceph_assert(applied_version != eversion_t());
  ceph_assert(applied_version <= info.last_update);
  recovery_state.local_write_applied(applied_version);

  // scrub may be blocked waiting for applied updates to catch up
  if (is_primary() && m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) {
    osd->queue_scrub_applied_update(this, is_scrub_blocking_ops());
  }
}
10955
/**
 * Evaluate a repop's progress: once fully committed, fire its callbacks,
 * answer any duplicate (waiting_for_ondisk) clients, and drain completed
 * repops from the front of the queue in order.
 */
void PrimaryLogPG::eval_repop(RepGather *repop)
{
  #ifdef HAVE_JAEGER
  if (repop->op->osd_parent_span) {
    auto eval_span = jaeger_tracing::child_span(__func__, repop->op->osd_parent_span);
  }
  #endif
  dout(10) << "eval_repop " << *repop
	   << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;

  // ondisk?
  if (repop->all_committed) {
    dout(10) << " commit: " << *repop << dendl;
    // fire-and-erase: erase(p++) advances before invalidation
    for (auto p = repop->on_committed.begin();
	 p != repop->on_committed.end();
	 repop->on_committed.erase(p++)) {
      (*p)();
    }
    // send dup commits, in order
    auto it = waiting_for_ondisk.find(repop->v);
    if (it != waiting_for_ondisk.end()) {
      ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
      for (auto& i : it->second) {
	int return_code = repop->r;
	if (return_code >= 0) {
	  return_code = std::get<2>(i);
	}
	osd->reply_op_error(std::get<0>(i), return_code, repop->v,
			    std::get<1>(i), std::get<3>(i));
      }
      waiting_for_ondisk.erase(it);
    }

    publish_stats_to_osd();

    dout(10) << " removing " << *repop << dendl;
    ceph_assert(!repop_queue.empty());
    dout(20) << "   q front is " << *repop_queue.front() << dendl;
    // only drain from the front so completions remain strictly ordered
    if (repop_queue.front() == repop) {
      RepGather *to_remove = nullptr;
      while (!repop_queue.empty() &&
	     (to_remove = repop_queue.front())->all_committed) {
	repop_queue.pop_front();
	for (auto p = to_remove->on_success.begin();
	     p != to_remove->on_success.end();
	     to_remove->on_success.erase(p++)) {
	  (*p)();
	}
	remove_repop(to_remove);
      }
    }
  }
}
11009
/**
 * Hand a prepared op context to the PG backend for replication: pin the
 * involved obcs to the transaction, advance the projected log, and submit.
 * C_OSD_RepopCommit will call back into repop_all_committed() when every
 * replica has committed.
 */
void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
{
  FUNCTRACE(cct);
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(7) << "issue_repop rep_tid " << repop->rep_tid
	  << " o " << soid
	  << dendl;
#ifdef HAVE_JAEGER
  if (ctx->op->osd_parent_span) {
    auto issue_repop_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
  }
#endif

  repop->v = ctx->at_version;

  // keep the obcs alive (and their locks held) until the txn completes
  ctx->op_t->add_obc(ctx->obc);
  if (ctx->clone_obc) {
    ctx->op_t->add_obc(ctx->clone_obc);
  }
  if (ctx->head_obc) {
    ctx->op_t->add_obc(ctx->head_obc);
  }

  Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
  if (!(ctx->log.empty())) {
    ceph_assert(ctx->at_version >= projected_last_update);
    projected_last_update = ctx->at_version;
  }
  // make the new entries visible to reads-from-projected-log immediately
  for (auto &&entry: ctx->log) {
    projected_log.add(entry);
  }

  recovery_state.pre_submit_op(
    soid,
    ctx->log,
    ctx->at_version);
  pgbackend->submit_transaction(
    soid,
    ctx->delta_stats,
    ctx->at_version,
    std::move(ctx->op_t),
    recovery_state.get_pg_trim_to(),
    recovery_state.get_min_last_complete_ondisk(),
    std::move(ctx->log),
    ctx->updated_hset_history,
    on_all_commit,
    repop->rep_tid,
    ctx->reqid,
    ctx->op);
}
11060
/**
 * Allocate a RepGather for a client-op-driven write and enqueue it.
 * The returned raw pointer carries an extra ref (repop->get()) which the
 * caller must eventually release via repop->put().
 *
 * NOTE: the obc parameter is currently unused; kept for signature
 * compatibility with callers.
 */
PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
  OpContext *ctx, ObjectContextRef obc,
  ceph_tid_t rep_tid)
{
  if (ctx->op)
    dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
  else
    dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;

  RepGather *repop = new RepGather(
    ctx, rep_tid, info.last_complete);

  repop->start = ceph_clock_now();

  repop_queue.push_back(&repop->queue_item);
  repop->get();   // extra ref for the caller

  osd->logger->inc(l_osd_op_wip);

  dout(10) << __func__ << ": " << *repop << dendl;
  return repop;
}
11083
/**
 * Allocate a RepGather for a log-entry-only update (no OpContext), e.g.
 * submit_log_entries().  Ownership is returned via intrusive_ptr, so no
 * extra manual ref is taken here (unlike the OpContext overload).
 */
boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
  eversion_t version,
  int r,
  ObcLockManager &&manager,
  OpRequestRef &&op,
  std::optional<std::function<void(void)> > &&on_complete)
{
  RepGather *repop = new RepGather(
    std::move(manager),
    std::move(op),
    std::move(on_complete),
    osd->get_tid(),
    info.last_complete,
    r);
  repop->v = version;

  repop->start = ceph_clock_now();

  repop_queue.push_back(&repop->queue_item);

  osd->logger->inc(l_osd_op_wip);

  dout(10) << __func__ << ": " << *repop << dendl;
  return boost::intrusive_ptr<RepGather>(repop);
}
f67539c2 11109
7c673cae
FG
11110void PrimaryLogPG::remove_repop(RepGather *repop)
11111{
11112 dout(20) << __func__ << " " << *repop << dendl;
11113
11114 for (auto p = repop->on_finish.begin();
11115 p != repop->on_finish.end();
11116 repop->on_finish.erase(p++)) {
11117 (*p)();
11118 }
11119
11120 release_object_locks(
11121 repop->lock_manager);
11122 repop->put();
11123
11124 osd->logger->dec(l_osd_op_wip);
11125}
11126
11127PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
11128{
11129 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
7c673cae
FG
11130 ceph_tid_t rep_tid = osd->get_tid();
11131 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
c07f9fc5 11132 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
7c673cae
FG
11133 ctx->op_t.reset(new PGTransaction());
11134 ctx->mtime = ceph_clock_now();
11135 return ctx;
11136}
11137
/**
 * Submit an internally generated OpContext: wrap it in a repop, replicate,
 * evaluate, then drop the extra ref taken by new_repop().  Order matters:
 * issue before eval, and update_trim_to after the log has advanced.
 */
void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
{
  RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
  dout(20) << __func__ << " " << repop << dendl;
  issue_repop(repop, ctx.get());
  eval_repop(repop);
  recovery_state.update_trim_to();
  repop->put();
}
11147
11148
/**
 * Replicate a set of pg log entries (with no associated object write) to
 * all acting/recovery/backfill shards, e.g. for lost/revert or error
 * entries.  Completion (local commit + all peer acks, tracked via
 * log_entry_update_waiting_on) drives repop_all_committed().
 *
 * @param entries       log entries to merge and replicate
 * @param manager       object locks to hold until the repop completes
 * @param _on_complete  optional callback fired when the update completes
 * @param op            originating client op, if any
 * @param r             result code to store on the repop
 */
void PrimaryLogPG::submit_log_entries(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObcLockManager &&manager,
  std::optional<std::function<void(void)> > &&_on_complete,
  OpRequestRef op,
  int r)
{
  dout(10) << __func__ << " " << entries << dendl;
  ceph_assert(is_primary());

  eversion_t version;
  if (!entries.empty()) {
    ceph_assert(entries.rbegin()->version >= projected_last_update);
    version = projected_last_update = entries.rbegin()->version;
  }

  boost::intrusive_ptr<RepGather> repop;
  std::optional<std::function<void(void)> > on_complete;
  if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
    repop = new_repop(
      version,
      r,
      std::move(manager),
      std::move(op),
      std::move(_on_complete));
  } else {
    on_complete = std::move(_on_complete);
  }

  // defer the actual work until all prior writes have been ordered
  pgbackend->call_write_ordered(
    [this, entries, repop, on_complete]() {
      ObjectStore::Transaction t;
      eversion_t old_last_update = info.last_update;
      recovery_state.merge_new_log_entries(
	entries, t, recovery_state.get_pg_trim_to(),
	recovery_state.get_min_last_complete_ondisk());

      set<pg_shard_t> waiting_on;
      for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
	   i != get_acting_recovery_backfill().end();
	   ++i) {
	pg_shard_t peer(*i);
	if (peer == pg_whoami) continue;
	ceph_assert(recovery_state.get_peer_missing().count(peer));
	ceph_assert(recovery_state.has_peer_info(peer));
	if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
	  ceph_assert(repop);
	  MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
	    entries,
	    spg_t(info.pgid.pgid, i->shard),
	    pg_whoami.shard,
	    get_osdmap_epoch(),
	    get_last_peering_reset(),
	    repop->rep_tid,
	    recovery_state.get_pg_trim_to(),
	    recovery_state.get_min_last_complete_ondisk());
	  osd->send_message_osd_cluster(
	    peer.osd, m, get_osdmap_epoch());
	  waiting_on.insert(peer);
	} else {
	  // legacy pre-jewel peers get a full MOSDPGLog instead
	  MOSDPGLog *m = new MOSDPGLog(
	    peer.shard, pg_whoami.shard,
	    info.last_update.epoch,
	    info, get_last_peering_reset());
	  m->log.log = entries;
	  m->log.tail = old_last_update;
	  m->log.head = info.last_update;
	  osd->send_message_osd_cluster(
	    peer.osd, m, get_osdmap_epoch());
	}
      }
      // NOTE(review): this dereferences repop unconditionally, so the
      // pre-jewel (repop == null) branch above appears unreachable in
      // practice on this release -- confirm before backporting.
      ceph_tid_t rep_tid = repop->rep_tid;
      waiting_on.insert(pg_whoami);
      log_entry_update_waiting_on.insert(
	make_pair(
	  rep_tid,
	  LogUpdateCtx{std::move(repop), std::move(waiting_on)}
	  ));
      // local-commit callback: clears our own shard from waiting_on and
      // completes the repop when every shard has acked
      struct OnComplete : public Context {
	PrimaryLogPGRef pg;
	ceph_tid_t rep_tid;
	epoch_t epoch;
	OnComplete(
	  PrimaryLogPGRef pg,
	  ceph_tid_t rep_tid,
	  epoch_t epoch)
	  : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
	void finish(int) override {
	  std::scoped_lock l{*pg};
	  if (!pg->pg_has_reset_since(epoch)) {
	    auto it = pg->log_entry_update_waiting_on.find(rep_tid);
	    ceph_assert(it != pg->log_entry_update_waiting_on.end());
	    auto it2 = it->second.waiting_on.find(pg->pg_whoami);
	    ceph_assert(it2 != it->second.waiting_on.end());
	    it->second.waiting_on.erase(it2);
	    if (it->second.waiting_on.empty()) {
	      pg->repop_all_committed(it->second.repop.get());
	      pg->log_entry_update_waiting_on.erase(it);
	    }
	  }
	}
      };
      t.register_on_commit(
	new OnComplete{this, rep_tid, get_osdmap_epoch()});
      int r = osd->store->queue_transaction(ch, std::move(t), NULL);
      ceph_assert(r == 0);
      op_applied(info.last_update);
    });

  recovery_state.update_trim_to();
}
11260
/// Drop all in-flight log-entry updates (e.g. on interval change).
void PrimaryLogPG::cancel_log_updates()
{
  // get rid of all the LogUpdateCtx so their references to repops are
  // dropped
  log_entry_update_waiting_on.clear();
}
11267
11268// -------------------------------------------------------
11269
11fdf7f2 11270void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
7c673cae 11271{
9f95a23c 11272 std::scoped_lock l{*this};
7c673cae
FG
11273 pair<hobject_t, ObjectContextRef> i;
11274 while (object_contexts.get_next(i.first, &i)) {
11275 ObjectContextRef obc(i.second);
11fdf7f2 11276 get_obc_watchers(obc, *ls);
7c673cae
FG
11277 }
11278}
11279
11280void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
11281{
11282 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11283 obc->watchers.begin();
11284 j != obc->watchers.end();
11285 ++j) {
11286 obj_watch_item_t owi;
11287
11288 owi.obj = obc->obs.oi.soid;
11289 owi.wi.addr = j->second->get_peer_addr();
11290 owi.wi.name = j->second->get_entity();
11291 owi.wi.cookie = j->second->get_cookie();
11292 owi.wi.timeout_seconds = j->second->get_timeout();
11293
11294 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
11295 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
11296
11297 pg_watchers.push_back(owi);
11298 }
11299}
11300
// Scan every cached object context for watchers whose client address
// is now blocklisted, and drop them (see check_blocklisted_obc_watchers).
void PrimaryLogPG::check_blocklisted_watchers()
{
  dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
  pair<hobject_t, ObjectContextRef> i;
  while (object_contexts.get_next(i.first, &i))
    check_blocklisted_obc_watchers(i.second);
}
11308
// Drop any watcher on this obc whose peer address is blocklisted in the
// current OSDMap, by unregistering its timeout callback and forcing the
// timeout path immediately.
void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
{
  dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
  for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
	 obc->watchers.begin();
       k != obc->watchers.end();
       ) {
    //Advance iterator now so handle_watch_timeout() can erase element
    map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
    dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
    entity_addr_t ea = j->second->get_peer_addr();
    dout(30) << "watch: Check entity_addr_t " << ea << dendl;
    if (get_osdmap()->is_blocklisted(ea)) {
      dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
      ceph_assert(j->second->get_pg() == this);
      // Cancel the pending timer first so handle_watch_timeout() is the
      // only path that completes this watch.
      j->second->unregister_cb();
      handle_watch_timeout(j->second);
    }
  }
}
11329
// Instantiate in-memory Watch state for every watcher persisted in the
// object_info of this obc. Called on the primary of an active PG when
// an obc is (re)created; all watchers start in the disconnected state
// and will expire unless the client reconnects.
void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
{
  ceph_assert(is_primary() && is_active());
  auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
  // The object must be recovering or not missing, unless this is a
  // LOST_REVERT back to exactly this version (see recover_primary()).
  ceph_assert((recovering.count(obc->obs.oi.soid) ||
	       !is_missing_object(obc->obs.oi.soid)) ||
	      (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
	       it_objects->second->op ==
	       pg_log_entry_t::LOST_REVERT &&
	       it_objects->second->reverting_to ==
	       obc->obs.oi.version));

  dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
  ceph_assert(obc->watchers.empty());
  // populate unconnected_watchers
  for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
	 obc->obs.oi.watchers.begin();
       p != obc->obs.oi.watchers.end();
       ++p) {
    // Expiry clock starts from when the PG last became active, since
    // the client could not have been connected before that.
    utime_t expire = info.stats.last_became_active;
    expire += p->second.timeout_seconds;
    dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
    WatchRef watch(
      Watch::makeWatchRef(
	this, osd, obc, p->second.timeout_seconds, p->first.first,
	p->first.second, p->second.addr));
    // disconnect() arms the timeout; a reconnect from the client clears it.
    watch->disconnect();
    obc->watchers.insert(
      make_pair(
	make_pair(p->first.first, p->first.second),
	watch));
  }
  // Look for watchers from blocklisted clients and drop
  check_blocklisted_obc_watchers(obc);
}
11365
// A watch has timed out: remove it from the object_info's persistent
// watcher set via an internally-generated MODIFY op. Bails out (or
// defers via the watch's delayed callback) if the PG is inactive, the
// object is gone, degraded/backfilling, or write-blocked by scrub.
void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
{
  ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
  dout(10) << "handle_watch_timeout obc " << obc << dendl;

  if (!is_active()) {
    dout(10) << "handle_watch_timeout not active, no-op" << dendl;
    return;
  }
  if (!obc->obs.exists) {
    dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
    return;
  }
  if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
    // Retry once the object is fully recovered.
    callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
      watch->get_delayed_cb()
      );
    dout(10) << "handle_watch_timeout waiting for degraded on obj "
	     << obc->obs.oi.soid
	     << dendl;
    return;
  }

  if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
    // Retry once scrub releases the object.
    dout(10) << "handle_watch_timeout waiting for scrub on obj "
	     << obc->obs.oi.soid
	     << dendl;
    m_scrubber->add_callback(
      watch->get_delayed_cb() // This callback!
      );
    return;
  }

  OpContextUPtr ctx = simple_opc_create(obc);
  ctx->at_version = get_next_version();

  // Drop this watcher from the new object state.
  object_info_t& oi = ctx->new_obs.oi;
  oi.watchers.erase(make_pair(watch->get_cookie(),
			      watch->get_entity()));

  list<watch_disconnect_t> watch_disconnects = {
    watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
  };
  // Tear down the in-memory watch only after the update succeeds.
  ctx->register_on_success(
    [this, obc, watch_disconnects]() {
      complete_disconnect_watches(obc, watch_disconnects);
    });


  PGTransaction *t = ctx->op_t.get();
  ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
				    ctx->at_version,
				    oi.version,
				    0,
				    osd_reqid_t(), ctx->mtime, 0));

  oi.prior_version = obc->obs.oi.version;
  oi.version = ctx->at_version;
  bufferlist bl;
  encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
  t->setattr(obc->obs.oi.soid, OI_ATTR, bl);

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  // no ctx->delta_stats
  simple_opc_submit(std::move(ctx));
}
11434
// Create (or fetch from the shared cache) an obc for a not-yet-existing
// object, seeding it with the given object_info and snapset context.
// Note obs.exists is set to false: the caller is expected to create the
// object. Takes ownership of a ref on ssc via register_snapset_context.
ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
						     SnapSetContext *ssc)
{
  ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
  ceph_assert(obc->destructor_callback == NULL);
  obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
  obc->obs.oi = oi;
  obc->obs.exists = false;
  obc->ssc = ssc;
  if (ssc)
    register_snapset_context(ssc);
  dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
  if (is_active())
    populate_obc_watchers(obc);
  return obc;
}
11451
// Look up (or build) the object context for soid. Order of attempts:
//  1. the shared obc cache;
//  2. the supplied attrs (push/recovery path) or the OI_ATTR on disk;
//  3. if absent and can_create, a fresh context for a new object.
// Returns a null ref on missing/corrupt object info or missing snapset
// context (callers treat that as -ENOENT).
ObjectContextRef PrimaryLogPG::get_object_context(
  const hobject_t& soid,
  bool can_create,
  const map<string, bufferlist> *attrs)
{
  auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
  // Reading a missing object is only legal when attrs were handed to us,
  // or this is a LOST_REVERT (see recover_primary()).
  ceph_assert(
    attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
    // or this is a revert... see recover_primary()
    (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
     it_objects->second->op ==
     pg_log_entry_t::LOST_REVERT));
  ObjectContextRef obc = object_contexts.lookup(soid);
  osd->logger->inc(l_osd_object_ctx_cache_total);
  if (obc) {
    osd->logger->inc(l_osd_object_ctx_cache_hit);
    dout(10) << __func__ << ": found obc in cache: " << obc
	     << dendl;
  } else {
    dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
    // check disk
    bufferlist bv;
    if (attrs) {
      auto it_oi = attrs->find(OI_ATTR);
      ceph_assert(it_oi != attrs->end());
      bv = it_oi->second;
    } else {
      int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
      if (r < 0) {
	if (!can_create) {
	  dout(10) << __func__ << ": no obc for soid "
		   << soid << " and !can_create"
		   << dendl;
	  return ObjectContextRef();   // -ENOENT!
	}

	dout(10) << __func__ << ": no obc for soid "
		 << soid << " but can_create"
		 << dendl;
	// new object.
	object_info_t oi(soid);
	SnapSetContext *ssc = get_snapset_context(
	  soid, true, 0, false);
	ceph_assert(ssc);
	obc = create_object_context(oi, ssc);
	dout(10) << __func__ << ": " << obc << " " << soid
		 << " " << obc->rwstate
		 << " oi: " << obc->obs.oi
		 << " ssc: " << obc->ssc
		 << " snapset: " << obc->ssc->snapset << dendl;
	return obc;
      }
    }

    // Decode the object_info; a decode failure is treated as a missing
    // object rather than an assert.
    object_info_t oi;
    try {
      bufferlist::const_iterator bliter = bv.begin();
      decode(oi, bliter);
    } catch (...) {
      dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
      return ObjectContextRef();   // -ENOENT!
    }

    ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());

    obc = object_contexts.lookup_or_create(oi.soid);
    obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
    obc->obs.oi = oi;
    obc->obs.exists = true;

    // Only head/snapdir objects carry a snapset attr; for clones the
    // snapset is fetched from the head.
    obc->ssc = get_snapset_context(
      soid, true,
      soid.has_snapset() ? attrs : 0);

    if (is_primary() && is_active())
      populate_obc_watchers(obc);

    if (pool.info.is_erasure()) {
      // EC pools keep a full attr cache on the obc.
      if (attrs) {
	obc->attr_cache = *attrs;
      } else {
	int r = pgbackend->objects_get_attrs(
	  soid,
	  &obc->attr_cache);
	ceph_assert(r == 0);
      }
    }

    dout(10) << __func__ << ": creating obc from disk: " << obc
	     << dendl;
  }

  // XXX: Caller doesn't expect this
  if (obc->ssc == NULL) {
    derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
    return ObjectContextRef();   // -ENOENT!
  }

  dout(10) << __func__ << ": " << obc << " " << soid
	   << " " << obc->rwstate
	   << " oi: " << obc->obs.oi
	   << " exists: " << (int)obc->obs.exists
	   << " ssc: " << obc->ssc
	   << " snapset: " << obc->ssc->snapset << dendl;
  return obc;
}
11558
11559void PrimaryLogPG::context_registry_on_change()
11560{
11561 pair<hobject_t, ObjectContextRef> i;
11562 while (object_contexts.get_next(i.first, &i)) {
11563 ObjectContextRef obc(i.second);
11564 if (obc) {
11565 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11566 obc->watchers.begin();
11567 j != obc->watchers.end();
11568 obc->watchers.erase(j++)) {
11569 j->second->discard();
11570 }
11571 }
11572 }
11573}
11574
11575
/*
 * Resolve oid (possibly a snapshot read) to the object context that
 * backs it, consulting the head's snapset to map a snapid to the
 * correct clone.
 *
 * If we return an error, and set *pmissing, then promoting that
 * object may help.
 *
 * If we return -EAGAIN, we will always set *pmissing to the missing
 * object to wait for.
 *
 * If we return an error but do not set *pmissing, then we know the
 * object does not exist.
 */
int PrimaryLogPG::find_object_context(const hobject_t& oid,
				      ObjectContextRef *pobc,
				      bool can_create,
				      bool map_snapid_to_clone,
				      hobject_t *pmissing)
{
  FUNCTRACE(cct);
  ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
  // want the head?
  if (oid.snap == CEPH_NOSNAP) {
    ObjectContextRef obc = get_object_context(oid, can_create);
    if (!obc) {
      if (pmissing)
	*pmissing = oid;
      return -ENOENT;
    }
    dout(10) << __func__ << " " << oid
	     << " @" << oid.snap
	     << " oi=" << obc->obs.oi
	     << dendl;
    *pobc = obc;

    return 0;
  }

  // we want a snap

  hobject_t head = oid.get_head();
  SnapSetContext *ssc = get_snapset_context(oid, can_create);
  if (!ssc || !(ssc->exists || can_create)) {
    dout(20) << __func__ << " " << oid << " no snapset" << dendl;
    if (pmissing)
      *pmissing = head;  // start by getting the head
    if (ssc)
      put_snapset_context(ssc);
    return -ENOENT;
  }

  if (map_snapid_to_clone) {
    // Caller asked for the clone whose name IS this snapid (no snapid ->
    // clone interval search).
    dout(10) << __func__ << " " << oid << " @" << oid.snap
	     << " snapset " << ssc->snapset
	     << " map_snapid_to_clone=true" << dendl;
    if (oid.snap > ssc->snapset.seq) {
      // already must be readable
      ObjectContextRef obc = get_object_context(head, false);
      dout(10) << __func__ << " " << oid << " @" << oid.snap
	       << " snapset " << ssc->snapset
	       << " maps to head" << dendl;
      *pobc = obc;
      put_snapset_context(ssc);
      return (obc && obc->obs.exists) ? 0 : -ENOENT;
    } else {
      vector<snapid_t>::const_iterator citer = std::find(
	ssc->snapset.clones.begin(),
	ssc->snapset.clones.end(),
	oid.snap);
      if (citer == ssc->snapset.clones.end()) {
	dout(10) << __func__ << " " << oid << " @" << oid.snap
		 << " snapset " << ssc->snapset
		 << " maps to nothing" << dendl;
	put_snapset_context(ssc);
	return -ENOENT;
      }

      dout(10) << __func__ << " " << oid << " @" << oid.snap
	       << " snapset " << ssc->snapset
	       << " maps to " << oid << dendl;

      if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
	dout(10) << __func__ << " " << oid << " @" << oid.snap
		 << " snapset " << ssc->snapset
		 << " " << oid << " is missing" << dendl;
	if (pmissing)
	  *pmissing = oid;
	put_snapset_context(ssc);
	return -EAGAIN;
      }

      ObjectContextRef obc = get_object_context(oid, false);
      if (!obc || !obc->obs.exists) {
	dout(10) << __func__ << " " << oid << " @" << oid.snap
		 << " snapset " << ssc->snapset
		 << " " << oid << " is not present" << dendl;
	if (pmissing)
	  *pmissing = oid;
	put_snapset_context(ssc);
	return -ENOENT;
      }
      dout(10) << __func__ << " " << oid << " @" << oid.snap
	       << " snapset " << ssc->snapset
	       << " " << oid << " HIT" << dendl;
      *pobc = obc;
      put_snapset_context(ssc);
      return 0;
    }
    ceph_abort(); //unreachable
  }

  dout(10) << __func__ << " " << oid << " @" << oid.snap
	   << " snapset " << ssc->snapset << dendl;

  // head?
  if (oid.snap > ssc->snapset.seq) {
    // snapid is newer than the newest snapshotted write -> the head
    // itself serves the read.
    ObjectContextRef obc = get_object_context(head, false);
    dout(10) << __func__ << " " << head
	     << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
	     << " -- HIT " << obc->obs
	     << dendl;
    if (!obc->ssc)
      obc->ssc = ssc;
    else {
      ceph_assert(ssc == obc->ssc);
      put_snapset_context(ssc);
    }
    *pobc = obc;
    return 0;
  }

  // which clone would it be?
  unsigned k = 0;
  while (k < ssc->snapset.clones.size() &&
	 ssc->snapset.clones[k] < oid.snap)
    k++;
  if (k == ssc->snapset.clones.size()) {
    dout(10) << __func__ << " no clones with last >= oid.snap "
	     << oid.snap << " -- DNE" << dendl;
    put_snapset_context(ssc);
    return -ENOENT;
  }
  hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
		 info.pgid.pool(), oid.get_namespace());

  if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
    dout(20) << __func__ << " " << soid << " missing, try again later"
	     << dendl;
    if (pmissing)
      *pmissing = soid;
    put_snapset_context(ssc);
    return -EAGAIN;
  }

  ObjectContextRef obc = get_object_context(soid, false);
  if (!obc || !obc->obs.exists) {
    if (pmissing)
      *pmissing = soid;
    put_snapset_context(ssc);
    if (is_primary()) {
      if (is_degraded_or_backfilling_object(soid)) {
	dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
	return -EAGAIN;
      } else if (is_degraded_on_async_recovery_target(soid)) {
	dout(20) << __func__ << " clone is recovering " << soid << dendl;
	return -EAGAIN;
      } else {
	dout(20) << __func__ << " missing clone " << soid << dendl;
	return -ENOENT;
      }
    } else {
      dout(20) << __func__ << " replica missing clone" << soid << dendl;
      return -ENOENT;
    }
  }

  if (!obc->ssc) {
    obc->ssc = ssc;
  } else {
    ceph_assert(obc->ssc == ssc);
    put_snapset_context(ssc);
  }
  ssc = 0;

  // clone
  dout(20) << __func__ << " " << soid
	   << " snapset " << obc->ssc->snapset
	   << dendl;
  snapid_t first, last;  // NOTE(review): appears unused here — confirm
  auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
  ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
  if (p->second.empty()) {
    dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
    ceph_assert(!cct->_conf->osd_debug_verify_snaps);
    return -ENOENT;
  }
  // The requested snap must actually be one of this clone's snaps.
  if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
      p->second.end()) {
    dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
	     << " does not contain " << oid.snap << " -- DNE" << dendl;
    return -ENOENT;
  }
  if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
    dout(20) << __func__ << " " << soid << " snap " << oid.snap
	     << " in removed_snaps_queue" << " -- DNE" << dendl;
    return -ENOENT;
  }
  dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
	   << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
  *pobc = obc;
  return 0;
}
11785
11786void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
11787{
11788 if (obc->ssc)
11789 put_snapset_context(obc->ssc);
11790}
11791
// Fold the per-object stats implied by this obc (object counts, flags,
// byte usage) into *pgstat. For clones the byte count comes from the
// snapset's clone accounting rather than oi.size.
void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
{
  object_info_t& oi = obc->obs.oi;

  dout(10) << __func__ << " " << oi.soid << dendl;
  ceph_assert(!oi.soid.is_snapdir());

  object_stat_sum_t stat;
  stat.num_objects++;
  if (oi.is_dirty())
    stat.num_objects_dirty++;
  if (oi.is_whiteout())
    stat.num_whiteouts++;
  if (oi.is_omap())
    stat.num_objects_omap++;
  if (oi.is_cache_pinned())
    stat.num_objects_pinned++;
  if (oi.has_manifest())
    stat.num_objects_manifest++;

  if (oi.soid.is_snap()) {
    stat.num_object_clones++;

    // Lazily attach a snapset context if this obc doesn't have one.
    if (!obc->ssc)
      obc->ssc = get_snapset_context(oi.soid, false);
    ceph_assert(obc->ssc);
    stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
  } else {
    stat.num_bytes += oi.size;
  }

  // add it in
  pgstat->stats.sum.add(stat);
}
11826
// Called when an obc becomes unblocked: requeue ops that were waiting on
// it, clear any snap-promotion block on its head, and requeue scrub if
// it was waiting on this object.
void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
{
  const hobject_t& soid = obc->obs.oi.soid;
  if (obc->is_blocked()) {
    dout(10) << __func__ << " " << soid << " still blocked" << dendl;
    return;
  }

  map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
  if (p != waiting_for_blocked_object.end()) {
    list<OpRequestRef>& ls = p->second;
    dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
    requeue_ops(ls);
    waiting_for_blocked_object.erase(p);
  }

  map<hobject_t, ObjectContextRef>::iterator i =
    objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
  if (i != objects_blocked_on_snap_promotion.end()) {
    ceph_assert(i->second == obc);
    objects_blocked_on_snap_promotion.erase(i);
  }

  if (obc->requeue_scrub_on_unblock) {

    obc->requeue_scrub_on_unblock = false;

    dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;

    // only requeue if we are still active: we may be unblocking
    // because we are resetting for a new peering interval
    if (is_active()) {
      osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
    }
  }
}
11863
// Look up (or load/create) the SnapSetContext for oid's snapdir key,
// taking a reference on it. Returns NULL when the snapset doesn't exist
// and can_create is false, or when an on-disk snapset fails to decode.
// attrs, when supplied, provides SS_ATTR directly (recovery path);
// oid_existed=false lets us skip a pointless disk read for a new head.
SnapSetContext *PrimaryLogPG::get_snapset_context(
  const hobject_t& oid,
  bool can_create,
  const map<string, bufferlist> *attrs,
  bool oid_existed)
{
  std::lock_guard l(snapset_contexts_lock);
  SnapSetContext *ssc;
  map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
    oid.get_snapdir());
  if (p != snapset_contexts.end()) {
    if (can_create || p->second->exists) {
      ssc = p->second;
    } else {
      return NULL;
    }
  } else {
    bufferlist bv;
    if (!attrs) {
      int r = -ENOENT;
      // Skip the attr read for a head we already know doesn't exist.
      if (!(oid.is_head() && !oid_existed)) {
	r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
      }
      if (r < 0 && !can_create)
	return NULL;
    } else {
      auto it_ss = attrs->find(SS_ATTR);
      ceph_assert(it_ss != attrs->end());
      bv = it_ss->second;
    }
    ssc = new SnapSetContext(oid.get_snapdir());
    _register_snapset_context(ssc);
    if (bv.length()) {
      bufferlist::const_iterator bvp = bv.begin();
      try {
	ssc->snapset.decode(bvp);
      } catch (const ceph::buffer::error& e) {
	dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
	return NULL;
      }
      ssc->exists = true;
    } else {
      ssc->exists = false;
    }
  }
  ceph_assert(ssc);
  // Caller owns this reference; release with put_snapset_context().
  ssc->ref++;
  return ssc;
}
11913
11914void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
11915{
11fdf7f2 11916 std::lock_guard l(snapset_contexts_lock);
7c673cae
FG
11917 --ssc->ref;
11918 if (ssc->ref == 0) {
11919 if (ssc->registered)
11920 snapset_contexts.erase(ssc->oid);
11921 delete ssc;
11922 }
11923}
11924
7c673cae
FG
/*
 * Return values:
 * NONE - didn't pull anything
 * YES - pulled what the caller wanted
 * HEAD - needed to pull head first
 */
enum { PULL_NONE, PULL_HEAD, PULL_YES };

// Start recovery of a single missing object at version v. Handles the
// delete-propagation case, and for clones ensures the head is recovered
// first (returning PULL_HEAD when the head pull was queued instead).
int PrimaryLogPG::recover_missing(
  const hobject_t &soid, eversion_t v,
  int priority,
  PGBackend::RecoveryHandle *h)
{
  if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << __func__ << " " << soid
	    << " v " << v
	    << " but it is unfound" << dendl;
    return PULL_NONE;
  }

  if (recovery_state.get_missing_loc().is_deleted(soid)) {
    // The authoritative version of this object is a delete: remove it
    // locally, then either finish or propagate the delete to replicas
    // that still have/miss it.
    start_recovery_op(soid);
    ceph_assert(!recovering.count(soid));
    recovering.insert(make_pair(soid, ObjectContextRef()));
    epoch_t cur_epoch = get_osdmap_epoch();
    remove_missing_object(soid, v, new LambdaContext(
     [=](int) {
       std::scoped_lock locker{*this};
       if (!pg_has_reset_since(cur_epoch)) {
	 bool object_missing = false;
	 for (const auto& shard : get_acting_recovery_backfill()) {
	   if (shard == pg_whoami)
	     continue;
	   if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
	     dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
	     object_missing = true;
	     break;
	   }
	 }
	 if (!object_missing) {
	   object_stat_sum_t stat_diff;
	   stat_diff.num_objects_recovered = 1;
	   if (scrub_after_recovery)
	     stat_diff.num_objects_repaired = 1;
	   on_global_recover(soid, stat_diff, true);
	 } else {
	   auto recovery_handle = pgbackend->open_recovery_op();
	   pgbackend->recover_delete_object(soid, v, recovery_handle);
	   pgbackend->run_recovery_op(recovery_handle, priority);
	 }
       }
     }));
    return PULL_YES;
  }

  // is this a snapped object?  if so, consult the snapset.. we may not need the entire object!
  ObjectContextRef obc;
  ObjectContextRef head_obc;
  if (soid.snap && soid.snap < CEPH_NOSNAP) {
    // do we have the head?
    hobject_t head = soid.get_head();
    if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
      if (recovering.count(head)) {
	dout(10) << " missing but already recovering head " << head << dendl;
	return PULL_NONE;
      } else {
	int r = recover_missing(
	  head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
	  h);
	if (r != PULL_NONE)
	  return PULL_HEAD;
	return PULL_NONE;
      }
    }
    head_obc = get_object_context(
      head,
      false,
      0);
    ceph_assert(head_obc);
  }
  start_recovery_op(soid);
  ceph_assert(!recovering.count(soid));
  recovering.insert(make_pair(soid, obc));
  int r = pgbackend->recover_object(
    soid,
    v,
    head_obc,
    obc,
    h);
  // This is only a pull which shouldn't return an error
  ceph_assert(r >= 0);
  return PULL_YES;
}
12018
// Delete soid locally (including its snap mapping), then record the
// local recovery of the delete. on_complete fires after the second
// transaction commits, or with -EAGAIN if the PG reset in between.
void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
					 eversion_t v, Context *on_complete)
{
  dout(20) << __func__ << " " << soid << " " << v << dendl;
  ceph_assert(on_complete != nullptr);
  // delete locally
  ObjectStore::Transaction t;
  remove_snap_mapped_object(t, soid);

  ObjectRecoveryInfo recovery_info;
  recovery_info.soid = soid;
  recovery_info.version = v;

  epoch_t cur_epoch = get_osdmap_epoch();
  t.register_on_complete(new LambdaContext(
    [=](int) {
      std::unique_lock locker{*this};
      if (!pg_has_reset_since(cur_epoch)) {
	// Record the delete as locally recovered in a follow-up
	// transaction, chaining on_complete to its completion.
	ObjectStore::Transaction t2;
	on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
	t2.register_on_complete(on_complete);
	int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
	ceph_assert(r == 0);
	locker.unlock();
      } else {
	locker.unlock();
	on_complete->complete(-EAGAIN);
      }
    }));
  int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
  ceph_assert(r == 0);
}
7c673cae 12051
eafe8130 12052void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
7c673cae 12053{
11fdf7f2 12054 dout(10) << __func__ << " " << oid << dendl;
7c673cae
FG
12055 if (callbacks_for_degraded_object.count(oid)) {
12056 list<Context*> contexts;
12057 contexts.swap(callbacks_for_degraded_object[oid]);
12058 callbacks_for_degraded_object.erase(oid);
12059 for (list<Context*>::iterator i = contexts.begin();
12060 i != contexts.end();
12061 ++i) {
12062 (*i)->complete(0);
12063 }
12064 }
12065 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
12066 oid.get_head());
12067 if (i != objects_blocked_on_degraded_snap.end() &&
12068 i->second == oid.snap)
12069 objects_blocked_on_degraded_snap.erase(i);
12070}
12071
12072void PrimaryLogPG::_committed_pushed_object(
12073 epoch_t epoch, eversion_t last_complete)
12074{
9f95a23c 12075 std::scoped_lock locker{*this};
7c673cae 12076 if (!pg_has_reset_since(epoch)) {
9f95a23c 12077 recovery_state.recovery_committed_to(last_complete);
7c673cae 12078 } else {
9f95a23c
TL
12079 dout(10) << __func__
12080 << " pg has changed, not touching last_complete_ondisk" << dendl;
7c673cae 12081 }
7c673cae
FG
12082}
12083
// A recovered object has been applied on the primary: drop the active
// push count and, if that was the last in-flight push while a scrub is
// active, nudge the scrubber to resume.
void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
{
  dout(20) << __func__ << dendl;
  if (obc) {
    dout(20) << "obc = " << *obc << dendl;
  }
  ceph_assert(active_pushes >= 1);
  --active_pushes;

  // requeue an active chunky scrub waiting on recovery ops
  if (!recovery_state.is_deleting() && active_pushes == 0 &&
      m_scrubber->is_scrub_active()) {

    osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
  }
}
12100
// Replica-side counterpart of _applied_recovered_object: drop the push
// count and requeue the replica scrub work when the last push drains.
void PrimaryLogPG::_applied_recovered_object_replica()
{
  dout(20) << __func__ << dendl;
  ceph_assert(active_pushes >= 1);
  --active_pushes;

  // requeue an active scrub waiting on recovery ops
  if (!recovery_state.is_deleting() && active_pushes == 0 &&
      m_scrubber->is_scrub_active()) {

    osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
  }
}
12114
// A pull of soid from the given shards failed: unwind the in-flight
// recovery (requeue blocked reads, mark the object missing on the
// failed peers) and, if we ourselves were a source, record a primary
// error.
void PrimaryLogPG::on_failed_pull(
  const set<pg_shard_t> &from,
  const hobject_t &soid,
  const eversion_t &v)
{
  dout(20) << __func__ << ": " << soid << dendl;
  ceph_assert(recovering.count(soid));
  auto obc = recovering[soid];
  if (obc) {
    // Release any reads that were parked on the recovery lock.
    list<OpRequestRef> blocked_ops;
    obc->drop_recovery_read(&blocked_ops);
    requeue_ops(blocked_ops);
  }
  recovering.erase(soid);
  for (auto&& i : from) {
    if (i != pg_whoami) { // we'll get it below in primary_error
      recovery_state.force_object_missing(i, soid, v);
    }
  }

  dout(0) << __func__ << " " << soid << " from shard " << from
	  << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
	  << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
	  << dendl;
  finish_recovery_op(soid);  // close out this attempt,
  finish_degraded_object(soid);

  if (from.count(pg_whoami)) {
    dout(0) << " primary missing oid " << soid << " version " << v << dendl;
    primary_error(soid, v);
    backfills_in_flight.erase(soid);
  }
}
12148
// For an object missing everywhere in the acting/recovery/backfill set,
// return the newest version any shard still "has" — the best candidate
// to revert to. Only peers that also list the object as missing
// contribute their 'have' version; other peers are skipped.
eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
{
  eversion_t v;
  pg_missing_item pmi;
  bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
  ceph_assert(is_missing);
  v = pmi.have;
  dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;

  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
       i != get_acting_recovery_backfill().end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
      continue;
    }
    eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
    dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
    if (h > v)
      v = h;
  }

  dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
  return v;
}
12176
// Replica-side handler for MSG_OSD_PG_UPDATE_LOG_MISSING: append the
// primary's log entries / missing-set updates locally and reply once the
// transaction is durable.
void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
{
  const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
    op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
  ObjectStore::Transaction t;
  // eversion_t() is used as "not set" sentinel on the wire; translate to
  // std::nullopt for the recovery_state API.
  std::optional<eversion_t> op_trim_to, op_roll_forward_to;
  if (m->pg_trim_to != eversion_t())
    op_trim_to = m->pg_trim_to;
  if (m->pg_roll_forward_to != eversion_t())
    op_roll_forward_to = m->pg_roll_forward_to;

  dout(20) << __func__
	   << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;

  recovery_state.append_log_entries_update_missing(
    m->entries, t, op_trim_to, op_roll_forward_to);
  // capture last_complete as of now; reported back to the primary in the reply
  eversion_t new_lcod = info.last_complete;

  // NOTE: [=] copies `op` (an OpRequestRef) and new_lcod into the closure,
  // keeping the request alive until the commit callback fires.
  Context *complete = new LambdaContext(
    [=](int) {
      const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
	op->get_req());
      std::scoped_lock locker{*this};
      // only reply if the PG has not been reset since the request's epoch;
      // a reset means the primary no longer expects this reply
      if (!pg_has_reset_since(msg->get_epoch())) {
	update_last_complete_ondisk(new_lcod);
	MOSDPGUpdateLogMissingReply *reply =
	  new MOSDPGUpdateLogMissingReply(
	    spg_t(info.pgid.pgid, primary_shard().shard),
	    pg_whoami.shard,
	    msg->get_epoch(),
	    msg->min_epoch,
	    msg->get_tid(),
	    new_lcod);
	reply->set_priority(CEPH_MSG_PRIO_HIGH);
	msg->get_connection()->send_message(reply);
      }
    });

  if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
    t.register_on_commit(complete);
  } else {
    /* Hack to work around the fact that ReplicatedBackend sends
     * ack+commit if commit happens first
     *
     * This behavior is no longer necessary, but we preserve it so old
     * primaries can keep their repops in order */
    if (pool.info.is_erasure()) {
      t.register_on_complete(complete);
    } else {
      t.register_on_commit(complete);
    }
  }
  int tr = osd->store->queue_transaction(
    ch,
    std::move(t),
    nullptr);
  ceph_assert(tr == 0);
  op_applied(info.last_update);
}
12237
12238void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
12239{
12240 const MOSDPGUpdateLogMissingReply *m =
12241 static_cast<const MOSDPGUpdateLogMissingReply*>(
12242 op->get_req());
12243 dout(20) << __func__ << " got reply from "
12244 << m->get_from() << dendl;
12245
12246 auto it = log_entry_update_waiting_on.find(m->get_tid());
12247 if (it != log_entry_update_waiting_on.end()) {
12248 if (it->second.waiting_on.count(m->get_from())) {
12249 it->second.waiting_on.erase(m->get_from());
94b18763
FG
12250 if (m->last_complete_ondisk != eversion_t()) {
12251 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
12252 }
7c673cae
FG
12253 } else {
12254 osd->clog->error()
12255 << info.pgid << " got reply "
12256 << *m << " from shard we are not waiting for "
12257 << m->get_from();
12258 }
12259
12260 if (it->second.waiting_on.empty()) {
12261 repop_all_committed(it->second.repop.get());
12262 log_entry_update_waiting_on.erase(it);
12263 }
12264 } else {
12265 osd->clog->error()
12266 << info.pgid << " got reply "
12267 << *m << " on unknown tid " << m->get_tid();
12268 }
12269}
12270
/* Mark all unfound objects as lost.
 *
 * `what` selects the policy (pg_log_entry_t::LOST_MARK / LOST_REVERT /
 * LOST_DELETE).  For each unfound object we synthesize a log entry, then
 * submit them all in one batch; `on_finish` is invoked with the summary
 * string once the entries are durable.
 */
void PrimaryLogPG::mark_all_unfound_lost(
  int what,
  std::function<void(int,const std::string&,bufferlist&)> on_finish)
{
  dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
  list<hobject_t> oids;

  dout(30) << __func__ << ": log before:\n";
  recovery_state.get_pg_log().get_log().print(*_dout);
  *_dout << dendl;

  mempool::osd_pglog::list<pg_log_entry_t> log_entries;

  utime_t mtime = ceph_clock_now();
  map<hobject_t, pg_missing_item>::const_iterator m =
    recovery_state.get_missing_loc().get_needs_recovery().begin();
  map<hobject_t, pg_missing_item>::const_iterator mend =
    recovery_state.get_missing_loc().get_needs_recovery().end();

  ObcLockManager manager;
  // all synthesized entries share the current epoch; version is bumped
  // per entry below
  eversion_t v = get_next_version();
  v.epoch = get_osdmap_epoch();
  uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
  while (m != mend) {
    const hobject_t &oid(m->first);
    if (!recovery_state.get_missing_loc().is_unfound(oid)) {
      // We only care about unfound objects
      ++m;
      continue;
    }

    ObjectContextRef obc;
    eversion_t prev;

    switch (what) {
    case pg_log_entry_t::LOST_MARK:
      ceph_abort_msg("actually, not implemented yet!");
      break;

    case pg_log_entry_t::LOST_REVERT:
      prev = pick_newest_available(oid);
      if (prev > eversion_t()) {
	// log it
	pg_log_entry_t e(
	  pg_log_entry_t::LOST_REVERT, oid, v,
	  m->second.need, 0, osd_reqid_t(), mtime, 0);
	e.reverting_to = prev;
	e.mark_unrollbackable();
	log_entries.push_back(e);
	dout(10) << e << dendl;

	// we are now missing the new version; recovery code will sort it out.
	++v.version;
	++m;
	break;
      }
      // NOTE: intentional fallthrough — if no prior version is available
      // anywhere (prev == eversion_t()), a revert degenerates to a delete.

    case pg_log_entry_t::LOST_DELETE:
      {
	pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
			 0, osd_reqid_t(), mtime, 0);
	if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
	  if (pool.info.require_rollback()) {
	    e.mod_desc.try_rmobject(v.version);
	  } else {
	    e.mark_unrollbackable();
	  }
	} // otherwise, just do what we used to do
	dout(10) << e << dendl;
	log_entries.push_back(e);
	oids.push_back(oid);

	// If context found mark object as deleted in case
	// of racing with new creation. This can happen if
	// object lost and EIO at primary.
	obc = object_contexts.lookup(oid);
	if (obc)
	  obc->obs.exists = false;

	++v.version;
	++m;
      }
      break;

    default:
      ceph_abort();
    }
  }

  // stats are now stale until the next scrub/recovery pass
  recovery_state.update_stats(
    [](auto &history, auto &stats) {
      stats.stats_invalid = true;
      return false;
    });

  submit_log_entries(
    log_entries,
    std::move(manager),
    std::optional<std::function<void(void)> >(
      [this, oids, num_unfound, on_finish]() {
	if (recovery_state.perform_deletes_during_peering()) {
	  for (auto oid : oids) {
	    // clear old locations - merge_new_log_entries will have
	    // handled rebuilding missing_loc for each of these
	    // objects if we have the RECOVERY_DELETES flag
	    recovery_state.object_recovered(oid, object_stat_sum_t());
	  }
	}

	// kick the appropriate state machine now that the unfound set shrank
	if (is_recovery_unfound()) {
	  queue_peering_event(
	    PGPeeringEventRef(
	      std::make_shared<PGPeeringEvent>(
		get_osdmap_epoch(),
		get_osdmap_epoch(),
		PeeringState::DoRecovery())));
	} else if (is_backfill_unfound()) {
	  queue_peering_event(
	    PGPeeringEventRef(
	      std::make_shared<PGPeeringEvent>(
		get_osdmap_epoch(),
		get_osdmap_epoch(),
		PeeringState::RequestBackfill())));
	} else {
	  queue_recovery();
	}

	stringstream ss;
	ss << "pg has " << num_unfound
	   << " objects unfound and apparently lost marking";
	string rs = ss.str();
	dout(0) << "do_command r=" << 0 << " " << rs << dendl;
	osd->clog->info() << rs;
	bufferlist empty;
	on_finish(0, rs, empty);
      }),
    OpRequestRef());
}
12411
// PG-split hook: PrimaryLogPG has no per-child state to hand over, but it
// must not be split while replicated ops are still in flight.
void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
{
  ceph_assert(repop_queue.empty());
}
12416
12417/*
12418 * pg status change notification
12419 */
12420
// Abort every in-flight repop.  If `requeue` is set, the originating client
// ops (and any duplicate ops parked in waiting_for_ondisk at the same
// version) are collected in order and requeued; otherwise they are dropped.
void PrimaryLogPG::apply_and_flush_repops(bool requeue)
{
  list<OpRequestRef> rq;

  // apply all repops
  while (!repop_queue.empty()) {
    RepGather *repop = repop_queue.front();
    repop_queue.pop_front();
    dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
    repop->rep_aborted = true;
    // drop completion callbacks so they never fire for an aborted repop
    repop->on_committed.clear();
    repop->on_success.clear();

    if (requeue) {
      if (repop->op) {
	dout(10) << " requeuing " << *repop->op->get_req() << dendl;
	rq.push_back(repop->op);
	repop->op = OpRequestRef();   // release our reference
      }

      // also requeue any dups, interleaved into position
      auto p = waiting_for_ondisk.find(repop->v);
      if (p != waiting_for_ondisk.end()) {
	dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
	for (auto& i : p->second) {
	  rq.push_back(std::get<0>(i));
	}
	waiting_for_ondisk.erase(p);
      }
    }

    remove_repop(repop);
  }

  ceph_assert(repop_queue.empty());

  if (requeue) {
    requeue_ops(rq);
    if (!waiting_for_ondisk.empty()) {
      // every ondisk waiter should have been matched to a repop above;
      // anything left indicates a bookkeeping bug — log then assert
      for (auto& i : waiting_for_ondisk) {
	for (auto& j : i.second) {
	  derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
	       << " waiting on " << i.first << dendl;
	}
      }
      ceph_assert(waiting_for_ondisk.empty());
    }
  }

  waiting_for_ondisk.clear();
}
12472
12473void PrimaryLogPG::on_flushed()
12474{
9f95a23c 12475 requeue_ops(waiting_for_flush);
7c673cae
FG
12476 if (!is_peered() || !is_primary()) {
12477 pair<hobject_t, ObjectContextRef> i;
12478 while (object_contexts.get_next(i.first, &i)) {
11fdf7f2 12479 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
7c673cae 12480 }
11fdf7f2 12481 ceph_assert(object_contexts.empty());
7c673cae 12482 }
7c673cae
FG
12483}
12484
// PG deletion hook: tear down all in-memory state, then schedule the next
// chunk of on-disk removal once transaction `t` commits.
void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
{
  dout(10) << __func__ << dendl;

  on_shutdown();

  t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
}
12493
c07f9fc5
FG
12494void PrimaryLogPG::clear_async_reads()
12495{
12496 dout(10) << __func__ << dendl;
12497 for(auto& i : in_progress_async_reads) {
12498 dout(10) << "clear ctx: "
12499 << "OpRequestRef " << i.first
12500 << " OpContext " << i.second
12501 << dendl;
12502 close_op_ctx(i.second);
12503 }
12504}
12505
// Drop every cached ObjectContext for this PG.
void PrimaryLogPG::clear_cache()
{
  object_contexts.clear();
}
7c673cae 12510
11fdf7f2
TL
// Tear down all in-memory PG state without requeueing anything; used on OSD
// shutdown and PG removal.  The order of the steps below matters — see the
// inline notes.
void PrimaryLogPG::on_shutdown()
{
  dout(10) << __func__ << dendl;

  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  m_scrubber->scrub_clear_state();

  m_scrubber->unreg_next_scrub();

  // cancel all outstanding objecter-backed ops (copy/flush/proxy/manifest)
  // without requeueing (the `false` argument)
  vector<ceph_tid_t> tids;
  cancel_copy_ops(false, &tids);
  cancel_flush_ops(false, &tids);
  cancel_proxy_ops(false, &tids);
  cancel_manifest_ops(false, &tids);
  osd->objecter->op_cancel(tids, -ECANCELED);

  apply_and_flush_repops(false);
  cancel_log_updates();
  // we must remove PGRefs, so do this prior to release_backoffs() callers
  clear_backoffs();
  // clean up snap trim references
  snap_trimmer_machine.process_event(Reset());

  pgbackend->on_change();

  context_registry_on_change();
  object_contexts.clear();

  clear_async_reads();

  osd->remote_reserver.cancel_reservation(info.pgid);
  osd->local_reserver.cancel_reservation(info.pgid);

  clear_primary_state();
  cancel_recovery();

  if (is_primary()) {
    osd->clear_ready_to_merge(this);
  }
}
12555
// Called once activation has fully completed: release (or re-park) waiting
// ops, queue the next peering-state event based on what still needs doing,
// and initialize backfill bookkeeping, hit sets, and the tiering agent.
void PrimaryLogPG::on_activate_complete()
{
  check_local();
  // waiters
  if (!recovery_state.needs_flush()) {
    requeue_ops(waiting_for_peered);
  } else if (!waiting_for_peered.empty()) {
    // a flush is still pending; ops must wait for it rather than for peering
    dout(10) << __func__ << " flushes in progress, moving "
	     << waiting_for_peered.size()
	     << " items to waiting_for_flush"
	     << dendl;
    ceph_assert(waiting_for_flush.empty());
    waiting_for_flush.swap(waiting_for_peered);
  }


  // all clean?
  if (needs_recovery()) {
    dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
    queue_peering_event(
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PeeringState::DoRecovery())));
  } else if (needs_backfill()) {
    dout(10) << "activate queueing backfill" << dendl;
    queue_peering_event(
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PeeringState::RequestBackfill())));
  } else {
    dout(10) << "activate all replicas clean, no recovery" << dendl;
    queue_peering_event(
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PeeringState::AllReplicasRecovered())));
  }

  publish_stats_to_osd();

  if (get_backfill_targets().size()) {
    // start backfill scanning from the least-advanced target's position
    last_backfill_started = recovery_state.earliest_backfill();
    new_backfill = true;
    ceph_assert(!last_backfill_started.is_max());
    dout(5) << __func__ << ": bft=" << get_backfill_targets()
	    << " from " << last_backfill_started << dendl;
    for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
	 i != get_backfill_targets().end();
	 ++i) {
      dout(5) << "target shard " << *i
	      << " from " << recovery_state.get_peer_info(*i).last_backfill
	      << dendl;
    }
  }

  hit_set_setup();
  agent_setup();
}
12619
// Interval-change hook: cancel or requeue everything tied to the old
// interval.  Requeue order is deliberate (reverse of the order ops will be
// re-examined in) so clients see a consistent replay.
void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
{
  dout(10) << __func__ << dendl;

  if (hit_set && hit_set->insert_count() == 0) {
    dout(20) << " discarding empty hit_set" << dendl;
    hit_set_clear();
  }

  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  // requeue everything in the reverse order they should be
  // reexamined.
  requeue_ops(waiting_for_peered);
  requeue_ops(waiting_for_flush);
  requeue_ops(waiting_for_active);
  requeue_ops(waiting_for_readable);

  // cancel objecter-backed ops; requeue only if we remain primary
  vector<ceph_tid_t> tids;
  cancel_copy_ops(is_primary(), &tids);
  cancel_flush_ops(is_primary(), &tids);
  cancel_proxy_ops(is_primary(), &tids);
  cancel_manifest_ops(is_primary(), &tids);
  osd->objecter->op_cancel(tids, -ECANCELED);

  // requeue object waiters
  for (auto& p : waiting_for_unreadable_object) {
    release_backoffs(p.first);
  }
  if (is_primary()) {
    requeue_object_waiters(waiting_for_unreadable_object);
  } else {
    waiting_for_unreadable_object.clear();
  }
  for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
       p != waiting_for_degraded_object.end();
       waiting_for_degraded_object.erase(p++)) {
    release_backoffs(p->first);
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
    finish_degraded_object(p->first);
  }

  // requeues waiting_for_scrub
  m_scrubber->scrub_clear_state();

  for (auto p = waiting_for_blocked_object.begin();
       p != waiting_for_blocked_object.end();
       waiting_for_blocked_object.erase(p++)) {
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
  }
  for (auto i = callbacks_for_degraded_object.begin();
       i != callbacks_for_degraded_object.end();
    ) {
    // finish_degraded_object erases the entry, hence the (i++) idiom
    finish_degraded_object((i++)->first);
  }
  ceph_assert(callbacks_for_degraded_object.empty());

  if (is_primary()) {
    requeue_ops(waiting_for_cache_not_full);
  } else {
    waiting_for_cache_not_full.clear();
  }
  objects_blocked_on_cache_full.clear();

  for (list<pair<OpRequestRef, OpContext*> >::iterator i =
	 in_progress_async_reads.begin();
       i != in_progress_async_reads.end();
       in_progress_async_reads.erase(i++)) {
    close_op_ctx(i->second);
    if (is_primary())
      requeue_op(i->first);
  }

  // this will requeue ops we were working on but didn't finish, and
  // any dups
  apply_and_flush_repops(is_primary());
  cancel_log_updates();

  // do this *after* apply_and_flush_repops so that we catch any newly
  // registered watches.
  context_registry_on_change();

  pgbackend->on_change_cleanup(&t);
  m_scrubber->cleanup_store(&t);
  pgbackend->on_change();

  // clear snap_trimmer state
  snap_trimmer_machine.process_event(Reset());

  debug_op_order.clear();
  unstable_stats.clear();

  // we don't want to cache object_contexts through the interval change
  // NOTE: we actually assert that all currently live references are dead
  // by the time the flush for the next interval completes.
  object_contexts.clear();

  // should have been cleared above by finishing all of the degraded objects
  ceph_assert(objects_blocked_on_degraded_snap.empty());
}
12729
9f95a23c 12730void PrimaryLogPG::plpg_on_role_change()
7c673cae 12731{
11fdf7f2 12732 dout(10) << __func__ << dendl;
7c673cae
FG
12733 if (get_role() != 0 && hit_set) {
12734 dout(10) << " clearing hit set" << dendl;
12735 hit_set_clear();
12736 }
12737}
12738
9f95a23c 12739void PrimaryLogPG::plpg_on_pool_change()
7c673cae
FG
12740{
12741 dout(10) << __func__ << dendl;
12742 // requeue cache full waiters just in case the cache_mode is
12743 // changing away from writeback mode. note that if we are not
12744 // active the normal requeuing machinery is sufficient (and properly
12745 // ordered).
12746 if (is_active() &&
12747 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12748 !waiting_for_cache_not_full.empty()) {
12749 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12750 << dendl;
12751 requeue_ops(waiting_for_cache_not_full);
12752 objects_blocked_on_cache_full.clear();
12753 }
12754 hit_set_setup();
12755 agent_setup();
12756}
12757
// clear state. called on recovery completion AND cancellation.
void PrimaryLogPG::_clear_recovery_state()
{
#ifdef DEBUG_RECOVERY_OIDS
  recovering_oids.clear();
#endif
  dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;

  last_backfill_started = hobject_t();
  set<hobject_t>::iterator i = backfills_in_flight.begin();
  while (i != backfills_in_flight.end()) {
    backfills_in_flight.erase(i++);
  }

  // drop recovery read locks on every in-flight object and requeue any ops
  // that were blocked behind them.  NOTE: blocked_ops is reused across
  // iterations; requeue_ops is expected to drain it each time — confirm
  // before restructuring this loop.
  list<OpRequestRef> blocked_ops;
  for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
       i != recovering.end();
       recovering.erase(i++)) {
    if (i->second) {
      i->second->drop_recovery_read(&blocked_ops);
      requeue_ops(blocked_ops);
    }
  }
  ceph_assert(backfills_in_flight.empty());
  pending_backfill_updates.clear();
  ceph_assert(recovering.empty());
  pgbackend->clear_recovery_state();
}
12786
12787void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12788{
12789 dout(20) << __func__ << ": " << soid << dendl;
11fdf7f2 12790 ceph_assert(recovering.count(soid));
7c673cae
FG
12791 ObjectContextRef obc = recovering[soid];
12792 if (obc) {
12793 list<OpRequestRef> blocked_ops;
12794 obc->drop_recovery_read(&blocked_ops);
12795 requeue_ops(blocked_ops);
12796 }
12797 recovering.erase(soid);
12798 finish_recovery_op(soid);
12799 release_backoffs(soid);
12800 if (waiting_for_degraded_object.count(soid)) {
12801 dout(20) << " kicking degraded waiters on " << soid << dendl;
12802 requeue_ops(waiting_for_degraded_object[soid]);
12803 waiting_for_degraded_object.erase(soid);
12804 }
12805 if (waiting_for_unreadable_object.count(soid)) {
12806 dout(20) << " kicking unreadable waiters on " << soid << dendl;
12807 requeue_ops(waiting_for_unreadable_object[soid]);
12808 waiting_for_unreadable_object.erase(soid);
12809 }
12810 if (is_missing_object(soid))
9f95a23c 12811 recovery_state.set_last_requested(0);
7c673cae
FG
12812 finish_degraded_object(soid);
12813}
12814
// Delegate to the backend: prune recovery sources that are no longer
// usable under the new map.
void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
{
  pgbackend->check_recovery_sources(osdmap);
}
12819
7c673cae
FG
// Drive one batch (up to `max` ops) of recovery/backfill work.  Writes the
// number of ops actually started to *ops_started.  Returns true only when
// recovery cannot make further progress because objects remain unfound;
// otherwise false (including when recovery fully completes).
bool PrimaryLogPG::start_recovery_ops(
  uint64_t max,
  ThreadPool::TPHandle &handle,
  uint64_t *ops_started)
{
  uint64_t& started = *ops_started;
  started = 0;
  bool work_in_progress = false;
  bool recovery_started = false;
  ceph_assert(is_primary());
  ceph_assert(is_peered());
  ceph_assert(!recovery_state.is_deleting());

  ceph_assert(recovery_queued);
  recovery_queued = false;

  if (!state_test(PG_STATE_RECOVERING) &&
      !state_test(PG_STATE_BACKFILLING)) {
    /* TODO: I think this case is broken and will make do_recovery()
     * unhappy since we're returning false */
    dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
    return have_unfound();
  }

  const auto &missing = recovery_state.get_pg_log().get_missing();

  uint64_t num_unfound = get_num_unfound();

  if (!recovery_state.have_missing()) {
    recovery_state.local_recovery_complete();
  }

  if (!missing.have_missing() || // Primary does not have missing
      // or all of the missing objects are unfound.
      recovery_state.all_missing_unfound()) {
    // Recover the replicas.
    started = recover_replicas(max, handle, &recovery_started);
  }
  if (!started) {
    // We still have missing objects that we should grab from replicas.
    started += recover_primary(max, handle);
  }
  if (!started && num_unfound != get_num_unfound()) {
    // second chance to recovery replicas
    started = recover_replicas(max, handle, &recovery_started);
  }

  if (started || recovery_started)
    work_in_progress = true;

  bool deferred_backfill = false;
  // backfill only runs once object recovery is fully quiesced
  if (recovering.empty() &&
      state_test(PG_STATE_BACKFILLING) &&
      !get_backfill_targets().empty() && started < max &&
      missing.num_missing() == 0 &&
      waiting_on_backfill.empty()) {
    if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
      dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
      deferred_backfill = true;
    } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
	       !is_degraded()) {
      dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
      deferred_backfill = true;
    } else if (!recovery_state.is_backfill_reserved()) {
      /* DNMNOTE I think this branch is dead */
      dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
      if (!backfill_reserving) {
	dout(10) << "queueing RequestBackfill" << dendl;
	backfill_reserving = true;
	queue_peering_event(
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      get_osdmap_epoch(),
	      get_osdmap_epoch(),
	      PeeringState::RequestBackfill())));
      }
      deferred_backfill = true;
    } else {
      started += recover_backfill(max - started, handle, &work_in_progress);
    }
  }

  dout(10) << " started " << started << dendl;
  osd->logger->inc(l_osd_rop, started);

  if (!recovering.empty() ||
      work_in_progress || recovery_ops_active > 0 || deferred_backfill)
    return !work_in_progress && have_unfound();

  // nothing in flight: decide whether we are done or stuck
  ceph_assert(recovering.empty());
  ceph_assert(recovery_ops_active == 0);

  dout(10) << __func__ << " needs_recovery: "
	   << recovery_state.get_missing_loc().get_needs_recovery()
	   << dendl;
  dout(10) << __func__ << " missing_loc: "
	   << recovery_state.get_missing_loc().get_missing_locs()
	   << dendl;
  int unfound = get_num_unfound();
  if (unfound) {
    dout(10) << " still have " << unfound << " unfound" << dendl;
    return true;
  }

  if (missing.num_missing() > 0) {
    // this shouldn't happen!
    osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
		       << missing.num_missing() << ": " << missing.get_items();
    return false;
  }

  if (needs_recovery()) {
    // this shouldn't happen!
    // We already checked num_missing() so we must have missing replicas
    osd->clog->error() << info.pgid
		       << " Unexpected Error: recovery ending with missing replicas";
    return false;
  }

  // recovery (or backfill) is complete: clear state flags and notify the
  // peering state machine
  if (state_test(PG_STATE_RECOVERING)) {
    state_clear(PG_STATE_RECOVERING);
    state_clear(PG_STATE_FORCED_RECOVERY);
    if (needs_backfill()) {
      dout(10) << "recovery done, queuing backfill" << dendl;
      queue_peering_event(
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
	    PeeringState::RequestBackfill())));
    } else {
      dout(10) << "recovery done, no backfill" << dendl;
      state_clear(PG_STATE_FORCED_BACKFILL);
      queue_peering_event(
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
	    PeeringState::AllReplicasRecovered())));
    }
  } else { // backfilling
    state_clear(PG_STATE_BACKFILLING);
    state_clear(PG_STATE_FORCED_BACKFILL);
    state_clear(PG_STATE_FORCED_RECOVERY);
    dout(10) << "recovery done, backfill done" << dendl;
    queue_peering_event(
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PeeringState::Backfilled())));
  }

  return false;
}
12975
/**
 * do one recovery op.
 * return true if done, false if nothing left to do.
 *
 * (NOTE(review): despite the comment above, the function actually returns
 * the number of recovery ops started — callers treat it as a count.)
 *
 * Walks the primary's missing set in version order starting from
 * last_requested, handling LOST_REVERT log entries specially, and pulls up
 * to `max` objects via the backend.
 */
uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
{
  ceph_assert(is_primary());

  const auto &missing = recovery_state.get_pg_log().get_missing();

  dout(10) << __func__ << " recovering " << recovering.size()
	   << " in pg,"
	   << " missing " << missing << dendl;

  dout(25) << __func__ << " " << missing.get_items() << dendl;

  // look at log!
  pg_log_entry_t *latest = 0;
  unsigned started = 0;
  int skipped = 0;

  PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
  // resume the scan where the previous call left off
  map<version_t, hobject_t>::const_iterator p =
    missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
  while (p != missing.get_rmissing().end()) {
    handle.reset_tp_timeout();
    hobject_t soid;
    version_t v = p->first;

    auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
    if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
      latest = it_objects->second;
      ceph_assert(latest->is_update() || latest->is_delete());
      soid = latest->soid;
    } else {
      latest = 0;
      soid = p->second;
    }
    const pg_missing_item& item = missing.get_items().find(p->second)->second;
    ++p;

    hobject_t head = soid.get_head();

    eversion_t need = item.need;

    dout(10) << __func__ << " "
	     << soid << " " << item.need
	     << (missing.is_missing(soid) ? " (missing)":"")
	     << (missing.is_missing(head) ? " (missing head)":"")
	     << (recovering.count(soid) ? " (recovering)":"")
	     << (recovering.count(head) ? " (recovering head)":"")
	     << dendl;

    if (latest) {
      switch (latest->op) {
      case pg_log_entry_t::CLONE:
	/*
	 * Handling for this special case removed for now, until we
	 * can correctly construct an accurate SnapSet from the old
	 * one.
	 */
	break;

      case pg_log_entry_t::LOST_REVERT:
	{
	  if (item.have == latest->reverting_to) {
	    // we already have the version being reverted to; the revert is
	    // purely a local metadata rewrite of the object_info version
	    ObjectContextRef obc = get_object_context(soid, true);

	    if (obc->obs.oi.version == latest->version) {
	      // I'm already reverting
	      dout(10) << " already reverting " << soid << dendl;
	    } else {
	      dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
	      obc->obs.oi.version = latest->version;

	      ObjectStore::Transaction t;
	      bufferlist b2;
	      obc->obs.oi.encode(
		b2,
		get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
	      ceph_assert(!pool.info.require_rollback());
	      t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);

	      recovery_state.recover_got(
		soid,
		latest->version,
		false,
		t);

	      ++active_pushes;

	      t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
	      t.register_on_commit(new C_OSD_CommittedPushedObject(
				     this,
				     get_osdmap_epoch(),
				     info.last_complete));
	      osd->store->queue_transaction(ch, std::move(t));
	      continue;
	    }
	  } else {
	    /*
	     * Pull the old version of the object. Update missing_loc here to have the location
	     * of the version we want.
	     *
	     * This doesn't use the usual missing_loc paths, but that's okay:
	     * - if we have it locally, we hit the case above, and go from there.
	     * - if we don't, we always pass through this case during recovery and set up the location
	     *   properly.
	     * - this way we don't need to mangle the missing code to be general about needing an old
	     *   version...
	     */
	    eversion_t alternate_need = latest->reverting_to;
	    dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;

	    set<pg_shard_t> good_peers;
	    for (auto p = recovery_state.get_peer_missing().begin();
		 p != recovery_state.get_peer_missing().end();
		 ++p) {
	      if (p->second.is_missing(soid, need) &&
		  p->second.get_items().at(soid).have == alternate_need) {
		good_peers.insert(p->first);
	      }
	    }
	    recovery_state.set_revert_with_targets(
	      soid,
	      good_peers);
	    dout(10) << " will pull " << alternate_need << " or " << need
		     << " from one of "
		     << recovery_state.get_missing_loc().get_locations(soid)
		     << dendl;
	  }
	}
	break;
      }
    }

    if (!recovering.count(soid)) {
      if (recovering.count(head)) {
	// head object is being recovered; clones must wait for it
	++skipped;
      } else {
	int r = recover_missing(
	  soid, need, get_recovery_op_priority(), h);
	switch (r) {
	case PULL_YES:
	  ++started;
	  break;
	case PULL_HEAD:
	  // intentional fallthrough: a head pull counts as started AND
	  // skips the clone for this pass
	  ++started;
	case PULL_NONE:
	  ++skipped;
	  break;
	default:
	  ceph_abort();
	}
	if (started >= max)
	  break;
      }
    }

    // only advance last_requested if we haven't skipped anything
    if (!skipped)
      recovery_state.set_last_requested(v);
  }

  pgbackend->run_recovery_op(h, get_recovery_op_priority());
  return started;
}
13143
224ce89b
WB
13144bool PrimaryLogPG::primary_error(
13145 const hobject_t& soid, eversion_t v)
13146{
9f95a23c
TL
13147 recovery_state.force_object_missing(pg_whoami, soid, v);
13148 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
224ce89b 13149 if (uhoh)
9f95a23c
TL
13150 osd->clog->error() << info.pgid << " missing primary copy of "
13151 << soid << ", unfound";
224ce89b 13152 else
9f95a23c
TL
13153 osd->clog->error() << info.pgid << " missing primary copy of "
13154 << soid
13155 << ", will try copies on "
13156 << recovery_state.get_missing_loc().get_locations(soid);
224ce89b
WB
13157 return uhoh;
13158}
13159
c07f9fc5
FG
13160int PrimaryLogPG::prep_object_replica_deletes(
13161 const hobject_t& soid, eversion_t v,
11fdf7f2
TL
13162 PGBackend::RecoveryHandle *h,
13163 bool *work_started)
c07f9fc5 13164{
11fdf7f2 13165 ceph_assert(is_primary());
c07f9fc5
FG
13166 dout(10) << __func__ << ": on " << soid << dendl;
13167
11fdf7f2
TL
13168 ObjectContextRef obc = get_object_context(soid, false);
13169 if (obc) {
13170 if (!obc->get_recovery_read()) {
13171 dout(20) << "replica delete delayed on " << soid
13172 << "; could not get rw_manager lock" << dendl;
13173 *work_started = true;
13174 return 0;
13175 } else {
13176 dout(20) << "replica delete got recovery read lock on " << soid
13177 << dendl;
13178 }
13179 }
13180
c07f9fc5 13181 start_recovery_op(soid);
11fdf7f2
TL
13182 ceph_assert(!recovering.count(soid));
13183 if (!obc)
13184 recovering.insert(make_pair(soid, ObjectContextRef()));
13185 else
13186 recovering.insert(make_pair(soid, obc));
c07f9fc5
FG
13187
13188 pgbackend->recover_delete_object(soid, v, h);
13189 return 1;
13190}
13191
7c673cae
FG
13192int PrimaryLogPG::prep_object_replica_pushes(
13193 const hobject_t& soid, eversion_t v,
11fdf7f2
TL
13194 PGBackend::RecoveryHandle *h,
13195 bool *work_started)
7c673cae 13196{
11fdf7f2 13197 ceph_assert(is_primary());
7c673cae
FG
13198 dout(10) << __func__ << ": on " << soid << dendl;
13199
9f95a23c
TL
13200 if (soid.snap && soid.snap < CEPH_NOSNAP) {
13201 // do we have the head and/or snapdir?
13202 hobject_t head = soid.get_head();
13203 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
13204 if (recovering.count(head)) {
13205 dout(10) << " missing but already recovering head " << head << dendl;
13206 return 0;
13207 } else {
13208 int r = recover_missing(
13209 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
13210 get_recovery_op_priority(), h);
13211 if (r != PULL_NONE)
13212 return 1;
13213 return 0;
13214 }
13215 }
13216 }
13217
7c673cae
FG
13218 // NOTE: we know we will get a valid oloc off of disk here.
13219 ObjectContextRef obc = get_object_context(soid, false);
13220 if (!obc) {
224ce89b 13221 primary_error(soid, v);
7c673cae
FG
13222 return 0;
13223 }
13224
13225 if (!obc->get_recovery_read()) {
13226 dout(20) << "recovery delayed on " << soid
13227 << "; could not get rw_manager lock" << dendl;
11fdf7f2 13228 *work_started = true;
7c673cae
FG
13229 return 0;
13230 } else {
13231 dout(20) << "recovery got recovery read lock on " << soid
13232 << dendl;
13233 }
13234
13235 start_recovery_op(soid);
11fdf7f2 13236 ceph_assert(!recovering.count(soid));
7c673cae
FG
13237 recovering.insert(make_pair(soid, obc));
13238
224ce89b 13239 int r = pgbackend->recover_object(
7c673cae
FG
13240 soid,
13241 v,
13242 ObjectContextRef(),
13243 obc, // has snapset context
13244 h);
224ce89b
WB
13245 if (r < 0) {
13246 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
9f95a23c 13247 on_failed_pull({ pg_whoami }, soid, v);
224ce89b
WB
13248 return 0;
13249 }
7c673cae
FG
13250 return 1;
13251}
13252
11fdf7f2
TL
13253uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
13254 bool *work_started)
7c673cae
FG
13255{
13256 dout(10) << __func__ << "(" << max << ")" << dendl;
13257 uint64_t started = 0;
13258
13259 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13260
13261 // this is FAR from an optimal recovery order. pretty lame, really.
9f95a23c 13262 ceph_assert(!get_acting_recovery_backfill().empty());
11fdf7f2
TL
13263 // choose replicas to recover, replica has the shortest missing list first
13264 // so we can bring it back to normal ASAP
13265 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
13266 async_by_num_missing;
9f95a23c
TL
13267 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
13268 for (auto &p: get_acting_recovery_backfill()) {
11fdf7f2
TL
13269 if (p == get_primary()) {
13270 continue;
13271 }
9f95a23c
TL
13272 auto pm = recovery_state.get_peer_missing().find(p);
13273 ceph_assert(pm != recovery_state.get_peer_missing().end());
11fdf7f2
TL
13274 auto nm = pm->second.num_missing();
13275 if (nm != 0) {
9f95a23c 13276 if (is_async_recovery_target(p)) {
11fdf7f2
TL
13277 async_by_num_missing.push_back(make_pair(nm, p));
13278 } else {
13279 replicas_by_num_missing.push_back(make_pair(nm, p));
13280 }
13281 }
13282 }
13283 // sort by number of missing objects, in ascending order.
13284 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
13285 const std::pair<unsigned int, pg_shard_t> &rhs) {
13286 return lhs.first < rhs.first;
13287 };
13288 // acting goes first
13289 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
13290 // then async_recovery_targets
13291 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
13292 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
13293 async_by_num_missing.begin(), async_by_num_missing.end());
13294 for (auto &replica: replicas_by_num_missing) {
13295 pg_shard_t &peer = replica.second;
13296 ceph_assert(peer != get_primary());
9f95a23c
TL
13297 auto pm = recovery_state.get_peer_missing().find(peer);
13298 ceph_assert(pm != recovery_state.get_peer_missing().end());
7c673cae
FG
13299 size_t m_sz = pm->second.num_missing();
13300
13301 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
13302 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
13303
13304 // oldest first!
13305 const pg_missing_t &m(pm->second);
13306 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
13307 p != m.get_rmissing().end() && started < max;
13308 ++p) {
13309 handle.reset_tp_timeout();
13310 const hobject_t soid(p->second);
13311
9f95a23c 13312 if (recovery_state.get_missing_loc().is_unfound(soid)) {
224ce89b
WB
13313 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
13314 continue;
13315 }
13316
9f95a23c
TL
13317 const pg_info_t &pi = recovery_state.get_peer_info(peer);
13318 if (soid > pi.last_backfill) {
7c673cae 13319 if (!recovering.count(soid)) {
9f95a23c
TL
13320 derr << __func__ << ": object " << soid << " last_backfill "
13321 << pi.last_backfill << dendl;
7c673cae
FG
13322 derr << __func__ << ": object added to missing set for backfill, but "
13323 << "is not in recovering, error!" << dendl;
13324 ceph_abort();
13325 }
13326 continue;
13327 }
13328
13329 if (recovering.count(soid)) {
13330 dout(10) << __func__ << ": already recovering " << soid << dendl;
13331 continue;
13332 }
13333
9f95a23c 13334 if (recovery_state.get_missing_loc().is_deleted(soid)) {
c07f9fc5
FG
13335 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
13336 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11fdf7f2 13337 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
c07f9fc5
FG
13338 continue;
13339 }
13340
9f95a23c
TL
13341 if (soid.is_snap() &&
13342 recovery_state.get_pg_log().get_missing().is_missing(
13343 soid.get_head())) {
7c673cae
FG
13344 dout(10) << __func__ << ": " << soid.get_head()
13345 << " still missing on primary" << dendl;
13346 continue;
13347 }
13348
9f95a23c 13349 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
7c673cae
FG
13350 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
13351 continue;
13352 }
13353
13354 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
13355 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11fdf7f2 13356 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
7c673cae
FG
13357 }
13358 }
13359
13360 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13361 return started;
13362}
13363
13364hobject_t PrimaryLogPG::earliest_peer_backfill() const
13365{
13366 hobject_t e = hobject_t::get_max();
9f95a23c
TL
13367 for (const pg_shard_t& peer : get_backfill_targets()) {
13368 const auto iter = peer_backfill_info.find(peer);
11fdf7f2 13369 ceph_assert(iter != peer_backfill_info.end());
9f95a23c 13370 e = std::min(e, iter->second.begin);
7c673cae
FG
13371 }
13372 return e;
13373}
13374
13375bool PrimaryLogPG::all_peer_done() const
13376{
13377 // Primary hasn't got any more objects
11fdf7f2 13378 ceph_assert(backfill_info.empty());
7c673cae 13379
9f95a23c
TL
13380 for (const pg_shard_t& bt : get_backfill_targets()) {
13381 const auto piter = peer_backfill_info.find(bt);
11fdf7f2 13382 ceph_assert(piter != peer_backfill_info.end());
7c673cae
FG
13383 const BackfillInterval& pbi = piter->second;
13384 // See if peer has more to process
13385 if (!pbi.extends_to_end() || !pbi.empty())
13386 return false;
13387 }
13388 return true;
13389}
13390
13391/**
13392 * recover_backfill
13393 *
13394 * Invariants:
13395 *
13396 * backfilled: fully pushed to replica or present in replica's missing set (both
13397 * our copy and theirs).
13398 *
13399 * All objects on a backfill_target in
13400 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13401 * objects have been actually deleted and all logically-valid objects are replicated.
13402 * There may be PG objects in this interval yet to be backfilled.
13403 *
13404 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13405 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13406 *
11fdf7f2 13407 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
7c673cae
FG
13408 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13409 * interval remain on the backfill target.
13410 *
13411 * For a backfill target, all objects <= peer_info[target].last_backfill
13412 * have been backfilled to target
13413 *
13414 * There *MAY* be missing/outdated objects between last_backfill_started and
11fdf7f2 13415 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
7c673cae
FG
13416 * io created objects since the last scan. For this reason, we call
13417 * update_range() again before continuing backfill.
13418 */
13419uint64_t PrimaryLogPG::recover_backfill(
13420 uint64_t max,
13421 ThreadPool::TPHandle &handle, bool *work_started)
13422{
11fdf7f2 13423 dout(10) << __func__ << " (" << max << ")"
9f95a23c 13424 << " bft=" << get_backfill_targets()
7c673cae
FG
13425 << " last_backfill_started " << last_backfill_started
13426 << (new_backfill ? " new_backfill":"")
13427 << dendl;
9f95a23c 13428 ceph_assert(!get_backfill_targets().empty());
7c673cae
FG
13429
13430 // Initialize from prior backfill state
13431 if (new_backfill) {
13432 // on_activate() was called prior to getting here
f67539c2 13433 ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
7c673cae
FG
13434 new_backfill = false;
13435
13436 // initialize BackfillIntervals
9f95a23c
TL
13437 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13438 i != get_backfill_targets().end();
7c673cae 13439 ++i) {
9f95a23c
TL
13440 peer_backfill_info[*i].reset(
13441 recovery_state.get_peer_info(*i).last_backfill);
7c673cae
FG
13442 }
13443 backfill_info.reset(last_backfill_started);
13444
13445 backfills_in_flight.clear();
13446 pending_backfill_updates.clear();
13447 }
13448
9f95a23c
TL
13449 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13450 i != get_backfill_targets().end();
7c673cae
FG
13451 ++i) {
13452 dout(10) << "peer osd." << *i
9f95a23c 13453 << " info " << recovery_state.get_peer_info(*i)
7c673cae
FG
13454 << " interval " << peer_backfill_info[*i].begin
13455 << "-" << peer_backfill_info[*i].end
13456 << " " << peer_backfill_info[*i].objects.size() << " objects"
13457 << dendl;
13458 }
13459
13460 // update our local interval to cope with recent changes
13461 backfill_info.begin = last_backfill_started;
13462 update_range(&backfill_info, handle);
13463
13464 unsigned ops = 0;
7c673cae
FG
13465 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13466 set<hobject_t> add_to_stat;
13467
9f95a23c
TL
13468 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13469 i != get_backfill_targets().end();
7c673cae
FG
13470 ++i) {
13471 peer_backfill_info[*i].trim_to(
9f95a23c
TL
13472 std::max(
13473 recovery_state.get_peer_info(*i).last_backfill,
13474 last_backfill_started));
7c673cae
FG
13475 }
13476 backfill_info.trim_to(last_backfill_started);
13477
224ce89b 13478 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
7c673cae
FG
13479 while (ops < max) {
13480 if (backfill_info.begin <= earliest_peer_backfill() &&
13481 !backfill_info.extends_to_end() && backfill_info.empty()) {
13482 hobject_t next = backfill_info.end;
13483 backfill_info.reset(next);
13484 backfill_info.end = hobject_t::get_max();
13485 update_range(&backfill_info, handle);
13486 backfill_info.trim();
13487 }
13488
13489 dout(20) << " my backfill interval " << backfill_info << dendl;
13490
13491 bool sent_scan = false;
9f95a23c
TL
13492 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13493 i != get_backfill_targets().end();
7c673cae
FG
13494 ++i) {
13495 pg_shard_t bt = *i;
13496 BackfillInterval& pbi = peer_backfill_info[bt];
13497
13498 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13499 if (pbi.begin <= backfill_info.begin &&
13500 !pbi.extends_to_end() && pbi.empty()) {
13501 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11fdf7f2 13502 epoch_t e = get_osdmap_epoch();
7c673cae 13503 MOSDPGScan *m = new MOSDPGScan(
9f95a23c 13504 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
7c673cae
FG
13505 spg_t(info.pgid.pgid, bt.shard),
13506 pbi.end, hobject_t());
11fdf7f2
TL
13507 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13508 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
7c673cae
FG
13509 waiting_on_backfill.insert(bt);
13510 sent_scan = true;
13511 }
13512 }
13513
13514 // Count simultaneous scans as a single op and let those complete
13515 if (sent_scan) {
13516 ops++;
13517 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13518 break;
13519 }
13520
13521 if (backfill_info.empty() && all_peer_done()) {
13522 dout(10) << " reached end for both local and all peers" << dendl;
13523 break;
13524 }
13525
13526 // Get object within set of peers to operate on and
13527 // the set of targets for which that object applies.
13528 hobject_t check = earliest_peer_backfill();
13529
13530 if (check < backfill_info.begin) {
13531
13532 set<pg_shard_t> check_targets;
9f95a23c
TL
13533 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13534 i != get_backfill_targets().end();
7c673cae
FG
13535 ++i) {
13536 pg_shard_t bt = *i;
13537 BackfillInterval& pbi = peer_backfill_info[bt];
13538 if (pbi.begin == check)
13539 check_targets.insert(bt);
13540 }
11fdf7f2 13541 ceph_assert(!check_targets.empty());
7c673cae
FG
13542
13543 dout(20) << " BACKFILL removing " << check
13544 << " from peers " << check_targets << dendl;
13545 for (set<pg_shard_t>::iterator i = check_targets.begin();
13546 i != check_targets.end();
13547 ++i) {
13548 pg_shard_t bt = *i;
13549 BackfillInterval& pbi = peer_backfill_info[bt];
11fdf7f2 13550 ceph_assert(pbi.begin == check);
7c673cae
FG
13551
13552 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13553 pbi.pop_front();
13554 }
13555
11fdf7f2 13556 last_backfill_started = check;
7c673cae
FG
13557
13558 // Don't increment ops here because deletions
13559 // are cheap and not replied to unlike real recovery_ops,
13560 // and we can't increment ops without requeueing ourself
13561 // for recovery.
13562 } else {
13563 eversion_t& obj_v = backfill_info.objects.begin()->second;
13564
13565 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
9f95a23c
TL
13566 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13567 i != get_backfill_targets().end();
7c673cae
FG
13568 ++i) {
13569 pg_shard_t bt = *i;
13570 BackfillInterval& pbi = peer_backfill_info[bt];
13571 // Find all check peers that have the wrong version
13572 if (check == backfill_info.begin && check == pbi.begin) {
13573 if (pbi.objects.begin()->second != obj_v) {
13574 need_ver_targs.push_back(bt);
13575 } else {
13576 keep_ver_targs.push_back(bt);
13577 }
13578 } else {
9f95a23c 13579 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
7c673cae
FG
13580
13581 // Only include peers that we've caught up to their backfill line
13582 // otherwise, they only appear to be missing this object
13583 // because their pbi.begin > backfill_info.begin.
13584 if (backfill_info.begin > pinfo.last_backfill)
13585 missing_targs.push_back(bt);
13586 else
13587 skip_targs.push_back(bt);
13588 }
13589 }
13590
13591 if (!keep_ver_targs.empty()) {
13592 // These peers have version obj_v
13593 dout(20) << " BACKFILL keeping " << check
13594 << " with ver " << obj_v
13595 << " on peers " << keep_ver_targs << dendl;
13596 //assert(!waiting_for_degraded_object.count(check));
13597 }
13598 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13599 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
11fdf7f2 13600 ceph_assert(obc);
7c673cae
FG
13601 if (obc->get_recovery_read()) {
13602 if (!need_ver_targs.empty()) {
13603 dout(20) << " BACKFILL replacing " << check
13604 << " with ver " << obj_v
13605 << " to peers " << need_ver_targs << dendl;
13606 }
13607 if (!missing_targs.empty()) {
13608 dout(20) << " BACKFILL pushing " << backfill_info.begin
13609 << " with ver " << obj_v
13610 << " to peers " << missing_targs << dendl;
13611 }
13612 vector<pg_shard_t> all_push = need_ver_targs;
13613 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13614
224ce89b
WB
13615 handle.reset_tp_timeout();
13616 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13617 if (r < 0) {
13618 *work_started = true;
13619 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13620 break;
13621 }
7c673cae
FG
13622 ops++;
13623 } else {
13624 *work_started = true;
13625 dout(20) << "backfill blocking on " << backfill_info.begin
13626 << "; could not get rw_manager lock" << dendl;
13627 break;
13628 }
13629 }
13630 dout(20) << "need_ver_targs=" << need_ver_targs
13631 << " keep_ver_targs=" << keep_ver_targs << dendl;
9f95a23c 13632 dout(20) << "backfill_targets=" << get_backfill_targets()
7c673cae
FG
13633 << " missing_targs=" << missing_targs
13634 << " skip_targs=" << skip_targs << dendl;
13635
13636 last_backfill_started = backfill_info.begin;
13637 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13638 backfill_info.pop_front();
13639 vector<pg_shard_t> check_targets = need_ver_targs;
13640 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13641 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13642 i != check_targets.end();
13643 ++i) {
13644 pg_shard_t bt = *i;
13645 BackfillInterval& pbi = peer_backfill_info[bt];
13646 pbi.pop_front();
13647 }
13648 }
13649 }
13650
7c673cae
FG
13651 for (set<hobject_t>::iterator i = add_to_stat.begin();
13652 i != add_to_stat.end();
13653 ++i) {
13654 ObjectContextRef obc = get_object_context(*i, false);
11fdf7f2 13655 ceph_assert(obc);
7c673cae
FG
13656 pg_stat_t stat;
13657 add_object_context_to_pg_stat(obc, &stat);
13658 pending_backfill_updates[*i] = stat;
13659 }
11fdf7f2
TL
13660 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13661 for (unsigned i = 0; i < to_remove.size(); ++i) {
13662 handle.reset_tp_timeout();
13663 const hobject_t& oid = to_remove[i].get<0>();
13664 eversion_t v = to_remove[i].get<1>();
13665 pg_shard_t peer = to_remove[i].get<2>();
13666 MOSDPGBackfillRemove *m;
13667 auto it = reqs.find(peer);
13668 if (it != reqs.end()) {
13669 m = it->second;
13670 } else {
13671 m = reqs[peer] = new MOSDPGBackfillRemove(
13672 spg_t(info.pgid.pgid, peer.shard),
13673 get_osdmap_epoch());
7c673cae 13674 }
11fdf7f2 13675 m->ls.push_back(make_pair(oid, v));
7c673cae 13676
11fdf7f2
TL
13677 if (oid <= last_backfill_started)
13678 pending_backfill_updates[oid]; // add empty stat!
13679 }
13680 for (auto p : reqs) {
13681 osd->send_message_osd_cluster(p.first.osd, p.second,
13682 get_osdmap_epoch());
7c673cae
FG
13683 }
13684
7c673cae
FG
13685 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13686
f67539c2
TL
13687 hobject_t backfill_pos =
13688 std::min(backfill_info.begin, earliest_peer_backfill());
7c673cae
FG
13689 dout(5) << "backfill_pos is " << backfill_pos << dendl;
13690 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13691 i != backfills_in_flight.end();
13692 ++i) {
13693 dout(20) << *i << " is still in flight" << dendl;
13694 }
13695
13696 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13697 backfill_pos : *(backfills_in_flight.begin());
f67539c2 13698 hobject_t new_last_backfill = recovery_state.earliest_backfill();
7c673cae
FG
13699 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13700 for (map<hobject_t, pg_stat_t>::iterator i =
13701 pending_backfill_updates.begin();
13702 i != pending_backfill_updates.end() &&
13703 i->first < next_backfill_to_complete;
13704 pending_backfill_updates.erase(i++)) {
13705 dout(20) << " pending_backfill_update " << i->first << dendl;
11fdf7f2 13706 ceph_assert(i->first > new_last_backfill);
f67539c2
TL
13707 // carried from a previous round – if we are here, then we had to
13708 // be requeued (by e.g. on_global_recover()) and those operations
13709 // are done.
9f95a23c
TL
13710 recovery_state.update_complete_backfill_object_stats(
13711 i->first,
13712 i->second);
7c673cae
FG
13713 new_last_backfill = i->first;
13714 }
13715 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13716
11fdf7f2 13717 ceph_assert(!pending_backfill_updates.empty() ||
7c673cae
FG
13718 new_last_backfill == last_backfill_started);
13719 if (pending_backfill_updates.empty() &&
13720 backfill_pos.is_max()) {
11fdf7f2 13721 ceph_assert(backfills_in_flight.empty());
7c673cae
FG
13722 new_last_backfill = backfill_pos;
13723 last_backfill_started = backfill_pos;
13724 }
13725 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13726
13727 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13728 // all the backfill targets. Otherwise, we will move last_backfill up on
13729 // those targets need it and send OP_BACKFILL_PROGRESS to them.
9f95a23c
TL
13730 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13731 i != get_backfill_targets().end();
7c673cae
FG
13732 ++i) {
13733 pg_shard_t bt = *i;
9f95a23c 13734 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
7c673cae
FG
13735
13736 if (new_last_backfill > pinfo.last_backfill) {
9f95a23c 13737 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
11fdf7f2 13738 epoch_t e = get_osdmap_epoch();
7c673cae
FG
13739 MOSDPGBackfill *m = NULL;
13740 if (pinfo.last_backfill.is_max()) {
13741 m = new MOSDPGBackfill(
13742 MOSDPGBackfill::OP_BACKFILL_FINISH,
13743 e,
9f95a23c 13744 get_last_peering_reset(),
7c673cae
FG
13745 spg_t(info.pgid.pgid, bt.shard));
13746 // Use default priority here, must match sub_op priority
7c673cae
FG
13747 start_recovery_op(hobject_t::get_max());
13748 } else {
13749 m = new MOSDPGBackfill(
13750 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13751 e,
9f95a23c 13752 get_last_peering_reset(),
7c673cae
FG
13753 spg_t(info.pgid.pgid, bt.shard));
13754 // Use default priority here, must match sub_op priority
13755 }
13756 m->last_backfill = pinfo.last_backfill;
13757 m->stats = pinfo.stats;
11fdf7f2 13758 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
7c673cae
FG
13759 dout(10) << " peer " << bt
13760 << " num_objects now " << pinfo.stats.stats.sum.num_objects
13761 << " / " << info.stats.stats.sum.num_objects << dendl;
13762 }
13763 }
13764
13765 if (ops)
13766 *work_started = true;
13767 return ops;
13768}
13769
224ce89b 13770int PrimaryLogPG::prep_backfill_object_push(
7c673cae
FG
13771 hobject_t oid, eversion_t v,
13772 ObjectContextRef obc,
13773 vector<pg_shard_t> peers,
13774 PGBackend::RecoveryHandle *h)
13775{
224ce89b 13776 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
11fdf7f2 13777 ceph_assert(!peers.empty());
7c673cae
FG
13778
13779 backfills_in_flight.insert(oid);
9f95a23c 13780 recovery_state.prepare_backfill_for_missing(oid, v, peers);
7c673cae 13781
11fdf7f2 13782 ceph_assert(!recovering.count(oid));
7c673cae
FG
13783
13784 start_recovery_op(oid);
13785 recovering.insert(make_pair(oid, obc));
13786
224ce89b 13787 int r = pgbackend->recover_object(
7c673cae
FG
13788 oid,
13789 v,
13790 ObjectContextRef(),
13791 obc,
13792 h);
224ce89b
WB
13793 if (r < 0) {
13794 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
9f95a23c 13795 on_failed_pull({ pg_whoami }, oid, v);
224ce89b
WB
13796 }
13797 return r;
7c673cae
FG
13798}
13799
13800void PrimaryLogPG::update_range(
13801 BackfillInterval *bi,
13802 ThreadPool::TPHandle &handle)
13803{
13804 int local_min = cct->_conf->osd_backfill_scan_min;
13805 int local_max = cct->_conf->osd_backfill_scan_max;
13806
13807 if (bi->version < info.log_tail) {
13808 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13809 << dendl;
11fdf7f2 13810 bi->version = info.last_update;
7c673cae
FG
13811 scan_range(local_min, local_max, bi, handle);
13812 }
13813
13814 if (bi->version >= projected_last_update) {
13815 dout(10) << __func__<< ": bi is current " << dendl;
11fdf7f2 13816 ceph_assert(bi->version == projected_last_update);
7c673cae 13817 } else if (bi->version >= info.log_tail) {
9f95a23c 13818 if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
7c673cae
FG
13819 /* Because we don't move log_tail on split, the log might be
13820 * empty even if log_tail != last_update. However, the only
13821 * way to get here with an empty log is if log_tail is actually
13822 * eversion_t(), because otherwise the entry which changed
13823 * last_update since the last scan would have to be present.
13824 */
11fdf7f2 13825 ceph_assert(bi->version == eversion_t());
7c673cae
FG
13826 return;
13827 }
13828
13829 dout(10) << __func__<< ": bi is old, (" << bi->version
13830 << ") can be updated with log to projected_last_update "
13831 << projected_last_update << dendl;
13832
13833 auto func = [&](const pg_log_entry_t &e) {
13834 dout(10) << __func__ << ": updating from version " << e.version
13835 << dendl;
13836 const hobject_t &soid = e.soid;
13837 if (soid >= bi->begin &&
13838 soid < bi->end) {
13839 if (e.is_update()) {
13840 dout(10) << __func__ << ": " << e.soid << " updated to version "
13841 << e.version << dendl;
13842 bi->objects.erase(e.soid);
13843 bi->objects.insert(
13844 make_pair(
13845 e.soid,
13846 e.version));
13847 } else if (e.is_delete()) {
13848 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13849 bi->objects.erase(e.soid);
13850 }
13851 }
13852 };
13853 dout(10) << "scanning pg log first" << dendl;
9f95a23c 13854 recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
7c673cae
FG
13855 dout(10) << "scanning projected log" << dendl;
13856 projected_log.scan_log_after(bi->version, func);
13857 bi->version = projected_last_update;
13858 } else {
11fdf7f2 13859 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
7c673cae
FG
13860 }
13861}
13862
13863void PrimaryLogPG::scan_range(
13864 int min, int max, BackfillInterval *bi,
13865 ThreadPool::TPHandle &handle)
13866{
11fdf7f2 13867 ceph_assert(is_locked());
7c673cae
FG
13868 dout(10) << "scan_range from " << bi->begin << dendl;
13869 bi->clear_objects();
13870
13871 vector<hobject_t> ls;
13872 ls.reserve(max);
13873 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
11fdf7f2 13874 ceph_assert(r >= 0);
7c673cae
FG
13875 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13876 dout(20) << ls << dendl;
13877
13878 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13879 handle.reset_tp_timeout();
13880 ObjectContextRef obc;
13881 if (is_primary())
13882 obc = object_contexts.lookup(*p);
13883 if (obc) {
92f5a8d4
TL
13884 if (!obc->obs.exists) {
13885 /* If the object does not exist here, it must have been removed
13886 * between the collection_list_partial and here. This can happen
13887 * for the first item in the range, which is usually last_backfill.
13888 */
13889 continue;
13890 }
7c673cae
FG
13891 bi->objects[*p] = obc->obs.oi.version;
13892 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
13893 } else {
13894 bufferlist bl;
13895 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
7c673cae 13896 /* If the object does not exist here, it must have been removed
92f5a8d4
TL
13897 * between the collection_list_partial and here. This can happen
13898 * for the first item in the range, which is usually last_backfill.
13899 */
7c673cae
FG
13900 if (r == -ENOENT)
13901 continue;
13902
11fdf7f2 13903 ceph_assert(r >= 0);
7c673cae
FG
13904 object_info_t oi(bl);
13905 bi->objects[*p] = oi.version;
13906 dout(20) << " " << *p << " " << oi.version << dendl;
13907 }
13908 }
13909}
13910
13911
/** check_local
 *
 * verifies that stray objects have been deleted
 */
void PrimaryLogPG::check_local()
{
  dout(10) << __func__ << dendl;

  // The PG log must reach back at least to its own tail, otherwise we
  // cannot reason about which deletes should have been applied locally.
  ceph_assert(
    info.last_update >=
    recovery_state.get_pg_log().get_tail()); // otherwise we need some help!

  // Debug-only verification; skipped in normal operation.
  if (!cct->_conf->osd_debug_verify_stray_on_activate)
    return;

  // just scan the log.
  // Walk newest-to-oldest so the first entry seen for an object is its
  // most recent state; `did` ensures each object is checked only once.
  set<hobject_t> did;
  for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
       p != recovery_state.get_pg_log().get_log().log.rend();
       ++p) {
    if (did.count(p->soid))
      continue;
    did.insert(p->soid);

    // Only objects whose latest log entry is a delete (and which are not
    // still expected-missing) must be absent from the local store.
    if (p->is_delete() && !is_missing_object(p->soid)) {
      dout(10) << " checking " << p->soid
	       << " at " << p->version << dendl;
      struct stat st;
      int r = osd->store->stat(
	ch,
	ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
	&st);
      // Anything other than ENOENT means the object still exists (or the
      // stat failed in a way we don't expect) — fatal in this debug mode.
      if (r != -ENOENT) {
	derr << __func__ << " " << p->soid << " exists, but should have been "
	     << "deleted" << dendl;
	ceph_abort_msg("erroneously present object");
      }
    } else {
      // ignore old(+missing) objects
    }
  }
}
13954
13955
13956
13957// ===========================
13958// hit sets
13959
13960hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13961{
13962 ostringstream ss;
13963 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13964 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13965 info.pgid.ps(), info.pgid.pool(),
13966 cct->_conf->osd_hit_set_namespace);
13967 dout(20) << __func__ << " " << hoid << dendl;
13968 return hoid;
13969}
13970
13971hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13972 utime_t end,
13973 bool using_gmt)
13974{
13975 ostringstream ss;
13976 ss << "hit_set_" << info.pgid.pgid << "_archive_";
13977 if (using_gmt) {
9f95a23c
TL
13978 start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
13979 end.gmtime(ss, true /* legacy pre-octopus form */);
7c673cae 13980 } else {
9f95a23c
TL
13981 start.localtime(ss, true /* legacy pre-octopus form */) << "_";
13982 end.localtime(ss, true /* legacy pre-octopus form */);
7c673cae
FG
13983 }
13984 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13985 info.pgid.ps(), info.pgid.pool(),
13986 cct->_conf->osd_hit_set_namespace);
13987 dout(20) << __func__ << " " << hoid << dendl;
13988 return hoid;
13989}
13990
13991void PrimaryLogPG::hit_set_clear()
13992{
13993 dout(20) << __func__ << dendl;
13994 hit_set.reset();
13995 hit_set_start_stamp = utime_t();
13996}
13997
13998void PrimaryLogPG::hit_set_setup()
13999{
14000 if (!is_active() ||
14001 !is_primary()) {
14002 hit_set_clear();
14003 return;
14004 }
14005
14006 if (is_active() && is_primary() &&
14007 (!pool.info.hit_set_count ||
14008 !pool.info.hit_set_period ||
14009 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
14010 hit_set_clear();
14011
14012 // only primary is allowed to remove all the hit set objects
14013 hit_set_remove_all();
14014 return;
14015 }
14016
14017 // FIXME: discard any previous data for now
14018 hit_set_create();
14019
14020 // include any writes we know about from the pg log. this doesn't
14021 // capture reads, but it is better than nothing!
14022 hit_set_apply_log();
14023}
14024
void PrimaryLogPG::hit_set_remove_all()
{
  // Remove every archived hit set object for this PG by trimming the
  // history down to zero entries, then reset the in-memory history.
  // If any archives are degraded we skip this
  for (auto p = info.hit_set.history.begin();
       p != info.hit_set.history.end();
       ++p) {
    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

    // Once we hit a degraded object just skip
    if (is_degraded_or_backfilling_object(aoid))
      return;
    // likewise back off if a scrub would block writes to this archive
    if (m_scrubber->write_blocked_by_scrub(aoid))
      return;
  }

  if (!info.hit_set.history.empty()) {
    auto p = info.hit_set.history.rbegin();
    ceph_assert(p != info.hit_set.history.rend());
    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
    ceph_assert(!is_degraded_or_backfilling_object(oid));
    ObjectContextRef obc = get_object_context(oid, false);
    ceph_assert(obc);

    // Build a single internal op that deletes all archives
    // (hit_set_trim with max == 0 trims the entire history).
    OpContextUPtr ctx = simple_opc_create(obc);
    ctx->at_version = get_next_version();
    ctx->updated_hset_history = info.hit_set;
    utime_t now = ceph_clock_now();
    ctx->mtime = now;
    hit_set_trim(ctx, 0);
    simple_opc_submit(std::move(ctx));
  }

  // Clear the recorded hit set history and any agent-cached copies.
  recovery_state.update_hset(pg_hit_set_history_t());
  if (agent_state) {
    agent_state->discard_hit_sets();
  }
}
14062
void PrimaryLogPG::hit_set_create()
{
  // Start a brand-new in-memory hit set for the interval beginning now.
  utime_t now = ceph_clock_now();
  // make a copy of the params to modify
  HitSet::Params params(pool.info.hit_set_params);

  dout(20) << __func__ << " " << params << dendl;
  if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
    BloomHitSet::Params *p =
      static_cast<BloomHitSet::Params*>(params.impl.get());

    // convert false positive rate so it holds up across the full period
    p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
    if (p->get_fpp() <= 0.0)
      p->set_fpp(.01);  // fpp cannot be zero!

    // if we don't have specified size, estimate target size based on the
    // previous bin!
    if (p->target_size == 0 && hit_set) {
      utime_t dur = now - hit_set_start_stamp;
      unsigned unique = hit_set->approx_unique_insert_count();
      dout(20) << __func__ << " previous set had approx " << unique
	       << " unique items over " << dur << " seconds" << dendl;
      // scale the previous bin's unique-insert rate up to a full period
      p->target_size = (double)unique * (double)pool.info.hit_set_period
		     / (double)dur;
    }
    // clamp the (possibly estimated) size into
    // [osd_hit_set_min_size, osd_hit_set_max_size]
    if (p->target_size <
	static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
      p->target_size = cct->_conf->osd_hit_set_min_size;

    if (p->target_size
	> static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
      p->target_size = cct->_conf->osd_hit_set_max_size;

    p->seed = now.sec();

    dout(10) << __func__ << " target_size " << p->target_size
	     << " fpp " << p->get_fpp() << dendl;
  }
  hit_set.reset(new HitSet(params));
  hit_set_start_stamp = now;
}
14105
14106/**
14107 * apply log entries to set
14108 *
14109 * this would only happen after peering, to at least capture writes
14110 * during an interval that was potentially lost.
14111 */
14112bool PrimaryLogPG::hit_set_apply_log()
14113{
14114 if (!hit_set)
14115 return false;
14116
14117 eversion_t to = info.last_update;
14118 eversion_t from = info.hit_set.current_last_update;
14119 if (to <= from) {
14120 dout(20) << __func__ << " no update" << dendl;
14121 return false;
14122 }
14123
14124 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
9f95a23c
TL
14125 list<pg_log_entry_t>::const_reverse_iterator p =
14126 recovery_state.get_pg_log().get_log().log.rbegin();
14127 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
7c673cae 14128 ++p;
9f95a23c 14129 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
7c673cae
FG
14130 hit_set->insert(p->soid);
14131 ++p;
14132 }
14133
14134 return true;
14135}
14136
void PrimaryLogPG::hit_set_persist()
{
  // Seal the current in-memory hit set, write it out as a new archive
  // object via an internal op, and trim the oldest archives so at most
  // pool.info.hit_set_count remain.
  dout(10) << __func__ << dendl;
  bufferlist bl;
  unsigned max = pool.info.hit_set_count;

  utime_t now = ceph_clock_now();
  hobject_t oid;

  // If any archives are degraded we skip this persist request
  // account for the additional entry being added below
  for (auto p = info.hit_set.history.begin();
       p != info.hit_set.history.end();
       ++p) {
    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

    // Once we hit a degraded object just skip further trim
    if (is_degraded_or_backfilling_object(aoid))
      return;
    // likewise back off if a scrub would block writes to this archive
    if (m_scrubber->write_blocked_by_scrub(aoid))
      return;
  }

  // If backfill is in progress and we could possibly overlap with the
  // hit_set_* objects, back off.  Since these all have
  // hobject_t::hash set to pgid.ps(), and those sort first, we can
  // look just at that.  This is necessary because our transactions
  // may include a modify of the new hit_set *and* a delete of the
  // old one, and this may span the backfill boundary.
  for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
       p != get_backfill_targets().end();
       ++p) {
    const pg_info_t& pi = recovery_state.get_peer_info(*p);
    if (pi.last_backfill == hobject_t() ||
	pi.last_backfill.get_hash() == info.pgid.ps()) {
      dout(10) << __func__ << " backfill target osd." << *p
	       << " last_backfill has not progressed past pgid ps"
	       << dendl;
      return;
    }
  }


  // Describe the new archive: it covers [hit_set_start_stamp, now].
  pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
  new_hset.begin = hit_set_start_stamp;
  new_hset.end = now;
  oid = get_hit_set_archive_object(
    new_hset.begin,
    new_hset.end,
    new_hset.using_gmt);

  // If the current object is degraded we skip this persist request
  if (m_scrubber->write_blocked_by_scrub(oid))
    return;

  hit_set->seal();
  encode(*hit_set, bl);
  dout(20) << __func__ << " archive " << oid << dendl;

  if (agent_state) {
    agent_state->add_hit_set(new_hset.begin, hit_set);
    // keep at most hit_set_count - 1 older sets cached; the set just
    // added brings the in-memory total back to the cap
    uint32_t size = agent_state->hit_set_map.size();
    if (size >= pool.info.hit_set_count) {
      size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
    }
    hit_set_in_memory_trim(size);
  }

  // Build the internal op that creates/writes the archive object.
  ObjectContextRef obc = get_object_context(oid, true);
  OpContextUPtr ctx = simple_opc_create(obc);

  ctx->at_version = get_next_version();
  ctx->updated_hset_history = info.hit_set;
  pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);

  updated_hit_set_hist.current_last_update = info.last_update;
  new_hset.version = ctx->at_version;

  updated_hit_set_hist.history.push_back(new_hset);
  // start a fresh in-memory set for the next interval
  hit_set_create();

  // fabricate an object_info_t and SnapSet
  obc->obs.oi.version = ctx->at_version;
  obc->obs.oi.mtime = now;
  obc->obs.oi.size = bl.length();
  obc->obs.exists = true;
  obc->obs.oi.set_data_digest(bl.crc32c(-1));

  ctx->new_obs = obc->obs;

  ctx->new_snapset = obc->ssc->snapset;

  // the archive object counts against PG stats, in dedicated
  // hit_set_archive counters as well as the totals
  ctx->delta_stats.num_objects++;
  ctx->delta_stats.num_objects_hit_set_archive++;

  ctx->delta_stats.num_bytes += bl.length();
  ctx->delta_stats.num_bytes_hit_set_archive += bl.length();

  bufferlist bss;
  encode(ctx->new_snapset, bss);
  bufferlist boi(sizeof(ctx->new_obs.oi));
  encode(ctx->new_obs.oi, boi,
	 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));

  ctx->op_t->create(oid);
  if (bl.length()) {
    ctx->op_t->write(oid, 0, bl.length(), bl, 0);
    write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
        0, bl.length());
    ctx->clean_regions.mark_data_region_dirty(0, bl.length());
  }
  map <string, bufferlist> attrs;
  attrs[OI_ATTR] = std::move(boi);
  attrs[SS_ATTR] = std::move(bss);
  setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
  ctx->log.push_back(
    pg_log_entry_t(
      pg_log_entry_t::MODIFY,
      oid,
      ctx->at_version,
      eversion_t(),
      0,
      osd_reqid_t(),
      ctx->mtime,
      0)
    );
  ctx->log.back().clean_regions = ctx->clean_regions;

  // trim old archives within the same transaction
  hit_set_trim(ctx, max);

  simple_opc_submit(std::move(ctx));
}
14269
void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
{
  // Remove the oldest archived hit set objects (front of the history
  // list) until at most `max` remain, appending a DELETE log entry and
  // stats adjustment for each into the supplied op context.
  ceph_assert(ctx->updated_hset_history);
  pg_hit_set_history_t &updated_hit_set_hist =
    *(ctx->updated_hset_history);
  for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
    list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
    ceph_assert(p != updated_hit_set_hist.history.end());
    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

    // callers have already verified none of the archives are degraded
    ceph_assert(!is_degraded_or_backfilling_object(oid));

    dout(20) << __func__ << " removing " << oid << dendl;
    ++ctx->at_version.version;
    ctx->log.push_back(
      pg_log_entry_t(pg_log_entry_t::DELETE,
		     oid,
		     ctx->at_version,
		     p->version,
		     0,
		     osd_reqid_t(),
		     ctx->mtime,
		     0));

    ctx->op_t->remove(oid);
    updated_hit_set_hist.history.pop_front();

    // adjust stats for the removed archive object
    ObjectContextRef obc = get_object_context(oid, false);
    ceph_assert(obc);
    --ctx->delta_stats.num_objects;
    --ctx->delta_stats.num_objects_hit_set_archive;
    ctx->delta_stats.num_bytes -= obc->obs.oi.size;
    ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
  }
}
14305
14306void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
14307{
14308 while (agent_state->hit_set_map.size() > max_in_memory) {
14309 agent_state->remove_oldest_hit_set();
14310 }
14311}
14312
14313
14314// =======================================
14315// cache agent
14316
void PrimaryLogPG::agent_setup()
{
  // (Re)initialize the cache-tier agent for this PG, or tear it down if
  // the PG is not an active primary of a valid cache-tier pool.
  ceph_assert(is_locked());
  if (!is_active() ||
      !is_primary() ||
      state_test(PG_STATE_PREMERGE) ||
      pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
      pool.info.tier_of < 0 ||
      !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
    agent_clear();
    return;
  }
  if (!agent_state) {
    agent_state.reset(new TierAgentState);

    // choose random starting position
    agent_state->position = hobject_t();
    agent_state->position.pool = info.pgid.pool();
    agent_state->position.set_hash(pool.info.get_random_pg_position(
      info.pgid.pgid,
      rand()));
    // remember where this pass began so agent_work() can detect a full
    // wrap around the object hash space
    agent_state->start = agent_state->position;

    dout(10) << __func__ << " allocated new state, position "
	     << agent_state->position << dendl;
  } else {
    dout(10) << __func__ << " keeping existing state" << dendl;
  }

  if (info.stats.stats_invalid) {
    osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
  }

  agent_choose_mode();
}
14352
14353void PrimaryLogPG::agent_clear()
14354{
14355 agent_stop();
14356 agent_state.reset(NULL);
14357}
14358
// Return false if no objects operated on since start of object hash space
//
// Performs one batch of cache-agent work under the PG lock: lists up to
// ls_max objects from the current position, tries to evict and/or flush
// each eligible one (bounded by start_max started ops and
// agent_flush_quota flushes), then advances the scan position.
bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
{
  std::scoped_lock locker{*this};
  if (!agent_state) {
    dout(10) << __func__ << " no agent state, stopping" << dendl;
    return true;
  }

  ceph_assert(!recovery_state.is_deleting());

  if (agent_state->is_idle()) {
    dout(10) << __func__ << " idle, stopping" << dendl;
    return true;
  }

  osd->logger->inc(l_osd_agent_wake);

  dout(10) << __func__
	   << " max " << start_max
	   << ", flush " << agent_state->get_flush_mode_name()
	   << ", evict " << agent_state->get_evict_mode_name()
	   << ", pos " << agent_state->position
	   << dendl;
  ceph_assert(is_primary());
  ceph_assert(is_active());

  // make sure archived hit sets are loaded for temperature estimates
  agent_load_hit_sets();

  const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
  ceph_assert(base_pool);

  int ls_min = 1;
  int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;

  // list some objects. this conveniently lists clones (oldest to
  // newest) before heads... the same order we want to flush in.
  //
  // NOTE: do not flush the Sequencer. we will assume that the
  // listing we get back is imprecise.
  vector<hobject_t> ls;
  hobject_t next;
  int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
					  &ls, &next);
  ceph_assert(r >= 0);
  dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
  int started = 0;
  // Each `continue` below is a per-object eligibility filter; every
  // skipped object bumps the l_osd_agent_skip counter.
  for (vector<hobject_t>::iterator p = ls.begin();
       p != ls.end();
       ++p) {
    if (p->nspace == cct->_conf->osd_hit_set_namespace) {
      dout(20) << __func__ << " skip (hit set) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (is_degraded_or_backfilling_object(*p)) {
      dout(20) << __func__ << " skip (degraded) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (is_missing_object(p->get_head())) {
      dout(20) << __func__ << " skip (missing head) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    ObjectContextRef obc = get_object_context(*p, false, NULL);
    if (!obc) {
      // we didn't flush; we may miss something here.
      dout(20) << __func__ << " skip (no obc) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (!obc->obs.exists) {
      dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
					   obc->obs.oi.soid.get_head())) {
      dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (obc->is_blocked()) {
      dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (obc->is_request_pending()) {
      dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }

    // be careful flushing omap to an EC pool.
    if (!base_pool->supports_omap() &&
	obc->obs.oi.is_omap()) {
      dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }

    // eviction is attempted first; flushing only if eviction did not
    // start an op and we still have flush quota
    if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
	agent_maybe_evict(obc, false))
      ++started;
    else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
	     agent_flush_quota > 0 && agent_maybe_flush(obc)) {
      ++started;
      --agent_flush_quota;
    }
    if (started >= start_max) {
      // If finishing early, set "next" to the next object
      if (++p != ls.end())
	next = *p;
      break;
    }
  }

  // periodically decay the temperature histogram
  if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
    dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
    agent_state->hist_age = 0;
    agent_state->temp_hist.decay();
  }

  // Total objects operated on so far
  int total_started = agent_state->started + started;
  bool need_delay = false;

  dout(20) << __func__ << " start pos " << agent_state->position
    << " next start pos " << next
    << " started " << total_started << dendl;

  // See if we've made a full pass over the object hash space
  // This might check at most ls_max objects a second time to notice that
  // we've checked every objects at least once.
  if (agent_state->position < agent_state->start &&
      next >= agent_state->start) {
    dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
    // a full pass with zero work means there is nothing to do: delay
    if (total_started == 0)
      need_delay = true;
    else
      total_started = 0;
    agent_state->start = next;
  }
  agent_state->started = total_started;

  // See if we are starting from beginning
  if (next.is_max())
    agent_state->position = hobject_t();
  else
    agent_state->position = next;

  // Discard old in memory HitSets
  hit_set_in_memory_trim(pool.info.hit_set_count);

  if (need_delay) {
    ceph_assert(agent_state->delaying == false);
    agent_delay();
    return false;
  }
  agent_choose_mode();
  return true;
}
14522
void PrimaryLogPG::agent_load_hit_sets()
{
  // Populate agent_state->hit_set_map from the archived hit set objects
  // so the agent can estimate object temperatures.  Only needed while
  // eviction is active.
  if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
    return;
  }

  if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
    dout(10) << __func__ << dendl;
    for (auto p = info.hit_set.history.begin();
	 p != info.hit_set.history.end(); ++p) {
      // hit sets are keyed in the map by their begin timestamp (seconds)
      if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
	dout(10) << __func__ << " loading " << p->begin << "-"
		 << p->end << dendl;
	if (!pool.info.is_replicated()) {
	  // FIXME: EC not supported here yet
	  derr << __func__ << " on non-replicated pool" << dendl;
	  break;
	}

	hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
	if (is_unreadable_object(oid)) {
	  dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
	  break;
	}

	ObjectContextRef obc = get_object_context(oid, false);
	if (!obc) {
	  derr << __func__ << ": could not load hitset " << oid << dendl;
	  break;
	}

	// read the whole object (len == 0 means read to end) and decode
	bufferlist bl;
	{
	  int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
	  ceph_assert(r >= 0);
	}
	HitSetRef hs(new HitSet);
	bufferlist::const_iterator pbl = bl.begin();
	decode(*hs, pbl);
	agent_state->add_hit_set(p->begin.sec(), hs);
      }
    }
  }
}
14567
bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
{
  // Consider flushing one dirty object to the base tier.
  // Returns true iff a flush was actually started.
  if (!obc->obs.oi.is_dirty()) {
    dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }
  if (obc->obs.oi.is_cache_pinned()) {
    dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  // prefer local_mtime (set by this tier) over the client-visible mtime
  utime_t now = ceph_clock_now();
  utime_t ob_local_mtime;
  if (obc->obs.oi.local_mtime != utime_t()) {
    ob_local_mtime = obc->obs.oi.local_mtime;
  } else {
    ob_local_mtime = obc->obs.oi.mtime;
  }
  // honor cache_min_flush_age unless eviction pressure is FULL
  bool evict_mode_full =
    (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
  if (!evict_mode_full &&
      obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
      (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
    dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  // already being flushed by the agent?
  if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
    dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;

  // FIXME: flush anything dirty, regardless of what distribution of
  // ages we expect.

  hobject_t oid = obc->obs.oi.soid;
  osd->agent_start_op(oid);
  // no need to capture a pg ref, can't outlive fop or ctx
  std::function<void()> on_flush = [this, oid]() {
    osd->agent_finish_op(oid);
  };

  int result = start_flush(
    OpRequestRef(), obc, false, NULL,
    on_flush);
  if (result != -EINPROGRESS) {
    // flush didn't start; release the agent op slot we reserved above
    on_flush();
    dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
	     << " with " << result << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  osd->logger->inc(l_osd_agent_flush);
  return true;
}
14630
bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
{
  // Consider evicting (deleting from the cache tier) one clean object.
  // `after_flush` is true when called from a flush completion path, in
  // which case dirtiness was just cleared but the scrub range must be
  // re-checked.  Returns true iff an eviction op was submitted.
  const hobject_t& soid = obc->obs.oi.soid;
  if (!after_flush && obc->obs.oi.is_dirty()) {
    dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
    return false;
  }
  // This is already checked by agent_work() which passes after_flush = false
  if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
    dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
    return false;
  }
  if (!obc->obs.oi.watchers.empty()) {
    dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
    return false;
  }
  if (obc->is_blocked()) {
    dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
    return false;
  }
  if (obc->obs.oi.is_cache_pinned()) {
    dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
    return false;
  }

  // a head cannot be evicted while clones of it remain
  if (soid.snap == CEPH_NOSNAP) {
    int result = _verify_no_head_clones(soid, obc->ssc->snapset);
    if (result < 0) {
      dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
      return false;
    }
  }

  // Unless eviction pressure is FULL, apply age and temperature policy.
  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
    // is this object old than cache_min_evict_age?
    utime_t now = ceph_clock_now();
    utime_t ob_local_mtime;
    if (obc->obs.oi.local_mtime != utime_t()) {
      ob_local_mtime = obc->obs.oi.local_mtime;
    } else {
      ob_local_mtime = obc->obs.oi.mtime;
    }
    if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
      dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      return false;
    }
    // is this object old and/or cold enough?
    int temp = 0;
    uint64_t temp_upper = 0, temp_lower = 0;
    if (hit_set)
      agent_estimate_temp(soid, &temp);
    agent_state->temp_hist.add(temp);
    agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);

    dout(20) << __func__
	     << " temp " << temp
	     << " pos " << temp_lower << "-" << temp_upper
	     << ", evict_effort " << agent_state->evict_effort
	     << dendl;
    dout(30) << "agent_state:\n";
    Formatter *f = Formatter::create("");
    f->open_object_section("agent_state");
    agent_state->dump(f);
    f->close_section();
    f->flush(*_dout);
    delete f;
    *_dout << dendl;

    // only evict objects cold enough for the current evict_effort
    // (positions are in millionths)
    if (1000000 - temp_upper >= agent_state->evict_effort)
      return false;
  }

  dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
  OpContextUPtr ctx = simple_opc_create(obc);

  // take the write lock without an associated client op
  auto null_op_req = OpRequestRef();
  if (!ctx->lock_manager.get_lock_type(
	RWState::RWWRITE,
	obc->obs.oi.soid,
	obc,
	null_op_req)) {
    close_op_ctx(ctx.release());
    dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
    return false;
  }

  osd->agent_start_evict_op();
  ctx->register_on_finish(
    [this]() {
      osd->agent_finish_evict_op();
    });

  // build and submit the delete, adjusting cache-tier stats
  ctx->at_version = get_next_version();
  ceph_assert(ctx->new_obs.exists);
  int r = _delete_oid(ctx.get(), true, false);
  if (obc->obs.oi.is_omap())
    ctx->delta_stats.num_objects_omap--;
  ctx->delta_stats.num_evict++;
  ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
  if (obc->obs.oi.is_dirty())
    --ctx->delta_stats.num_objects_dirty;
  ceph_assert(r == 0);
  finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
  simple_opc_submit(std::move(ctx));
  osd->logger->inc(l_osd_tier_evict);
  osd->logger->inc(l_osd_agent_evict);
  return true;
}
14740
14741void PrimaryLogPG::agent_stop()
14742{
14743 dout(20) << __func__ << dendl;
14744 if (agent_state && !agent_state->is_idle()) {
14745 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14746 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14747 osd->agent_disable_pg(this, agent_state->evict_effort);
14748 }
14749}
14750
14751void PrimaryLogPG::agent_delay()
14752{
14753 dout(20) << __func__ << dendl;
14754 if (agent_state && !agent_state->is_idle()) {
11fdf7f2 14755 ceph_assert(agent_state->delaying == false);
7c673cae
FG
14756 agent_state->delaying = true;
14757 osd->agent_disable_pg(this, agent_state->evict_effort);
14758 }
14759}
14760
14761void PrimaryLogPG::agent_choose_mode_restart()
14762{
14763 dout(20) << __func__ << dendl;
9f95a23c 14764 std::scoped_lock locker{*this};
7c673cae
FG
14765 if (agent_state && agent_state->delaying) {
14766 agent_state->delaying = false;
14767 agent_choose_mode(true);
14768 }
7c673cae
FG
14769}
14770
14771bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14772{
14773 bool requeued = false;
14774 // Let delay play out
14775 if (agent_state->delaying) {
11fdf7f2 14776 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
7c673cae
FG
14777 return requeued;
14778 }
14779
14780 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14781 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14782 unsigned evict_effort = 0;
14783
14784 if (info.stats.stats_invalid) {
14785 // idle; stats can't be trusted until we scrub.
14786 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14787 goto skip_calc;
14788 }
14789
14790 {
14791 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
11fdf7f2 14792 ceph_assert(divisor > 0);
7c673cae
FG
14793
14794 // adjust (effective) user objects down based on the number
14795 // of HitSet objects, which should not count toward our total since
14796 // they cannot be flushed.
14797 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14798
14799 // also exclude omap objects if ec backing pool
14800 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
11fdf7f2 14801 ceph_assert(base_pool);
7c673cae
FG
14802 if (!base_pool->supports_omap())
14803 unflushable += info.stats.stats.sum.num_objects_omap;
14804
14805 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14806 if (num_user_objects > unflushable)
14807 num_user_objects -= unflushable;
14808 else
14809 num_user_objects = 0;
14810
14811 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14812 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14813 num_user_bytes -= unflushable_bytes;
14814 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14815 num_user_bytes += num_overhead_bytes;
14816
14817 // also reduce the num_dirty by num_objects_omap
14818 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14819 if (!base_pool->supports_omap()) {
14820 if (num_dirty > info.stats.stats.sum.num_objects_omap)
14821 num_dirty -= info.stats.stats.sum.num_objects_omap;
14822 else
14823 num_dirty = 0;
14824 }
14825
14826 dout(10) << __func__
14827 << " flush_mode: "
14828 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14829 << " evict_mode: "
14830 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14831 << " num_objects: " << info.stats.stats.sum.num_objects
14832 << " num_bytes: " << info.stats.stats.sum.num_bytes
14833 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14834 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14835 << " num_dirty: " << num_dirty
14836 << " num_user_objects: " << num_user_objects
14837 << " num_user_bytes: " << num_user_bytes
14838 << " num_overhead_bytes: " << num_overhead_bytes
14839 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14840 << " pool.info.target_max_objects: " << pool.info.target_max_objects
14841 << dendl;
14842
14843 // get dirty, full ratios
14844 uint64_t dirty_micro = 0;
14845 uint64_t full_micro = 0;
14846 if (pool.info.target_max_bytes && num_user_objects > 0) {
14847 uint64_t avg_size = num_user_bytes / num_user_objects;
14848 dirty_micro =
14849 num_dirty * avg_size * 1000000 /
11fdf7f2 14850 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
7c673cae
FG
14851 full_micro =
14852 num_user_objects * avg_size * 1000000 /
11fdf7f2 14853 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
7c673cae
FG
14854 }
14855 if (pool.info.target_max_objects > 0) {
14856 uint64_t dirty_objects_micro =
14857 num_dirty * 1000000 /
11fdf7f2 14858 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
7c673cae
FG
14859 if (dirty_objects_micro > dirty_micro)
14860 dirty_micro = dirty_objects_micro;
14861 uint64_t full_objects_micro =
14862 num_user_objects * 1000000 /
11fdf7f2 14863 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
7c673cae
FG
14864 if (full_objects_micro > full_micro)
14865 full_micro = full_objects_micro;
14866 }
14867 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14868 << " full " << ((float)full_micro / 1000000.0)
14869 << dendl;
14870
14871 // flush mode
14872 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14873 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14874 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14875 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14876 flush_target += flush_slop;
14877 flush_high_target += flush_slop;
14878 } else {
11fdf7f2
TL
14879 flush_target -= std::min(flush_target, flush_slop);
14880 flush_high_target -= std::min(flush_high_target, flush_slop);
7c673cae
FG
14881 }
14882
14883 if (dirty_micro > flush_high_target) {
14884 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
11fdf7f2 14885 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
7c673cae
FG
14886 flush_mode = TierAgentState::FLUSH_MODE_LOW;
14887 }
14888
14889 // evict mode
14890 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14891 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14892 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14893 evict_target += evict_slop;
14894 else
11fdf7f2 14895 evict_target -= std::min(evict_target, evict_slop);
7c673cae
FG
14896
14897 if (full_micro > 1000000) {
14898 // evict anything clean
14899 evict_mode = TierAgentState::EVICT_MODE_FULL;
14900 evict_effort = 1000000;
14901 } else if (full_micro > evict_target) {
14902 // set effort in [0..1] range based on where we are between
14903 evict_mode = TierAgentState::EVICT_MODE_SOME;
14904 uint64_t over = full_micro - evict_target;
14905 uint64_t span = 1000000 - evict_target;
11fdf7f2
TL
14906 evict_effort = std::max(over * 1000000 / span,
14907 uint64_t(1000000.0 *
14908 cct->_conf->osd_agent_min_evict_effort));
7c673cae
FG
14909
14910 // quantize effort to avoid too much reordering in the agent_queue.
14911 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
11fdf7f2 14912 ceph_assert(inc > 0);
7c673cae
FG
14913 uint64_t was = evict_effort;
14914 evict_effort -= evict_effort % inc;
14915 if (evict_effort < inc)
14916 evict_effort = inc;
11fdf7f2 14917 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
7c673cae
FG
14918 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
14919 }
14920 }
14921
14922 skip_calc:
14923 bool old_idle = agent_state->is_idle();
14924 if (flush_mode != agent_state->flush_mode) {
14925 dout(5) << __func__ << " flush_mode "
14926 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14927 << " -> "
14928 << TierAgentState::get_flush_mode_name(flush_mode)
14929 << dendl;
9f95a23c
TL
14930 recovery_state.update_stats(
14931 [=](auto &history, auto &stats) {
14932 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14933 osd->agent_inc_high_count();
14934 stats.stats.sum.num_flush_mode_high = 1;
14935 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14936 stats.stats.sum.num_flush_mode_low = 1;
14937 }
14938 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14939 osd->agent_dec_high_count();
14940 stats.stats.sum.num_flush_mode_high = 0;
14941 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14942 stats.stats.sum.num_flush_mode_low = 0;
14943 }
14944 return false;
14945 });
7c673cae
FG
14946 agent_state->flush_mode = flush_mode;
14947 }
14948 if (evict_mode != agent_state->evict_mode) {
14949 dout(5) << __func__ << " evict_mode "
14950 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14951 << " -> "
14952 << TierAgentState::get_evict_mode_name(evict_mode)
14953 << dendl;
14954 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14955 is_active()) {
14956 if (op)
14957 requeue_op(op);
b32b8144 14958 requeue_ops(waiting_for_flush);
7c673cae 14959 requeue_ops(waiting_for_active);
9f95a23c 14960 requeue_ops(waiting_for_readable);
7c673cae
FG
14961 requeue_ops(waiting_for_scrub);
14962 requeue_ops(waiting_for_cache_not_full);
14963 objects_blocked_on_cache_full.clear();
14964 requeued = true;
14965 }
9f95a23c
TL
14966 recovery_state.update_stats(
14967 [=](auto &history, auto &stats) {
14968 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14969 stats.stats.sum.num_evict_mode_some = 1;
14970 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14971 stats.stats.sum.num_evict_mode_full = 1;
14972 }
14973 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14974 stats.stats.sum.num_evict_mode_some = 0;
14975 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14976 stats.stats.sum.num_evict_mode_full = 0;
14977 }
14978 return false;
14979 });
7c673cae
FG
14980 agent_state->evict_mode = evict_mode;
14981 }
14982 uint64_t old_effort = agent_state->evict_effort;
14983 if (evict_effort != agent_state->evict_effort) {
14984 dout(5) << __func__ << " evict_effort "
14985 << ((float)agent_state->evict_effort / 1000000.0)
14986 << " -> "
14987 << ((float)evict_effort / 1000000.0)
14988 << dendl;
14989 agent_state->evict_effort = evict_effort;
14990 }
14991
14992 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14993 // (including flush). This is probably fine (they should be
14994 // correlated) but it is not precisely correct.
14995 if (agent_state->is_idle()) {
14996 if (!restart && !old_idle) {
14997 osd->agent_disable_pg(this, old_effort);
14998 }
14999 } else {
15000 if (restart || old_idle) {
15001 osd->agent_enable_pg(this, agent_state->evict_effort);
15002 } else if (old_effort != agent_state->evict_effort) {
15003 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
15004 }
15005 }
15006 return requeued;
15007}
15008
15009void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
15010{
11fdf7f2
TL
15011 ceph_assert(hit_set);
15012 ceph_assert(temp);
7c673cae
FG
15013 *temp = 0;
15014 if (hit_set->contains(oid))
15015 *temp = 1000000;
15016 unsigned i = 0;
15017 int last_n = pool.info.hit_set_search_last_n;
15018 for (map<time_t,HitSetRef>::reverse_iterator p =
15019 agent_state->hit_set_map.rbegin(); last_n > 0 &&
15020 p != agent_state->hit_set_map.rend(); ++p, ++i) {
15021 if (p->second->contains(oid)) {
15022 *temp += pool.info.get_grade(i);
15023 --last_n;
15024 }
15025 }
15026}
15027
15028// Dup op detection
15029
15030bool PrimaryLogPG::already_complete(eversion_t v)
15031{
15032 dout(20) << __func__ << ": " << v << dendl;
15033 for (xlist<RepGather*>::iterator i = repop_queue.begin();
15034 !i.end();
15035 ++i) {
15036 dout(20) << __func__ << ": " << **i << dendl;
15037 // skip copy from temp object ops
15038 if ((*i)->v == eversion_t()) {
15039 dout(20) << __func__ << ": " << **i
15040 << " version is empty" << dendl;
15041 continue;
15042 }
15043 if ((*i)->v > v) {
15044 dout(20) << __func__ << ": " << **i
15045 << " (*i)->v past v" << dendl;
15046 break;
15047 }
15048 if (!(*i)->all_committed) {
15049 dout(20) << __func__ << ": " << **i
15050 << " not committed, returning false"
15051 << dendl;
15052 return false;
15053 }
15054 }
15055 dout(20) << __func__ << ": returning true" << dendl;
15056 return true;
15057}
15058
7c673cae
FG
15059
15060// ==========================================================================================
15061// SCRUB
15062
f67539c2
TL
15063void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
15064{
15065 dout(15) << __func__ << " is scrub active? " << m_scrubber->is_scrub_active() << dendl;
15066 op->mark_started();
15067
15068 if (!m_scrubber->is_scrub_active()) {
15069 dout(10) << __func__ << " scrub isn't active" << dendl;
15070 return;
15071 }
15072 m_scrubber->map_from_replica(op);
15073}
7c673cae 15074
f67539c2
TL
15075bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
15076 const hobject_t& end)
7c673cae
FG
15077{
15078 pair<hobject_t, ObjectContextRef> next;
15079 next.second = object_contexts.lookup(begin);
15080 next.first = begin;
15081 bool more = true;
15082 while (more && next.first < end) {
15083 if (next.second && next.second->is_blocked()) {
15084 next.second->requeue_scrub_on_unblock = true;
15085 dout(10) << __func__ << ": scrub delayed, "
15086 << next.first << " is blocked"
15087 << dendl;
15088 return false;
15089 }
15090 more = object_contexts.get_next(next.first, &next);
15091 }
15092 return true;
15093}
15094
7c673cae 15095
// Repair a primary-copy object that failed a read by pulling it from a
// replica: mark the PG for repair, kick off recovery, and park the op
// until the object becomes readable again.  Returns -EAGAIN in all paths
// (the op is requeued, never completed here).
int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
{
  OpRequestRef op = ctx->op;
  // Only supports replicated pools
  ceph_assert(!pool.info.is_erasure());
  ceph_assert(is_primary());

  dout(10) << __func__ << " " << soid
	   << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;

  if (!is_clean()) {
    // recovery/backfill already in progress; wait for clean before
    // starting a repair cycle
    block_for_clean(soid, op);
    return -EAGAIN;
  }

  ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
  auto& oi = ctx->new_obs.oi;
  eversion_t v = oi.version;

  // record the primary's copy as errored so recovery pulls from a peer
  if (primary_error(soid, v)) {
    dout(0) << __func__ << " No other replicas available for " << soid << dendl;
    // XXX: If we knew that there is no down osd which could include this
    // object, it would be nice if we could return EIO here.
    // If a "never fail" flag was available, that could be used
    // for rbd to NOT return EIO until object marked lost.

    // Drop through to save this op in case an osd comes up with the object.
  }

  // Restart the op after object becomes readable again
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");

  // flip CLEAN -> REPAIR and ask the peering state machine to recover
  ceph_assert(is_clean());
  state_set(PG_STATE_REPAIR);
  state_clear(PG_STATE_CLEAN);
  queue_peering_event(
    PGPeeringEventRef(
      std::make_shared<PGPeeringEvent>(
	get_osdmap_epoch(),
	get_osdmap_epoch(),
	PeeringState::DoRecovery())));

  return -EAGAIN;
}
15141
7c673cae
FG
15142/*---SnapTrimmer Logging---*/
15143#undef dout_prefix
11fdf7f2 15144#define dout_prefix pg->gen_prefix(*_dout)
7c673cae
FG
15145
// Trace entry into a snap-trimmer state machine state.
void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
{
  ldout(pg->cct, 20) << "enter " << state_name << dendl;
}
15150
// Trace exit from a snap-trimmer state machine state.  enter_time is
// accepted for symmetry with other state loggers but not reported here.
void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
{
  ldout(pg->cct, 20) << "exit " << state_name << dendl;
}
15155
f67539c2
TL
15156bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15157 return
15158 pg->is_clean() &&
15159 !pg->m_scrubber->is_scrub_active() &&
15160 !pg->snap_trimq.empty();
15161}
15162
7c673cae
FG
15163/*---SnapTrimmer states---*/
15164#undef dout_prefix
11fdf7f2 15165#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
7c673cae
FG
15166 << "SnapTrimmer state<" << get_state_name() << ">: ")
15167
15168/* NotTrimming */
/* NotTrimming: idle state of the snap-trim statechart */
PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
  : my_base(ctx),
    NamedState(nullptr, "NotTrimming")
{
  context< SnapTrimmer >().log_enter(state_name);
}
15175
// Log state exit for the idle (NotTrimming) state.
void PrimaryLogPG::NotTrimming::exit()
{
  context< SnapTrimmer >().log_exit(state_name, enter_time);
}
15180
15181boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15182{
15183 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15184 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15185
15186 if (!(pg->is_primary() && pg->is_active())) {
15187 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15188 return discard_event();
15189 }
15190 if (!pg->is_clean() ||
15191 pg->snap_trimq.empty()) {
15192 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15193 return discard_event();
15194 }
f67539c2 15195 if (pg->m_scrubber->is_scrub_active()) {
7c673cae 15196 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
7c673cae
FG
15197 return transit< WaitScrub >();
15198 } else {
15199 return transit< Trimming >();
15200 }
15201}
15202
// The trim reservation was granted: re-check that trimming is still
// permitted, then pick the first snap in the trim queue and move on to
// the async-work state.
boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
{
  PrimaryLogPG *pg = context< SnapTrimmer >().pg;
  ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;

  // reservation callback has fired; drop our pending handle
  pending = nullptr;
  if (!context< SnapTrimmer >().can_trim()) {
    // conditions changed while waiting; restart the decision cycle
    post_event(KickTrim());
    return transit< NotTrimming >();
  }

  context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
  ldout(pg->cct, 10) << "NotTrimming: trimming "
		     << pg->snap_trimq.range_start()
		     << dendl;
  return transit< AwaitAsyncWork >();
}
15220
15221/* AwaitAsyncWork */
15222PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15223 : my_base(ctx),
9f95a23c 15224 NamedState(nullptr, "Trimming/AwaitAsyncWork")
7c673cae
FG
15225{
15226 auto *pg = context< SnapTrimmer >().pg;
15227 context< SnapTrimmer >().log_enter(state_name);
15228 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15229 pg->state_set(PG_STATE_SNAPTRIM);
224ce89b 15230 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
7c673cae
FG
15231 pg->publish_stats_to_osd();
15232}
15233
// Perform one batch of snap trimming for the snap chosen by
// WaitReservation: fetch up to osd_pg_max_concurrent_snap_trims objects
// still referencing the snap and submit a trim op for each.  When the
// snap mapper reports ENOENT the snap is fully trimmed and is recorded
// in purged_snaps (unless it is scheduled to repeat).
boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
{
  PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
  snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
  auto &in_flight = context<Trimming>().in_flight;
  ceph_assert(in_flight.empty());

  ceph_assert(pg->is_primary() && pg->is_active());
  if (!context< SnapTrimmer >().can_trim()) {
    ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
    post_event(KickTrim());
    return transit< NotTrimming >();
  }

  ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;

  vector<hobject_t> to_trim;
  unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
  to_trim.reserve(max);
  int r = pg->snap_mapper.get_next_objects_to_trim(
    snap_to_trim,
    max,
    &to_trim);
  if (r != 0 && r != -ENOENT) {
    // any error other than "no more objects" is fatal
    lderr(pg->cct) << "get_next_objects_to_trim returned "
		   << cpp_strerror(r) << dendl;
    ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
  } else if (r == -ENOENT) {
    // Done!  No objects reference this snap any more.
    ldout(pg->cct, 10) << "got ENOENT" << dendl;

    pg->snap_trimq.erase(snap_to_trim);

    if (pg->snap_trimq_repeat.count(snap_to_trim)) {
      // snap is scheduled for another pass; do not mark it purged yet
      ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
      pg->snap_trimq_repeat.erase(snap_to_trim);
    } else {
      ldout(pg->cct, 10) << "adding snap " << snap_to_trim
			 << " to purged_snaps"
			 << dendl;
      // persist the purged-snaps update and share it with peers
      ObjectStore::Transaction t;
      pg->recovery_state.adjust_purged_snaps(
	[snap_to_trim](auto &purged_snaps) {
	  purged_snaps.insert(snap_to_trim);
	});
      pg->write_if_dirty(t);

      ldout(pg->cct, 10) << "purged_snaps now "
			 << pg->info.purged_snaps << ", snap_trimq now "
			 << pg->snap_trimq << dendl;

      int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
      ceph_assert(tr == 0);

      pg->recovery_state.share_pg_info();
    }
    post_event(KickTrim());
    return transit< NotTrimming >();
  }
  ceph_assert(!to_trim.empty());

  for (auto &&object: to_trim) {
    // Get next
    ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
    OpContextUPtr ctx;
    int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
    if (error) {
      if (error == -ENOLCK) {
	// object is write-locked; retry after the lock clears
	ldout(pg->cct, 10) << "could not get write lock on obj "
			   << object << dendl;
      } else {
	pg->state_set(PG_STATE_SNAPTRIM_ERROR);
	ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
      }
      if (!in_flight.empty()) {
	// drain the trims already submitted before changing state
	ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
	return transit< WaitRepops >();
      }
      if (error == -ENOLCK) {
	ldout(pg->cct, 10) << "waiting for it to clear"
			   << dendl;
	return transit< WaitRWLock >();
      } else {
	return transit< NotTrimming >();
      }
    }

    in_flight.insert(object);
    // completion callback: when the last in-flight trim finishes, either
    // reset (on error) or report repops complete to the state machine
    ctx->register_on_success(
      [pg, object, &in_flight]() {
	ceph_assert(in_flight.find(object) != in_flight.end());
	in_flight.erase(object);
	if (in_flight.empty()) {
	  if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
	    pg->snap_trimmer_machine.process_event(Reset());
	  } else {
	    pg->snap_trimmer_machine.process_event(RepopsComplete());
	  }
	}
      });

    pg->simple_opc_submit(std::move(ctx));
  }

  return transit< WaitRepops >();
}
15340
15341void PrimaryLogPG::setattr_maybe_cache(
15342 ObjectContextRef obc,
7c673cae
FG
15343 PGTransaction *t,
15344 const string &key,
15345 bufferlist &val)
15346{
15347 t->setattr(obc->obs.oi.soid, key, val);
15348}
15349
15350void PrimaryLogPG::setattrs_maybe_cache(
15351 ObjectContextRef obc,
7c673cae
FG
15352 PGTransaction *t,
15353 map<string, bufferlist> &attrs)
15354{
15355 t->setattrs(obc->obs.oi.soid, attrs);
15356}
15357
15358void PrimaryLogPG::rmattr_maybe_cache(
15359 ObjectContextRef obc,
7c673cae
FG
15360 PGTransaction *t,
15361 const string &key)
15362{
15363 t->rmattr(obc->obs.oi.soid, key);
15364}
15365
15366int PrimaryLogPG::getattr_maybe_cache(
15367 ObjectContextRef obc,
15368 const string &key,
15369 bufferlist *val)
15370{
11fdf7f2 15371 if (pool.info.is_erasure()) {
7c673cae
FG
15372 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15373 if (i != obc->attr_cache.end()) {
15374 if (val)
15375 *val = i->second;
15376 return 0;
15377 } else {
15378 return -ENODATA;
15379 }
15380 }
15381 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15382}
15383
15384int PrimaryLogPG::getattrs_maybe_cache(
15385 ObjectContextRef obc,
b32b8144 15386 map<string, bufferlist> *out)
7c673cae
FG
15387{
15388 int r = 0;
11fdf7f2
TL
15389 ceph_assert(out);
15390 if (pool.info.is_erasure()) {
b32b8144 15391 *out = obc->attr_cache;
7c673cae
FG
15392 } else {
15393 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15394 }
b32b8144
FG
15395 map<string, bufferlist> tmp;
15396 for (map<string, bufferlist>::iterator i = out->begin();
15397 i != out->end();
15398 ++i) {
15399 if (i->first.size() > 1 && i->first[0] == '_')
f67539c2 15400 tmp[i->first.substr(1, i->first.size())] = std::move(i->second);
7c673cae 15401 }
b32b8144 15402 tmp.swap(*out);
7c673cae
FG
15403 return r;
15404}
15405
11fdf7f2
TL
// Delegate the failsafe-full check to the OSD service (true when the OSD
// is past its failsafe full threshold).
bool PrimaryLogPG::check_failsafe_full() {
  return osd->check_failsafe_full(get_dpp());
}
15409
f67539c2
TL
// Ask the scrubber whether a write to this object would be blocked by an
// in-progress scrub (and thus should preempt the replica scrub).
bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
{
  return m_scrubber->write_blocked_by_scrub(oid);
}
15414
7c673cae
FG
// boost::intrusive_ptr hooks: PG refcounting is tagged ("intptr") for
// leak tracking; RepGather uses plain get/put.
void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }

#ifdef PG_DEBUG_REFS
// Debug-ref builds hand out per-reference ids so each hold can be traced.
uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif

void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }