]> git.proxmox.com Git - ceph.git/blame - ceph/src/osd/PrimaryLogPG.cc
update ceph source to reef 18.1.2
[ceph.git] / ceph / src / osd / PrimaryLogPG.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
20effc67
TL
17#include <errno.h>
18
19#include <charconv>
20#include <sstream>
21#include <utility>
22
23#include <boost/intrusive_ptr.hpp>
24#include <boost/tuple/tuple.hpp>
7c673cae 25
7c673cae 26#include "PrimaryLogPG.h"
7c673cae 27
f67539c2 28#include "cls/cas/cls_cas_ops.h"
20effc67
TL
29#include "common/CDC.h"
30#include "common/EventTrace.h"
9f95a23c 31#include "common/ceph_crypto.h"
20effc67 32#include "common/config.h"
7c673cae 33#include "common/errno.h"
7c673cae 34#include "common/perf_counters.h"
20effc67
TL
35#include "common/scrub_types.h"
36#include "include/compat.h"
37#include "json_spirit/json_spirit_reader.h"
38#include "json_spirit/json_spirit_value.h"
39#include "messages/MCommandReply.h"
7c673cae 40#include "messages/MOSDBackoff.h"
20effc67 41#include "messages/MOSDOp.h"
7c673cae
FG
42#include "messages/MOSDPGBackfill.h"
43#include "messages/MOSDPGBackfillRemove.h"
20effc67
TL
44#include "messages/MOSDPGLog.h"
45#include "messages/MOSDPGScan.h"
46#include "messages/MOSDPGTrim.h"
7c673cae
FG
47#include "messages/MOSDPGUpdateLogMissing.h"
48#include "messages/MOSDPGUpdateLogMissingReply.h"
20effc67 49#include "messages/MOSDRepScrub.h"
7c673cae 50#include "messages/MOSDScrubReserve.h"
7c673cae 51#include "mon/MonClient.h"
20effc67
TL
52#include "objclass/objclass.h"
53#include "osd/ClassHandler.h"
7c673cae 54#include "osdc/Objecter.h"
20effc67
TL
55#include "osd/scrubber/PrimaryLogScrub.h"
56#include "osd/scrubber/ScrubStore.h"
57#include "osd/scrubber/pg_scrubber.h"
58
59#include "OSD.h"
60#include "OpRequest.h"
61#include "PG.h"
62#include "Session.h"
63
64// required includes order:
7c673cae
FG
65#include "json_spirit/json_spirit_value.h"
66#include "json_spirit/json_spirit_reader.h"
11fdf7f2 67#include "include/ceph_assert.h" // json_spirit clobbers it
7c673cae
FG
68#include "include/rados/rados_types.hpp"
69
70#ifdef WITH_LTTNG
71#include "tracing/osd.h"
72#else
73#define tracepoint(...)
74#endif
75
76#define dout_context cct
77#define dout_subsys ceph_subsys_osd
78#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
79#undef dout_prefix
80#define dout_prefix _prefix(_dout, this)
7c673cae 81
20effc67 82#include "osd_tracer.h"
7c673cae
FG
83
84MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
85
20effc67 86using std::less;
f67539c2
TL
87using std::list;
88using std::ostream;
89using std::pair;
90using std::make_pair;
20effc67 91using std::make_unique;
f67539c2
TL
92using std::map;
93using std::ostringstream;
94using std::set;
95using std::string;
96using std::string_view;
97using std::stringstream;
98using std::unique_ptr;
99using std::vector;
100
101using ceph::bufferlist;
102using ceph::bufferptr;
103using ceph::Formatter;
104using ceph::decode;
105using ceph::decode_noclear;
106using ceph::encode;
107using ceph::encode_destructively;
108
9f95a23c 109using namespace ceph::osd::scheduler;
f67539c2 110using TOPNSPC::common::cmd_getval;
20effc67 111using TOPNSPC::common::cmd_getval_or;
f67539c2
TL
112
/// dout prefix hook: tags every log line from this file with the PG's
/// identity by delegating to the PG object's gen_prefix().
template <typename T>
static std::ostream& _prefix(std::ostream *_dout, T *pg) {
  auto& out = *_dout;
  return pg->gen_prefix(out);
}
7c673cae 117
7c673cae
FG
/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};
141
/**
 * A "blessed" GenContext: wraps a callback together with a reference to
 * this PG (keeping it alive) and the epoch at which it was created.
 * When fired, the PG lock is taken and the wrapped callback is discarded
 * if the PG has been reset since that epoch.
 */
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;           // pins the PG until the callback fires
  unique_ptr<GenContext<T>> c;  // owned wrapped callback
  epoch_t e;                    // creation epoch, used to detect PG resets
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();                  // stale: drop without running
    else
      c.release()->complete(t);   // complete() deletes the callback
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};
163
164GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
165 GenContext<ThreadPool::TPHandle&> *c) {
166 return new BlessedGenContext<ThreadPool::TPHandle&>(
11fdf7f2
TL
167 this, c, get_osdmap_epoch());
168}
169
/**
 * Like BlessedGenContext, but finish() runs WITHOUT taking the PG lock;
 * the callback is still dropped if the PG was reset since creation.
 */
template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;           // pins the PG until the callback fires
  unique_ptr<GenContext<T>> c;  // owned wrapped callback
  epoch_t e;                    // creation epoch, used to detect PG resets
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();                  // stale: drop without running
    else
      c.release()->complete(t);   // complete() deletes the callback
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};
190
191GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
192 GenContext<ThreadPool::TPHandle&> *c) {
193 return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
194 this, c, get_osdmap_epoch());
7c673cae
FG
195}
196
/**
 * Context counterpart of BlessedGenContext: takes the PG lock on finish
 * and discards the wrapped Context if the PG was reset since creation.
 */
class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;      // pins the PG until the callback fires
  unique_ptr<Context> c;   // owned wrapped callback
  epoch_t e;               // creation epoch, used to detect PG resets
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();                  // stale: drop without running
    else
      c.release()->complete(r);   // complete() deletes the callback
  }
  bool sync_finish(int r) override {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};
217
7c673cae 218Context *PrimaryLogPG::bless_context(Context *c) {
11fdf7f2 219 return new BlessedContext(this, c, get_osdmap_epoch());
7c673cae
FG
220}
221
/**
 * Completion that notifies the PG that an ObjectContext is being torn
 * down; the PrimaryLogPGRef keeps the PG alive until it runs.
 */
class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
  public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    // r is ignored: this is purely a destruction notification.
    pg->object_context_destructor_callback(obc);
  }
};
232
7c673cae
FG
/**
 * Completion fired when a batch of async reads issued for an OpContext
 * finishes; notifies the OpContext so the op can resume.
 */
struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    // r is unused here; read failures are handled later by the op
    // finisher once the op context is re-executed (see finish_read()).
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};
244
/**
 * Completion run on the primary when a recovered object's transaction
 * has been applied.  May complete synchronously (sync_finish, caller
 * already holds the PG lock) or asynchronously (finish takes the lock).
 */
class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
  public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object(obc);
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object(obc);
  }
};
260
/**
 * Completion run once a pushed (recovered) object commits to disk;
 * reports the commit back to the PG with the epoch and last_complete
 * captured at queue time.
 */
class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;            // osdmap epoch when the push was queued
  eversion_t last_complete; // PG last_complete at queue time
  public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};
274
/**
 * Replica-side counterpart of C_OSD_AppliedRecoveredObject: notifies the
 * PG that a recovered object has been applied locally on a replica.
 */
class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
  public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object_replica();
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object_replica();
  }
};
289
// OpContext

/// Kick off all pending async reads for this op in one backend call;
/// OnReadComplete fires when the whole batch is done.
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  // one logical in-flight unit: the backend completes the batch once
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
/// Called by OnReadComplete when an async-read batch finishes; once all
/// batches are done, dequeues this op and re-executes it.
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    // this op is expected to be at the front of the in-progress queue
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
316
/**
 * CopyCallback driving a copy-from operation: when the copy completes,
 * resumes the originating op on success, or replies with the error /
 * requeues / closes the op context on failure.
 */
class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;
  uint32_t truncate_seq;
  uint64_t truncate_size;
  bool have_truncate = false;  // client supplied truncate_{seq,size}?

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // Only use truncate_{seq,size} from the original object if the client
    // did not send us these parameters
    if (!have_truncate) {
      truncate_seq = results->truncate_seq;
      truncate_size = results->truncate_size;
    }

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
	if (ctx->op)
	  ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
	if (ctx->op)
	  ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  /// Record client-provided truncate parameters (overrides the source's).
  void set_truncate(uint32_t seq, uint64_t size) {
    truncate_seq = seq;
    truncate_size = size;
    have_truncate = true;
  }
};
371
/**
 * OpFinisher that completes a copy-from op by calling back into the PG's
 * finish_copyfrom(); the instance is destroyed after execute() returns.
 */
struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};
385
386// ======================
387// PGBackend::Listener
388
/**
 * Called when an object (or its deletion) has been recovered locally.
 *
 * Re-registers clone snaps with the snap mapper, reconciles LOST_REVERT
 * log entries (rewriting the object_info version/attr), records the
 * recovery with recovery_state, and registers on-applied / on-commit
 * callbacks on the transaction.
 *
 * @param hoid           object that was recovered
 * @param _recovery_info recovery metadata (copied; may be adjusted below)
 * @param obc            object context (unused path when is_delete)
 * @param is_delete      true if the "recovery" was deleting the object
 * @param t              transaction the recovery side effects go into
 */
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  // mutable copy: version/oi may be rewritten for LOST_REVERT below
  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    // clone object: re-add its snap set to the snap mapper
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
	recovery_info.soid,
	snaps,
	&_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  // If the log's "need" version is newer than what we recovered, this may
  // be an old revert: align the recovered object with the LOST_REVERT entry.
  if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
      recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
	latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
	       << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
	     get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
	obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  recovery_state.recover_got(
    recovery_info.soid,
    recovery_info.version,
    is_delete,
    *t);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      // object is readable again: requeue ops that were waiting on it
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
	dout(20) << " kicking unreadable waiters on " << hoid << dendl;
	requeue_ops(unreadable_object_entry->second);
	waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));
}
484
/**
 * Called once an object has been recovered on all replicas: updates
 * stats, drops the recovery read lock, clears in-flight bookkeeping, and
 * requeues any ops that were blocked waiting on the object.
 */
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  recovery_state.object_recovered(soid, stat_diff);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  auto i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  // requeue ops blocked on this object being degraded
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  // ... and ops blocked on it being unreadable
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}
524
7c673cae 525void PrimaryLogPG::schedule_recovery_work(
1e59de90
TL
526 GenContext<ThreadPool::TPHandle&> *c,
527 uint64_t cost)
7c673cae 528{
1e59de90
TL
529 osd->queue_recovery_context(
530 this, c, cost,
531 recovery_state.get_recovery_op_priority());
7c673cae
FG
532}
533
9f95a23c
TL
/**
 * On a replica, drop cached object contexts for every object touched by
 * the given log entries.  The transaction parameter is currently unused
 * here.
 */
void PrimaryLogPG::replica_clear_repop_obc(
  const vector<pg_log_entry_t> &logv,
  ObjectStore::Transaction &t)
{
  for (auto &&e: logv) {
    /* Have to blast all clones, they share a snapset */
    object_contexts.clear_range(
      e.soid.get_object_boundary(), e.soid.get_head());
    // replicas should never cache a snapset context for the head
    ceph_assert(
      snapset_contexts.find(e.soid.get_head()) ==
      snapset_contexts.end());
  }
}
547
11fdf7f2
TL
/**
 * Decide whether a repop for @p hoid should carry a real transaction to
 * @p peer, or an empty one.  An empty op is shipped when the object is
 * beyond the peer's backfill position, or when the peer is an async
 * recovery target still missing the object.
 */
bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(recovery_state.has_peer_info(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= recovery_state.get_peer_info(peer).last_backfill;
  if (!should_send) {
    // only backfill targets can legitimately be behind on this object
    ceph_assert(is_backfill_target(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
	     << ", object " << hoid
	     << " beyond std::max(last_backfill_started "
	     << ", peer_info[peer].last_backfill "
	     << recovery_state.get_peer_info(peer).last_backfill
	     << ")" << dendl;
    return should_send;
  }
  if (is_async_recovery_target(peer) &&
      recovery_state.get_peer_missing(peer).is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
	     << ", object " << hoid
	     << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}
577
578
7c673cae
FG
/// Forward to the OSD service: get a cluster-network connection to
/// @p peer valid from @p from_epoch.
ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}
584
/// Expose the OSD-wide perf counters to the PG backend.
PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}
589
590
591// ====================
592// missing objects
593
594bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
595{
9f95a23c 596 return recovery_state.get_pg_log().get_missing().get_items().count(soid);
7c673cae
FG
597}
598
/**
 * If @p soid needs recovery and is neither already recovering nor
 * unfound, start a high-priority recovery (or replica delete/push) op
 * for it immediately.
 */
void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      // missing on the primary itself
      recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
    } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
  }
}
625
/**
 * Queue @p op behind an unreadable (missing/not-yet-recovered) object,
 * kicking recovery of the object first so the wait is bounded.
 */
void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
  osd->logger->inc(l_osd_op_delayed_unreadable);
}
635
7c673cae
FG
636bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
637{
638 /* The conditions below may clear (on_local_recover, before we queue
639 * the transaction) before we actually requeue the degraded waiters
640 * in on_global_recover after the transaction completes.
641 */
642 if (waiting_for_degraded_object.count(soid))
643 return true;
9f95a23c 644 if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
7c673cae 645 return true;
9f95a23c
TL
646 ceph_assert(!get_acting_recovery_backfill().empty());
647 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
648 i != get_acting_recovery_backfill().end();
7c673cae
FG
649 ++i) {
650 if (*i == get_primary()) continue;
651 pg_shard_t peer = *i;
9f95a23c 652 auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
11fdf7f2
TL
653 // If an object is missing on an async_recovery_target, return false.
654 // This will not block the op and the object is async recovered later.
9f95a23c 655 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
11fdf7f2 656 peer_missing_entry->second.get_items().count(soid)) {
9f95a23c 657 if (is_async_recovery_target(peer))
11fdf7f2
TL
658 continue;
659 else
660 return true;
661 }
7c673cae
FG
662 // Object is degraded if after last_backfill AND
663 // we are backfilling it
9f95a23c
TL
664 if (is_backfill_target(peer) &&
665 recovery_state.get_peer_info(peer).last_backfill <= soid &&
7c673cae
FG
666 last_backfill_started >= soid &&
667 backfills_in_flight.count(soid))
668 return true;
669 }
670 return false;
671}
672
11fdf7f2
TL
673bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
674{
9f95a23c
TL
675 for (auto &i: get_async_recovery_targets()) {
676 auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
677 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
11fdf7f2
TL
678 peer_missing_entry->second.get_items().count(soid)) {
679 dout(30) << __func__ << " " << soid << dendl;
680 return true;
681 }
682 }
683 return false;
684}
685
7c673cae
FG
/**
 * Queue @p op behind a degraded object, kicking recovery of the object
 * first so the wait is bounded.
 */
void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
  osd->logger->inc(l_osd_op_delayed_degraded);
}
695
/**
 * Block a write to @p _oid because the cache tier is full; the op waits
 * until the cache has space again.  Blocking is tracked per head object.
 */
void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
	   << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}
706
224ce89b
WB
/**
 * Queue @p op until the PG goes clean so the primary can repair @p oid;
 * the op is requeued from waiting_for_clean_to_primary_repair later.
 */
void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
	   << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}
715
7c673cae
FG
/**
 * Block writes to head object @p oid while the snap object @p obc is
 * being promoted for a rollback; ops on the snap object are queued too.
 */
void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
	   << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  /*
   * We block the head object here.
   *
   * Let's assume that there is racing read When the head object is being rollbacked.
   * Since the two different ops can trigger promote_object() with the same source,
   * infinite loop happens by canceling ops each other.
   * To avoid this, we block the head object during rollback.
   * So, the racing read will be blocked until the rollback is completed.
   * see also: https://tracker.ceph.com/issues/49726
   */
  ObjectContextRef head_obc = get_object_context(oid, false);
  head_obc->start_block();
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}
739
/**
 * Block writes to the head of @p snap while the snap object itself is
 * degraded; the op is queued as a degraded-object waiter.
 */
void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
	   << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}
750
11fdf7f2 751bool PrimaryLogPG::maybe_await_blocked_head(
7c673cae
FG
752 const hobject_t &hoid,
753 OpRequestRef op)
754{
755 ObjectContextRef obc;
756 obc = object_contexts.lookup(hoid.get_head());
757 if (obc) {
758 if (obc->is_blocked()) {
759 wait_for_blocked_object(obc->obs.oi.soid, op);
760 return true;
761 } else {
762 return false;
763 }
764 }
7c673cae
FG
765 return false;
766}
767
/// Queue @p op behind the blocked object @p soid until it is unblocked.
void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << *op->get_req() << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}
774
/**
 * If the PG log has grown past the configured force-recovery threshold
 * while degraded/recovering/backfilling, kick recovery of the oldest
 * missing object (locally or on any peer) to let the log trim.
 */
void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
		  PG_STATE_RECOVERY_WAIT |
		  PG_STATE_BACKFILLING |
		  PG_STATE_BACKFILL_WAIT |
		  PG_STATE_BACKFILL_TOOFULL))
    return;

  // only force once the log exceeds max_entries * factor
  if (recovery_state.get_pg_log().get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = recovery_state.get_pg_log().get_log().head.version;
  hobject_t soid;
  if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
    min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
    soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
       it != get_acting_recovery_backfill().end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = recovery_state.get_peer_missing().find(peer);
    if (it_missing != recovery_state.get_peer_missing().end() &&
	!it_missing->second.get_rmissing().empty()) {
      const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
	       << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
	min_version = min_obj->first;
	soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
821
9f95a23c
TL
/**
 * Check whether this PG is currently readable with respect to its lease
 * (readable_until).  Returns true if the op may proceed; otherwise the
 * op is queued on waiting_for_readable (or replied -EAGAIN on a
 * non-primary) and false is returned.  May transition the PG into the
 * LAGGY state.
 */
bool PrimaryLogPG::check_laggy(OpRequestRef& op)
{
  assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(),
		      SERVER_OCTOPUS));
  if (state_test(PG_STATE_WAIT)) {
    dout(10) << __func__ << " PG is WAIT state" << dendl;
  } else if (!state_test(PG_STATE_LAGGY)) {
    auto mnow = osd->get_mnow();
    auto ru = recovery_state.get_readable_until();
    if (mnow <= ru) {
      // not laggy
      return true;
    }
    dout(10) << __func__
	     << " mnow " << mnow
	     << " > readable_until " << ru << dendl;

    if (!is_primary()) {
      // non-primary can't go laggy; tell the client to retry elsewhere
      osd->reply_op_error(op, -EAGAIN);
      return false;
    }

    // go to laggy state
    state_set(PG_STATE_LAGGY);
    publish_stats_to_osd();
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_back(op);
  op->mark_delayed("waiting for readable");
  return false;
}
7c673cae 853
/**
 * Variant of check_laggy() used when requeueing an already-accepted op:
 * if the PG is WAIT or LAGGY, the op goes to the FRONT of
 * waiting_for_readable (preserving its original ordering) and false is
 * returned; otherwise the op may proceed.
 */
bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
{
  assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(),
		      SERVER_OCTOPUS));
  if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
    return true; // not laggy
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_front(op);
  op->mark_delayed("waiting for readable");
  return false;
}
866
/**
 * Re-evaluate the WAIT/LAGGY readability states against the current
 * monotonic clock; clear whichever no longer applies, publish stats on
 * any change, and requeue waiting ops once neither state remains.
 */
void PrimaryLogPG::recheck_readable()
{
  if (!is_wait() && !is_laggy()) {
    dout(20) << __func__ << " wasn't wait or laggy" << dendl;
    return;
  }
  auto mnow = osd->get_mnow();
  bool pub = false;
  if (is_wait()) {
    auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
    if (mnow < prior_readable_until_ub) {
      dout(10) << __func__ << " still wait (mnow " << mnow
	       << " < prior_readable_until_ub " << prior_readable_until_ub
	       << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer wait (mnow " << mnow
	       << " >= prior_readable_until_ub " << prior_readable_until_ub
	       << ")" << dendl;
      state_clear(PG_STATE_WAIT);
      recovery_state.clear_prior_readable_until_ub();
      pub = true;
    }
  }
  if (is_laggy()) {
    auto ru = recovery_state.get_readable_until();
    if (ru == ceph::signedspan::zero()) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
	       << ", readable_until zero)" << dendl;
    } else if (mnow >= ru) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
	       << " >= readable_until " << ru << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer laggy (mnow " << mnow
	       << " < readable_until " << ru << ")" << dendl;
      state_clear(PG_STATE_LAGGY);
      pub = true;
    }
  }
  if (pub) {
    publish_stats_to_osd();
  }
  if (!is_laggy() && !is_wait()) {
    requeue_ops(waiting_for_readable);
  }
}
912
9f95a23c 913bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
7c673cae
FG
914{
915 bufferlist bl;
916
917 // If filter has expressed an interest in an xattr, load it.
9f95a23c 918 if (!filter.get_xattr().empty()) {
7c673cae
FG
919 int ret = pgbackend->objects_get_attr(
920 sobj,
9f95a23c 921 filter.get_xattr(),
7c673cae 922 &bl);
9f95a23c 923 dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
7c673cae 924 if (ret < 0) {
9f95a23c 925 if (ret != -ENODATA || filter.reject_empty_xattr()) {
7c673cae
FG
926 return false;
927 }
928 }
929 }
930
9f95a23c 931 return filter.filter(sobj, bl);
7c673cae
FG
932}
933
9f95a23c
TL
934std::pair<int, std::unique_ptr<const PGLSFilter>>
935PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
7c673cae
FG
936{
937 string type;
9f95a23c
TL
938 // storing non-const PGLSFilter for the sake of ::init()
939 std::unique_ptr<PGLSFilter> filter;
7c673cae
FG
940
941 try {
11fdf7f2 942 decode(type, iter);
7c673cae 943 }
f67539c2 944 catch (ceph::buffer::error& e) {
9f95a23c 945 return { -EINVAL, nullptr };
7c673cae
FG
946 }
947
9f95a23c
TL
948 if (type.compare("plain") == 0) {
949 filter = std::make_unique<PGLSPlainFilter>();
7c673cae 950 } else {
20effc67 951 std::size_t dot = type.find('.');
7c673cae 952 if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
9f95a23c 953 return { -EINVAL, nullptr };
7c673cae
FG
954 }
955
956 const std::string class_name = type.substr(0, dot);
957 const std::string filter_name = type.substr(dot + 1);
958 ClassHandler::ClassData *cls = NULL;
9f95a23c 959 int r = ClassHandler::get_instance().open_class(class_name, &cls);
7c673cae
FG
960 if (r != 0) {
961 derr << "Error opening class '" << class_name << "': "
962 << cpp_strerror(r) << dendl;
f67539c2 963 if (r != -EPERM) // propagate permission error
7c673cae 964 r = -EINVAL;
9f95a23c 965 return { r, nullptr };
7c673cae 966 } else {
11fdf7f2 967 ceph_assert(cls);
7c673cae
FG
968 }
969
970 ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
971 if (class_filter == NULL) {
972 derr << "Error finding filter '" << filter_name << "' in class "
973 << class_name << dendl;
9f95a23c 974 return { -EINVAL, nullptr };
7c673cae 975 }
9f95a23c 976 filter.reset(class_filter->fn());
7c673cae
FG
977 if (!filter) {
978 // Object classes are obliged to return us something, but let's
979 // give an error rather than asserting out.
980 derr << "Buggy class " << class_name << " failed to construct "
981 "filter " << filter_name << dendl;
9f95a23c 982 return { -EINVAL, nullptr };
7c673cae
FG
983 }
984 }
985
11fdf7f2 986 ceph_assert(filter);
7c673cae
FG
987 int r = filter->init(iter);
988 if (r < 0) {
989 derr << "Error initializing filter " << type << ": "
990 << cpp_strerror(r) << dendl;
9f95a23c 991 return { -EINVAL, nullptr };
7c673cae
FG
992 } else {
993 // Successfully constructed and initialized, return it.
9f95a23c 994 return std::make_pair(0, std::move(filter));
7c673cae
FG
995 }
996}
997
998
999// ==========================================================
1000
9f95a23c
TL
1001void PrimaryLogPG::do_command(
1002 const string_view& orig_prefix,
1003 const cmdmap_t& cmdmap,
1004 const bufferlist& idata,
1005 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae 1006{
7c673cae 1007 string format;
9f95a23c 1008 cmd_getval(cmdmap, "format", format);
1e59de90 1009 auto f(Formatter::create_unique(format, "json-pretty", "json-pretty"));
9f95a23c
TL
1010 int ret = 0;
1011 stringstream ss; // stderr error message stream
1012 bufferlist outbl; // if empty at end, we'll dump formatter as output
1013
1014 // get final prefix:
1015 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
1016 // - ceph tell <pgid> foo -> prefix=foo
1017 string prefix(orig_prefix);
7c673cae 1018 string command;
9f95a23c
TL
1019 cmd_getval(cmdmap, "cmd", command);
1020 if (command.size()) {
1021 prefix = command;
1022 }
1023
1024 if (prefix == "query") {
7c673cae 1025 f->open_object_section("pg");
7c673cae 1026 f->dump_stream("snap_trimq") << snap_trimq;
b32b8144 1027 f->dump_unsigned("snap_trimq_len", snap_trimq.size());
9f95a23c 1028 recovery_state.dump_peering_state(f.get());
7c673cae
FG
1029
1030 f->open_array_section("recovery_state");
1031 handle_query_state(f.get());
1032 f->close_section();
1033
f67539c2 1034 if (is_primary() && is_active() && m_scrubber) {
20effc67 1035 m_scrubber->dump_scrubber(f.get(), m_planned_scrub);
f67539c2
TL
1036 }
1037
7c673cae
FG
1038 f->open_object_section("agent_state");
1039 if (agent_state)
1040 agent_state->dump(f.get());
1041 f->close_section();
1042
1043 f->close_section();
7c673cae 1044 }
1e59de90
TL
1045 else if (prefix == "log") {
1046
1047 f->open_object_section("op_log");
1048 f->open_object_section("pg_log_t");
1049 recovery_state.get_pg_log().get_log().dump(f.get());
1050 f->close_section();
1051 f->close_section();
1052 }
9f95a23c 1053 else if (prefix == "mark_unfound_lost") {
7c673cae 1054 string mulcmd;
9f95a23c 1055 cmd_getval(cmdmap, "mulcmd", mulcmd);
7c673cae
FG
1056 int mode = -1;
1057 if (mulcmd == "revert") {
11fdf7f2 1058 if (pool.info.is_erasure()) {
7c673cae 1059 ss << "mode must be 'delete' for ec pool";
9f95a23c
TL
1060 ret = -EINVAL;
1061 goto out;
7c673cae
FG
1062 }
1063 mode = pg_log_entry_t::LOST_REVERT;
1064 } else if (mulcmd == "delete") {
1065 mode = pg_log_entry_t::LOST_DELETE;
1066 } else {
1067 ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
9f95a23c
TL
1068 ret = -EINVAL;
1069 goto out;
7c673cae 1070 }
11fdf7f2 1071 ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
9f95a23c 1072 mode == pg_log_entry_t::LOST_DELETE);
7c673cae
FG
1073
1074 if (!is_primary()) {
1075 ss << "not primary";
9f95a23c
TL
1076 ret = -EROFS;
1077 goto out;
7c673cae
FG
1078 }
1079
9f95a23c 1080 uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
7c673cae
FG
1081 if (!unfound) {
1082 ss << "pg has no unfound objects";
9f95a23c 1083 goto out; // make command idempotent
7c673cae
FG
1084 }
1085
9f95a23c 1086 if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
7c673cae
FG
1087 ss << "pg has " << unfound
1088 << " unfound objects but we haven't probed all sources, not marking lost";
9f95a23c
TL
1089 ret = -EINVAL;
1090 goto out;
7c673cae
FG
1091 }
1092
9f95a23c
TL
1093 mark_all_unfound_lost(mode, on_finish);
1094 return;
7c673cae 1095 }
9f95a23c
TL
1096
1097 else if (prefix == "list_unfound") {
7c673cae
FG
1098 hobject_t offset;
1099 string offset_json;
11fdf7f2 1100 bool show_offset = false;
9f95a23c 1101 if (cmd_getval(cmdmap, "offset", offset_json)) {
7c673cae
FG
1102 json_spirit::Value v;
1103 try {
1104 if (!json_spirit::read(offset_json, v))
1105 throw std::runtime_error("bad json");
1106 offset.decode(v);
1107 } catch (std::runtime_error& e) {
1108 ss << "error parsing offset: " << e.what();
9f95a23c
TL
1109 ret = -EINVAL;
1110 goto out;
7c673cae 1111 }
11fdf7f2 1112 show_offset = true;
7c673cae
FG
1113 }
1114 f->open_object_section("missing");
11fdf7f2 1115 if (show_offset) {
7c673cae
FG
1116 f->open_object_section("offset");
1117 offset.dump(f.get());
1118 f->close_section();
1119 }
9f95a23c
TL
1120 auto &needs_recovery_map = recovery_state.get_missing_loc()
1121 .get_needs_recovery();
11fdf7f2 1122 f->dump_int("num_missing", needs_recovery_map.size());
7c673cae 1123 f->dump_int("num_unfound", get_num_unfound());
7c673cae
FG
1124 map<hobject_t, pg_missing_item>::const_iterator p =
1125 needs_recovery_map.upper_bound(offset);
1126 {
1127 f->open_array_section("objects");
1128 int32_t num = 0;
9f95a23c
TL
1129 for (; p != needs_recovery_map.end() &&
1130 num < cct->_conf->osd_command_max_records;
1131 ++p) {
1132 if (recovery_state.get_missing_loc().is_unfound(p->first)) {
7c673cae
FG
1133 f->open_object_section("object");
1134 {
1135 f->open_object_section("oid");
1136 p->first.dump(f.get());
1137 f->close_section();
1138 }
1139 p->second.dump(f.get()); // have, need keys
1140 {
1141 f->open_array_section("locations");
9f95a23c
TL
1142 for (auto &&r : recovery_state.get_missing_loc().get_locations(
1143 p->first)) {
1144 f->dump_stream("shard") << r;
1145 }
7c673cae
FG
1146 f->close_section();
1147 }
1148 f->close_section();
1149 num++;
1150 }
1151 }
1152 f->close_section();
1153 }
f67539c2
TL
1154 // Get possible locations of missing objects from pg information
1155 PeeringState::QueryUnfound q(f.get());
1156 recovery_state.handle_event(q, 0);
7c673cae
FG
1157 f->dump_bool("more", p != needs_recovery_map.end());
1158 f->close_section();
7c673cae
FG
1159 }
1160
9f95a23c
TL
1161 else if (prefix == "scrub" ||
1162 prefix == "deep_scrub") {
1163 bool deep = (prefix == "deep_scrub");
20effc67 1164 int64_t time = cmd_getval_or<int64_t>(cmdmap, "time", 0);
9f95a23c
TL
1165
1166 if (is_primary()) {
1167 const pg_pool_t *p = &pool.info;
1168 double pool_scrub_max_interval = 0;
1169 double scrub_max_interval;
1170 if (deep) {
1171 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
1172 scrub_max_interval = pool_scrub_max_interval > 0 ?
1173 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
1174 } else {
1175 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
1176 scrub_max_interval = pool_scrub_max_interval > 0 ?
1177 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
1178 }
1179 // Instead of marking must_scrub force a schedule scrub
1180 utime_t stamp = ceph_clock_now();
1181 if (time == 0)
1182 stamp -= scrub_max_interval;
1183 else
1184 stamp -= (float)time;
1185 stamp -= 100.0; // push back last scrub more for good measure
1186 if (deep) {
1187 set_last_deep_scrub_stamp(stamp);
9f95a23c 1188 }
20effc67 1189 set_last_scrub_stamp(stamp); // for 'deep' as well, as we use this value to order scrubs
9f95a23c
TL
1190 f->open_object_section("result");
1191 f->dump_bool("deep", deep);
1192 f->dump_stream("stamp") << stamp;
1193 f->close_section();
1194 } else {
1195 ss << "Not primary";
1196 ret = -EPERM;
1197 }
1198 outbl.append(ss.str());
1199 }
1200
20effc67
TL
1201 else if (prefix == "block" || prefix == "unblock" || prefix == "set" ||
1202 prefix == "unset") {
1203 string value;
1204 cmd_getval(cmdmap, "value", value);
1205
1206 if (is_primary()) {
1207 ret = m_scrubber->asok_debug(prefix, value, f.get(), ss);
1208 f->open_object_section("result");
1209 f->dump_bool("success", true);
1210 f->close_section();
1211 } else {
1212 ss << "Not primary";
1213 ret = -EPERM;
1214 }
1215 outbl.append(ss.str());
1216 }
9f95a23c
TL
1217 else {
1218 ret = -ENOSYS;
1219 ss << "prefix '" << prefix << "' not implemented";
1220 }
1221
1222 out:
1223 if (ret >= 0 && outbl.length() == 0) {
1224 f->flush(outbl);
1225 }
1226 on_finish(ret, ss.str(), outbl);
7c673cae
FG
1227}
1228
9f95a23c 1229
7c673cae
FG
1230// ==========================================================
1231
1232void PrimaryLogPG::do_pg_op(OpRequestRef op)
1233{
9f95a23c 1234 const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
11fdf7f2 1235 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1236 dout(10) << "do_pg_op " << *m << dendl;
1237
1238 op->mark_started();
1239
1240 int result = 0;
1241 string cname, mname;
7c673cae
FG
1242
1243 snapid_t snapid = m->get_snapid();
1244
1245 vector<OSDOp> ops = m->ops;
1246
1247 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
9f95a23c 1248 std::unique_ptr<const PGLSFilter> filter;
7c673cae 1249 OSDOp& osd_op = *p;
11fdf7f2 1250 auto bp = p->indata.cbegin();
7c673cae
FG
1251 switch (p->op.op) {
1252 case CEPH_OSD_OP_PGNLS_FILTER:
1253 try {
11fdf7f2
TL
1254 decode(cname, bp);
1255 decode(mname, bp);
7c673cae 1256 }
f67539c2 1257 catch (const ceph::buffer::error& e) {
7c673cae
FG
1258 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1259 result = -EINVAL;
1260 break;
1261 }
9f95a23c 1262 std::tie(result, filter) = get_pgls_filter(bp);
7c673cae
FG
1263 if (result < 0)
1264 break;
1265
11fdf7f2 1266 ceph_assert(filter);
7c673cae
FG
1267
1268 // fall through
1269
1270 case CEPH_OSD_OP_PGNLS:
1271 if (snapid != CEPH_NOSNAP) {
1272 result = -EINVAL;
1273 break;
1274 }
1275 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1276 dout(10) << " pgnls pg=" << m->get_pg()
1277 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1278 << " != " << info.pgid << dendl;
1279 result = 0; // hmm?
1280 } else {
11fdf7f2
TL
1281 unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1282 p->op.pgls.count);
7c673cae 1283
11fdf7f2
TL
1284 dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
1285 << dendl;
7c673cae
FG
1286 // read into a buffer
1287 vector<hobject_t> sentries;
1288 pg_nls_response_t response;
1289 try {
11fdf7f2 1290 decode(response.handle, bp);
7c673cae 1291 }
f67539c2 1292 catch (const ceph::buffer::error& e) {
7c673cae
FG
1293 dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
1294 result = -EINVAL;
1295 break;
1296 }
1297
1298 hobject_t next;
1299 hobject_t lower_bound = response.handle;
1300 hobject_t pg_start = info.pgid.pgid.get_hobj_start();
1301 hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1302 dout(10) << " pgnls lower_bound " << lower_bound
1303 << " pg_end " << pg_end << dendl;
1304 if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
1305 (lower_bound != hobject_t() && lower_bound < pg_start))) {
1306 // this should only happen with a buggy client.
1307 dout(10) << "outside of PG bounds " << pg_start << " .. "
1308 << pg_end << dendl;
1309 result = -EINVAL;
1310 break;
1311 }
1312
1313 hobject_t current = lower_bound;
7c673cae
FG
1314 int r = pgbackend->objects_list_partial(
1315 current,
1316 list_size,
1317 list_size,
1318 &sentries,
1319 &next);
1320 if (r != 0) {
1321 result = -EINVAL;
1322 break;
1323 }
1324
1325 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
9f95a23c 1326 recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
7c673cae
FG
1327 vector<hobject_t>::iterator ls_iter = sentries.begin();
1328 hobject_t _max = hobject_t::get_max();
1329 while (1) {
1330 const hobject_t &mcand =
9f95a23c 1331 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
7c673cae
FG
1332 _max :
1333 missing_iter->first;
1334 const hobject_t &lcand =
1335 ls_iter == sentries.end() ?
1336 _max :
1337 *ls_iter;
1338
1339 hobject_t candidate;
1340 if (mcand == lcand) {
1341 candidate = mcand;
1342 if (!mcand.is_max()) {
1343 ++ls_iter;
1344 ++missing_iter;
1345 }
1346 } else if (mcand < lcand) {
1347 candidate = mcand;
11fdf7f2 1348 ceph_assert(!mcand.is_max());
7c673cae
FG
1349 ++missing_iter;
1350 } else {
1351 candidate = lcand;
11fdf7f2 1352 ceph_assert(!lcand.is_max());
7c673cae
FG
1353 ++ls_iter;
1354 }
1355
1356 dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
11fdf7f2
TL
1357 << " vs lower bound 0x" << lower_bound.get_hash()
1358 << std::dec << dendl;
7c673cae
FG
1359
1360 if (candidate >= next) {
1361 break;
1362 }
1363
1364 if (response.entries.size() == list_size) {
1365 next = candidate;
1366 break;
1367 }
1368
7c673cae
FG
1369 if (candidate.snap != CEPH_NOSNAP)
1370 continue;
1371
1372 // skip internal namespace
1373 if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
1374 continue;
1375
9f95a23c 1376 if (recovery_state.get_missing_loc().is_deleted(candidate))
c07f9fc5
FG
1377 continue;
1378
7c673cae
FG
1379 // skip wrong namespace
1380 if (m->get_hobj().nspace != librados::all_nspaces &&
1381 candidate.get_namespace() != m->get_hobj().nspace)
1382 continue;
1383
9f95a23c 1384 if (filter && !pgls_filter(*filter, candidate))
7c673cae
FG
1385 continue;
1386
1387 dout(20) << "pgnls item 0x" << std::hex
1388 << candidate.get_hash()
1389 << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
1390 << std::dec << " "
1391 << candidate.oid.name << dendl;
1392
1393 librados::ListObjectImpl item;
1394 item.nspace = candidate.get_namespace();
1395 item.oid = candidate.oid.name;
1396 item.locator = candidate.get_key();
1397 response.entries.push_back(item);
1398 }
1399
1400 if (next.is_max() &&
9f95a23c 1401 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
7c673cae
FG
1402 ls_iter == sentries.end()) {
1403 result = 1;
1404
1405 // Set response.handle to the start of the next PG according
1406 // to the object sort order.
1407 response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1408 } else {
1409 response.handle = next;
1410 }
1411 dout(10) << "pgnls handle=" << response.handle << dendl;
11fdf7f2 1412 encode(response, osd_op.outdata);
7c673cae
FG
1413 dout(10) << " pgnls result=" << result << " outdata.length()="
1414 << osd_op.outdata.length() << dendl;
1415 }
1416 break;
1417
1418 case CEPH_OSD_OP_PGLS_FILTER:
1419 try {
11fdf7f2
TL
1420 decode(cname, bp);
1421 decode(mname, bp);
7c673cae 1422 }
f67539c2 1423 catch (const ceph::buffer::error& e) {
7c673cae
FG
1424 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1425 result = -EINVAL;
1426 break;
1427 }
9f95a23c 1428 std::tie(result, filter) = get_pgls_filter(bp);
7c673cae
FG
1429 if (result < 0)
1430 break;
1431
11fdf7f2 1432 ceph_assert(filter);
7c673cae
FG
1433
1434 // fall through
1435
1436 case CEPH_OSD_OP_PGLS:
1437 if (snapid != CEPH_NOSNAP) {
1438 result = -EINVAL;
1439 break;
1440 }
1441 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1442 dout(10) << " pgls pg=" << m->get_pg()
1443 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1444 << " != " << info.pgid << dendl;
1445 result = 0; // hmm?
1446 } else {
11fdf7f2
TL
1447 unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1448 p->op.pgls.count);
7c673cae
FG
1449
1450 dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
1451 // read into a buffer
1452 vector<hobject_t> sentries;
1453 pg_ls_response_t response;
1454 try {
11fdf7f2 1455 decode(response.handle, bp);
7c673cae 1456 }
f67539c2 1457 catch (const ceph::buffer::error& e) {
7c673cae
FG
1458 dout(0) << "unable to decode PGLS handle in " << *m << dendl;
1459 result = -EINVAL;
1460 break;
1461 }
1462
1463 hobject_t next;
1464 hobject_t current = response.handle;
7c673cae
FG
1465 int r = pgbackend->objects_list_partial(
1466 current,
1467 list_size,
1468 list_size,
1469 &sentries,
1470 &next);
1471 if (r != 0) {
1472 result = -EINVAL;
1473 break;
1474 }
1475
9f95a23c 1476 ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());
7c673cae
FG
1477
1478 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
9f95a23c 1479 recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
7c673cae
FG
1480 vector<hobject_t>::iterator ls_iter = sentries.begin();
1481 hobject_t _max = hobject_t::get_max();
1482 while (1) {
1483 const hobject_t &mcand =
9f95a23c 1484 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
7c673cae
FG
1485 _max :
1486 missing_iter->first;
1487 const hobject_t &lcand =
1488 ls_iter == sentries.end() ?
1489 _max :
1490 *ls_iter;
1491
1492 hobject_t candidate;
1493 if (mcand == lcand) {
1494 candidate = mcand;
1495 if (!mcand.is_max()) {
1496 ++ls_iter;
1497 ++missing_iter;
1498 }
1499 } else if (mcand < lcand) {
1500 candidate = mcand;
11fdf7f2 1501 ceph_assert(!mcand.is_max());
7c673cae
FG
1502 ++missing_iter;
1503 } else {
1504 candidate = lcand;
11fdf7f2 1505 ceph_assert(!lcand.is_max());
7c673cae
FG
1506 ++ls_iter;
1507 }
1508
1509 if (candidate >= next) {
1510 break;
1511 }
f67539c2 1512
7c673cae
FG
1513 if (response.entries.size() == list_size) {
1514 next = candidate;
1515 break;
1516 }
1517
7c673cae
FG
1518 if (candidate.snap != CEPH_NOSNAP)
1519 continue;
1520
1521 // skip wrong namespace
1522 if (candidate.get_namespace() != m->get_hobj().nspace)
1523 continue;
1524
9f95a23c 1525 if (recovery_state.get_missing_loc().is_deleted(candidate))
c07f9fc5
FG
1526 continue;
1527
9f95a23c 1528 if (filter && !pgls_filter(*filter, candidate))
7c673cae
FG
1529 continue;
1530
1531 response.entries.push_back(make_pair(candidate.oid,
1532 candidate.get_key()));
1533 }
1534 if (next.is_max() &&
9f95a23c 1535 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
7c673cae
FG
1536 ls_iter == sentries.end()) {
1537 result = 1;
1538 }
1539 response.handle = next;
11fdf7f2 1540 encode(response, osd_op.outdata);
7c673cae
FG
1541 dout(10) << " pgls result=" << result << " outdata.length()="
1542 << osd_op.outdata.length() << dendl;
1543 }
1544 break;
1545
1546 case CEPH_OSD_OP_PG_HITSET_LS:
1547 {
1548 list< pair<utime_t,utime_t> > ls;
1549 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1550 p != info.hit_set.history.end();
1551 ++p)
1552 ls.push_back(make_pair(p->begin, p->end));
1553 if (hit_set)
1554 ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
11fdf7f2 1555 encode(ls, osd_op.outdata);
7c673cae
FG
1556 }
1557 break;
1558
1559 case CEPH_OSD_OP_PG_HITSET_GET:
1560 {
1561 utime_t stamp(osd_op.op.hit_set_get.stamp);
1562 if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
1563 // read the current in-memory HitSet, not the version we've
1564 // checkpointed.
1565 if (!hit_set) {
1566 result= -ENOENT;
1567 break;
1568 }
11fdf7f2 1569 encode(*hit_set, osd_op.outdata);
7c673cae
FG
1570 result = osd_op.outdata.length();
1571 } else {
1572 // read an archived HitSet.
1573 hobject_t oid;
1574 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1575 p != info.hit_set.history.end();
1576 ++p) {
1577 if (stamp >= p->begin && stamp <= p->end) {
1578 oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
1579 break;
1580 }
1581 }
1582 if (oid == hobject_t()) {
1583 result = -ENOENT;
1584 break;
1585 }
1586 if (!pool.info.is_replicated()) {
1587 // FIXME: EC not supported yet
1588 result = -EOPNOTSUPP;
1589 break;
1590 }
1591 if (is_unreadable_object(oid)) {
1592 wait_for_unreadable_object(oid, op);
7c673cae
FG
1593 return;
1594 }
1595 result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
1596 }
1597 }
1598 break;
1599
1600 case CEPH_OSD_OP_SCRUBLS:
1601 result = do_scrub_ls(m, &osd_op);
1602 break;
1603
1604 default:
1605 result = -EINVAL;
1606 break;
1607 }
1608
1609 if (result < 0)
1610 break;
1611 }
1612
1613 // reply
11fdf7f2 1614 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
7c673cae
FG
1615 CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
1616 false);
1617 reply->claim_op_out_data(ops);
1618 reply->set_result(result);
1619 reply->set_reply_versions(info.last_update, info.last_user_version);
1620 osd->send_message_osd_client(reply, m->get_connection());
7c673cae
FG
1621}
1622
9f95a23c 1623int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
7c673cae
FG
1624{
1625 if (m->get_pg() != info.pgid.pgid) {
1626 dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1627 return -EINVAL; // hmm?
1628 }
11fdf7f2 1629 auto bp = osd_op->indata.cbegin();
7c673cae
FG
1630 scrub_ls_arg_t arg;
1631 try {
1632 arg.decode(bp);
f67539c2 1633 } catch (ceph::buffer::error&) {
7c673cae
FG
1634 dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1635 return -EINVAL;
1636 }
f67539c2 1637
7c673cae
FG
1638 int r = 0;
1639 scrub_ls_result_t result = {.interval = info.history.same_interval_since};
f67539c2 1640
7c673cae
FG
1641 if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1642 r = -EAGAIN;
7c673cae 1643 } else {
f67539c2
TL
1644 bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
1645 if (store_queried) {
1646 encode(result, osd_op->outdata);
1647 } else {
1648 // the scrubber's store is not initialized
1649 r = -ENOENT;
1650 }
7c673cae 1651 }
f67539c2 1652
7c673cae
FG
1653 return r;
1654}
1655
20effc67
TL
1656/**
1657 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
1658 *
1659 * @param ctx [in,out] ctx to get locks for
1660 * @return true on success, false if we are queued
1661 */
1662bool PrimaryLogPG::get_rw_locks(bool write_ordered, OpContext *ctx)
1663{
1664 /* If head_obc, !obc->obs->exists and we will always take the
1665 * snapdir lock *before* the head lock. Since all callers will do
1666 * this (read or write) if we get the first we will be guaranteed
1667 * to get the second.
1668 */
1669 if (write_ordered && ctx->op->may_read()) {
1670 ctx->lock_type = RWState::RWEXCL;
1671 } else if (write_ordered) {
1672 ctx->lock_type = RWState::RWWRITE;
1673 } else {
1674 ceph_assert(ctx->op->may_read());
1675 ctx->lock_type = RWState::RWREAD;
1676 }
1677
1678 if (ctx->head_obc) {
1679 ceph_assert(!ctx->obc->obs.exists);
1680 if (!ctx->lock_manager.get_lock_type(
1681 ctx->lock_type,
1682 ctx->head_obc->obs.oi.soid,
1683 ctx->head_obc,
1684 ctx->op)) {
1685 ctx->lock_type = RWState::RWNONE;
1686 return false;
1687 }
1688 }
1689 if (ctx->lock_manager.get_lock_type(
1690 ctx->lock_type,
1691 ctx->obc->obs.oi.soid,
1692 ctx->obc,
1693 ctx->op)) {
1694 return true;
1695 } else {
1696 ceph_assert(!ctx->head_obc);
1697 ctx->lock_type = RWState::RWNONE;
1698 return false;
1699 }
1700}
1701
f67539c2
TL
1702/**
1703 * Releases locks
1704 *
1705 * @param manager [in] manager with locks to release
1706 */
1707void PrimaryLogPG::release_object_locks(
1708 ObcLockManager &lock_manager) {
1709 std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
1710 bool requeue_recovery = false;
1711 bool requeue_snaptrim = false;
1712 lock_manager.put_locks(
1713 &to_req,
1714 &requeue_recovery,
1715 &requeue_snaptrim);
1716 if (requeue_recovery)
1717 queue_recovery();
1718 if (requeue_snaptrim)
1719 snap_trimmer_machine.process_event(TrimWriteUnblocked());
1720
1721 if (!to_req.empty()) {
1722 // requeue at front of scrub blocking queue if we are blocked by scrub
1723 for (auto &&p: to_req) {
1724 if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
1725 for (auto& op : p.second) {
1726 op->mark_delayed("waiting for scrub");
1727 }
1728
1729 waiting_for_scrub.splice(
1730 waiting_for_scrub.begin(),
1731 p.second,
1732 p.second.begin(),
1733 p.second.end());
1734 } else if (is_laggy()) {
1735 for (auto& op : p.second) {
1736 op->mark_delayed("waiting for readable");
1737 }
1738 waiting_for_readable.splice(
1739 waiting_for_readable.begin(),
1740 p.second,
1741 p.second.begin(),
1742 p.second.end());
1743 } else {
1744 requeue_ops(p.second);
1745 }
1746 }
1747 }
1748}
1749
7c673cae 1750PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
11fdf7f2
TL
1751 const PGPool &_pool,
1752 const map<string,string>& ec_profile, spg_t p) :
7c673cae
FG
1753 PG(o, curmap, _pool, p),
1754 pgbackend(
1755 PGBackend::build_pg_backend(
11fdf7f2 1756 _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
7c673cae 1757 object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
7c673cae
FG
1758 new_backfill(false),
1759 temp_seq(0),
1760 snap_trimmer_machine(this)
f67539c2 1761{
9f95a23c 1762 recovery_state.set_backend_predicates(
7c673cae
FG
1763 pgbackend->get_is_readable_predicate(),
1764 pgbackend->get_is_recoverable_predicate());
1765 snap_trimmer_machine.initiate();
f67539c2
TL
1766
1767 m_scrubber = make_unique<PrimaryLogScrub>(this);
7c673cae
FG
1768}
1769
20effc67
TL
1770PrimaryLogPG::~PrimaryLogPG()
1771{
1772 m_scrubber.reset();
1773}
1774
7c673cae
FG
1775void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1776{
1777 src_oloc = oloc;
1778 if (oloc.key.empty())
1779 src_oloc.key = oid.name;
1780}
1781
1782void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1783{
9f95a23c
TL
1784 auto m = op->get_req<MOSDBackoff>();
1785 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7c673cae
FG
1786 if (!session)
1787 return; // drop it.
7c673cae
FG
1788 hobject_t begin = info.pgid.pgid.get_hobj_start();
1789 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1790 if (begin < m->begin) {
1791 begin = m->begin;
1792 }
1793 if (end > m->end) {
1794 end = m->end;
1795 }
1796 dout(10) << __func__ << " backoff ack id " << m->id
1797 << " [" << begin << "," << end << ")" << dendl;
1798 session->ack_backoff(cct, m->pgid, m->id, begin, end);
1799}
1800
1801void PrimaryLogPG::do_request(
1802 OpRequestRef& op,
1803 ThreadPool::TPHandle &handle)
1804{
1805 if (op->osd_trace) {
1806 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1807 op->pg_trace.event("do request");
1808 }
20effc67 1809
20effc67 1810
f67539c2 1811// make sure we have a new enough map
7c673cae
FG
1812 auto p = waiting_for_map.find(op->get_source());
1813 if (p != waiting_for_map.end()) {
1814 // preserve ordering
1815 dout(20) << __func__ << " waiting_for_map "
1816 << p->first << " not empty, queueing" << dendl;
1817 p->second.push_back(op);
1818 op->mark_delayed("waiting_for_map not empty");
1819 return;
1820 }
1821 if (!have_same_or_newer_map(op->min_epoch)) {
1822 dout(20) << __func__ << " min " << op->min_epoch
1823 << ", queue on waiting_for_map " << op->get_source() << dendl;
1824 waiting_for_map[op->get_source()].push_back(op);
1825 op->mark_delayed("op must wait for map");
181888fb 1826 osd->request_osdmap_update(op->min_epoch);
7c673cae
FG
1827 return;
1828 }
1829
1830 if (can_discard_request(op)) {
1831 return;
1832 }
1833
1834 // pg-wide backoffs
1835 const Message *m = op->get_req();
11fdf7f2 1836 int msg_type = m->get_type();
7c673cae 1837 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
9f95a23c 1838 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7c673cae
FG
1839 if (!session)
1840 return; // drop it.
11fdf7f2 1841 if (msg_type == CEPH_MSG_OSD_OP) {
7c673cae
FG
1842 if (session->check_backoff(cct, info.pgid,
1843 info.pgid.pgid.get_hobj_start(), m)) {
1844 return;
1845 }
1846
1847 bool backoff =
1848 is_down() ||
1849 is_incomplete() ||
1850 (!is_active() && is_peered());
11fdf7f2 1851 if (g_conf()->osd_backoff_on_peering && !backoff) {
7c673cae
FG
1852 if (is_peering()) {
1853 backoff = true;
1854 }
1855 }
1856 if (backoff) {
1857 add_pg_backoff(session);
1858 return;
1859 }
1860 }
1861 // pg backoff acks at pg-level
11fdf7f2 1862 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
7c673cae
FG
1863 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1864 if (ba->begin != ba->end) {
1865 handle_backoff(op);
1866 return;
1867 }
1868 }
1869 }
1870
7c673cae
FG
1871 if (!is_peered()) {
1872 // Delay unless PGBackend says it's ok
1873 if (pgbackend->can_handle_while_inactive(op)) {
1874 bool handled = pgbackend->handle_message(op);
11fdf7f2 1875 ceph_assert(handled);
7c673cae
FG
1876 return;
1877 } else {
1878 waiting_for_peered.push_back(op);
1879 op->mark_delayed("waiting for peered");
1880 return;
1881 }
1882 }
1883
9f95a23c 1884 if (recovery_state.needs_flush()) {
1e59de90 1885 dout(20) << "waiting for flush on " << *op->get_req() << dendl;
b32b8144
FG
1886 waiting_for_flush.push_back(op);
1887 op->mark_delayed("waiting for flush");
1888 return;
1889 }
1890
9f95a23c 1891 ceph_assert(is_peered() && !recovery_state.needs_flush());
7c673cae
FG
1892 if (pgbackend->handle_message(op))
1893 return;
1894
11fdf7f2 1895 switch (msg_type) {
7c673cae
FG
1896 case CEPH_MSG_OSD_OP:
1897 case CEPH_MSG_OSD_BACKOFF:
1898 if (!is_active()) {
1e59de90
TL
1899 dout(20) << " peered, not active, waiting for active on "
1900 << *op->get_req() << dendl;
7c673cae
FG
1901 waiting_for_active.push_back(op);
1902 op->mark_delayed("waiting for active");
1903 return;
1904 }
11fdf7f2 1905 switch (msg_type) {
7c673cae
FG
1906 case CEPH_MSG_OSD_OP:
1907 // verify client features
1908 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1909 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1910 osd->reply_op_error(op, -EOPNOTSUPP);
1911 return;
1912 }
1913 do_op(op);
1914 break;
1915 case CEPH_MSG_OSD_BACKOFF:
1916 // object-level backoff acks handled in osdop context
1917 handle_backoff(op);
1918 break;
1919 }
1920 break;
1921
7c673cae
FG
1922 case MSG_OSD_PG_SCAN:
1923 do_scan(op, handle);
1924 break;
1925
1926 case MSG_OSD_PG_BACKFILL:
1927 do_backfill(op);
1928 break;
1929
1930 case MSG_OSD_PG_BACKFILL_REMOVE:
1931 do_backfill_remove(op);
1932 break;
1933
1934 case MSG_OSD_SCRUB_RESERVE:
1935 {
f67539c2
TL
1936 if (!m_scrubber) {
1937 osd->reply_op_error(op, -EAGAIN);
1938 return;
1939 }
9f95a23c 1940 auto m = op->get_req<MOSDScrubReserve>();
7c673cae
FG
1941 switch (m->type) {
1942 case MOSDScrubReserve::REQUEST:
f67539c2 1943 m_scrubber->handle_scrub_reserve_request(op);
7c673cae
FG
1944 break;
1945 case MOSDScrubReserve::GRANT:
f67539c2 1946 m_scrubber->handle_scrub_reserve_grant(op, m->from);
7c673cae
FG
1947 break;
1948 case MOSDScrubReserve::REJECT:
f67539c2 1949 m_scrubber->handle_scrub_reserve_reject(op, m->from);
7c673cae
FG
1950 break;
1951 case MOSDScrubReserve::RELEASE:
f67539c2 1952 m_scrubber->handle_scrub_reserve_release(op);
7c673cae
FG
1953 break;
1954 }
1955 }
1956 break;
1957
1958 case MSG_OSD_REP_SCRUB:
1959 replica_scrub(op, handle);
1960 break;
1961
1962 case MSG_OSD_REP_SCRUBMAP:
1963 do_replica_scrub_map(op);
1964 break;
1965
1966 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1967 do_update_log_missing(op);
1968 break;
1969
1970 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1971 do_update_log_missing_reply(op);
1972 break;
1973
1974 default:
11fdf7f2 1975 ceph_abort_msg("bad message type in do_request");
7c673cae
FG
1976 }
1977}
1978
7c673cae
FG
1979/** do_op - do an op
 1980 * pg lock will be held (if multithreaded)
 1981 * osd_lock NOT held.
 1982 */
1983void PrimaryLogPG::do_op(OpRequestRef& op)
1984{
  // Client-op admission pipeline: validate the message, apply backoff /
  // misdirection / full / dup checks, resolve the object context, divert
  // to the manifest or cache-tier machinery when applicable, then build an
  // OpContext and call execute_ctx().  Most failing branches either reply
  // with an error or park the op on a waiting_* list and return; parked
  // ops are requeued later when the blocking condition clears.
11fdf7f2 1985 FUNCTRACE(cct);
7c673cae
FG
1986 // NOTE: take a non-const pointer here; we must be careful not to
1987 // change anything that will break other reads on m (operator<<).
1988 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
11fdf7f2 1989 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1990 if (m->finish_decode()) {
1991 op->reset_desc(); // for TrackedOp
1992 m->clear_payload();
1993 }
1994
1995 dout(20) << __func__ << ": op " << *m << dendl;
1996
9f95a23c 1997 const hobject_t head = m->get_hobj().get_head();
7c673cae
FG
1998
  // Defensive check: the target object must hash into this PG (taking the
  // current pg_num split bits into account).  A mismatch indicates a
  // misdirected op; asserts only when osd_debug_misdirected_ops is set.
1999 if (!info.pgid.pgid.contains(
2000 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
2001 derr << __func__ << " " << info.pgid.pgid << " does not contain "
2002 << head << " pg_num " << pool.info.get_pg_num() << " hash "
2003 << std::hex << head.get_hash() << std::dec << dendl;
2004 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
2005 << " op " << *m;
11fdf7f2 2006 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
7c673cae
FG
2007 return;
2008 }
2009
  // Backoff machinery only applies to clients that advertise the
  // RADOS_BACKOFF feature; such clients carry a Session we can consult.
2010 bool can_backoff =
2011 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
9f95a23c 2012 ceph::ref_t<Session> session;
7c673cae 2013 if (can_backoff) {
11fdf7f2 2014 session = static_cast<Session*>(m->get_connection()->get_priv().get());
7c673cae
FG
2015 if (!session.get()) {
2016 dout(10) << __func__ << " no session" << dendl;
2017 return;
2018 }
7c673cae
FG
2019
2020 if (session->check_backoff(cct, info.pgid, head, m)) {
2021 return;
2022 }
2023 }
2024
2025 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
2026 // not implemented.
2027 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
2028 osd->reply_op_error(op, -EINVAL);
2029 return;
2030 }
2031
9f95a23c
TL
2032 {
2033 int r = op->maybe_init_op_info(*get_osdmap());
7c673cae
FG
2034 if (r) {
2035 osd->reply_op_error(op, r);
2036 return;
2037 }
2038 }
2039
  // Misdirection check: pure reads flagged BALANCE/LOCALIZE may be served
  // by a replica (primary or nonprimary); all other ops must hit the
  // primary or are bounced via handle_misdirected_op().
2040 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
2041 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
2042 op->may_read() &&
2043 !(op->may_write() || op->may_cache())) {
2044 // balanced reads; any replica will do
9f95a23c 2045 if (!(is_primary() || is_nonprimary())) {
7c673cae
FG
2046 osd->handle_misdirected_op(this, op);
2047 return;
2048 }
2049 } else {
2050 // normal case; must be primary
2051 if (!is_primary()) {
2052 osd->handle_misdirected_op(this, op);
2053 return;
2054 }
2055 }
2056
9f95a23c
TL
2057 if (!check_laggy(op)) {
2058 return;
2059 }
2060
7c673cae
FG
2061 if (!op_has_sufficient_caps(op)) {
2062 osd->reply_op_error(op, -EPERM);
2063 return;
2064 }
2065
31f18b77
FG
2066 if (op->includes_pg_op()) {
2067 return do_pg_op(op);
2068 }
2069
7c673cae
FG
  // Reject names/keys/namespaces that exceed the configured limits before
  // doing any further work; these are hard -ENAMETOOLONG / -EINVAL errors.
2070 // object name too long?
2071 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
2072 dout(4) << "do_op name is longer than "
2073 << cct->_conf->osd_max_object_name_len
2074 << " bytes" << dendl;
2075 osd->reply_op_error(op, -ENAMETOOLONG);
2076 return;
2077 }
2078 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
2079 dout(4) << "do_op locator is longer than "
2080 << cct->_conf->osd_max_object_name_len
2081 << " bytes" << dendl;
2082 osd->reply_op_error(op, -ENAMETOOLONG);
2083 return;
2084 }
2085 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
2086 dout(4) << "do_op namespace is longer than "
2087 << cct->_conf->osd_max_object_namespace_len
2088 << " bytes" << dendl;
2089 osd->reply_op_error(op, -ENAMETOOLONG);
2090 return;
2091 }
494da23a
TL
2092 if (m->get_hobj().oid.name.empty()) {
2093 dout(4) << "do_op empty oid name is not allowed" << dendl;
2094 osd->reply_op_error(op, -EINVAL);
2095 return;
2096 }
7c673cae
FG
2097
2098 if (int r = osd->store->validate_hobject_key(head)) {
2099 dout(4) << "do_op object " << head << " invalid for backing store: "
2100 << r << dendl;
2101 osd->reply_op_error(op, r);
2102 return;
2103 }
2104
f67539c2
TL
2105 // blocklisted?
2106 if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
2107 dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
2108 osd->reply_op_error(op, -EBLOCKLISTED);
7c673cae
FG
2109 return;
2110 }
2111
2112 // order this op as a write?
2113 bool write_ordered = op->rwordered();
2114
2115 // discard due to cluster full transition? (we discard any op that
2116 // originates before the cluster or pool is marked full; the client
2117 // will resend after the full flag is removed or if they expect the
2118 // op to succeed despite being full). The except is FULL_FORCE and
2119 // FULL_TRY ops, which there is no reason to discard because they
2120 // bypass all full checks anyway. If this op isn't write or
2121 // read-ordered, we skip.
2122 // FIXME: we exclude mds writes for now.
2123 if (write_ordered && !(m->get_source().is_mds() ||
2124 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
2125 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
2126 info.history.last_epoch_marked_full > m->get_map_epoch()) {
2127 dout(10) << __func__ << " discarding op sent before full " << m << " "
2128 << *m << dendl;
2129 return;
2130 }
2131 // mds should have stopped writing before this point.
2132 // We can't allow OSD to become non-startable even if mds
2133 // could be writing as part of file removals.
f67539c2 2134 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
11fdf7f2
TL
2135 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
2136 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
7c673cae
FG
2137 return;
2138 }
  // Pool-level EIO flag: either silently drop (clients advertising
  // SUPPORTSPOOLEIO synthesize the EIO themselves) or reply -EIO here.
2139 int64_t poolid = get_pgid().pool();
20effc67
TL
2140 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2141 if (!pi) {
2142 return;
2143 }
2144 if (pi->has_flag(pg_pool_t::FLAG_EIO)) {
2145 // drop op on the floor; the client will handle returning EIO
2146 if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) {
2147 dout(10) << __func__ << " discarding op due to pool EIO flag" << dendl;
2148 } else {
2149 dout(10) << __func__ << " replying EIO due to pool EIO flag" << dendl;
2150 osd->reply_op_error(op, -EIO);
7c673cae 2151 }
20effc67
TL
2152 return;
2153 }
2154 if (op->may_write()) {
7c673cae
FG
2155
2156 // invalid?
2157 if (m->get_snapid() != CEPH_NOSNAP) {
2158 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2159 osd->reply_op_error(op, -EINVAL);
2160 return;
2161 }
2162
2163 // too big?
2164 if (cct->_conf->osd_max_write_size &&
2165 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2166 // journal can't hold commit!
2167 derr << "do_op msg data len " << m->get_data_len()
2168 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2169 << " on " << *m << dendl;
2170 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2171 return;
2172 }
2173 }
2174
2175 dout(10) << "do_op " << *m
2176 << (op->may_write() ? " may_write" : "")
2177 << (op->may_read() ? " may_read" : "")
2178 << (op->may_cache() ? " may_cache" : "")
2179 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2180 << " flags " << ceph_osd_flag_string(m->get_flags())
2181 << dendl;
2182
20effc67 2183
7c673cae
FG
2184 // missing object?
2185 if (is_unreadable_object(head)) {
224ce89b
WB
2186 if (!is_primary()) {
2187 osd->reply_op_error(op, -EAGAIN);
2188 return;
2189 }
  // Either push a backoff to the client (if enabled and supported) or
  // park the op until the head object becomes readable again.
7c673cae 2190 if (can_backoff &&
11fdf7f2 2191 (g_conf()->osd_backoff_on_degraded ||
9f95a23c
TL
2192 (g_conf()->osd_backoff_on_unfound &&
2193 recovery_state.get_missing_loc().is_unfound(head)))) {
7c673cae
FG
2194 add_backoff(session, head, head);
2195 maybe_kick_recovery(head);
2196 } else {
2197 wait_for_unreadable_object(head, op);
2198 }
2199 return;
2200 }
2201
11fdf7f2
TL
  // Write-ordered ops additionally wait on: degraded/backfilling head,
  // active scrub covering the object, laggy state, and per-object blocks
  // (degraded snap, in-flight snap promotion, cache-full).
2202 if (write_ordered) {
2203 // degraded object?
2204 if (is_degraded_or_backfilling_object(head)) {
2205 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2206 add_backoff(session, head, head);
2207 maybe_kick_recovery(head);
2208 } else {
2209 wait_for_degraded_object(head, op);
2210 }
2211 return;
7c673cae 2212 }
7c673cae 2213
f67539c2 2214 if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
11fdf7f2
TL
2215 dout(20) << __func__ << ": waiting for scrub" << dendl;
2216 waiting_for_scrub.push_back(op);
2217 op->mark_delayed("waiting for scrub");
2218 return;
2219 }
9f95a23c
TL
2220 if (!check_laggy_requeue(op)) {
2221 return;
2222 }
7c673cae 2223
11fdf7f2
TL
2224 // blocked on snap?
2225 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2226 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2227 hobject_t to_wait_on(head);
2228 to_wait_on.snap = blocked_iter->second;
2229 wait_for_degraded_object(to_wait_on, op);
2230 return;
2231 }
2232 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2233 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2234 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2235 return;
2236 }
2237 if (objects_blocked_on_cache_full.count(head)) {
2238 block_write_on_full_cache(head, op);
2239 return;
2240 }
7c673cae
FG
2241 }
2242
2243 // dup/resent?
2244 if (op->may_write() || op->may_cache()) {
2245 // warning: we will get back *a* request for this reqid, but not
2246 // necessarily the most recent. this happens with flush and
2247 // promote ops, but we can't possible have both in our log where
2248 // the original request is still not stable on disk, so for our
2249 // purposes here it doesn't matter which one we get.
2250 eversion_t version;
2251 version_t user_version;
2252 int return_code = 0;
9f95a23c 2253 vector<pg_log_op_return_item_t> op_returns;
7c673cae 2254 bool got = check_in_progress_op(
9f95a23c 2255 m->get_reqid(), &version, &user_version, &return_code, &op_returns);
7c673cae
FG
2256 if (got) {
2257 dout(3) << __func__ << " dup " << m->get_reqid()
2258 << " version " << version << dendl;
2259 if (already_complete(version)) {
9f95a23c 2260 osd->reply_op_error(op, return_code, version, user_version, op_returns);
7c673cae
FG
2261 } else {
2262 dout(10) << " waiting for " << version << " to commit" << dendl;
2263 // always queue ondisk waiters, so that we can requeue if needed
9f95a23c
TL
2264 waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2265 op_returns);
7c673cae
FG
2266 op->mark_delayed("waiting for ondisk");
2267 }
2268 return;
2269 }
2270 }
2271
2272 ObjectContextRef obc;
11fdf7f2 2273 bool can_create = op->may_write();
7c673cae 2274 hobject_t missing_oid;
11fdf7f2
TL
2275
2276 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
11fdf7f2 2277 const hobject_t& oid =
9f95a23c 2278 m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
11fdf7f2
TL
2279
2280 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2281 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2282 OSDOp& osd_op = *p;
2283
2284 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2285 if (m->get_snapid() != CEPH_SNAPDIR) {
2286 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2287 osd->reply_op_error(op, -EINVAL);
2288 return;
2289 }
2290 } else {
2291 if (m->get_snapid() == CEPH_SNAPDIR) {
2292 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2293 osd->reply_op_error(op, -EINVAL);
2294 return;
2295 }
2296 }
2297 }
7c673cae
FG
2298
2299 // io blocked on obc?
2300 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
11fdf7f2 2301 maybe_await_blocked_head(oid, op)) {
7c673cae
FG
2302 return;
2303 }
2304
9f95a23c
TL
2305 if (!is_primary()) {
2306 if (!recovery_state.can_serve_replica_read(oid)) {
f67539c2
TL
2307 dout(20) << __func__
2308 << ": unstable write on replica, bouncing to primary "
9f95a23c
TL
2309 << *m << dendl;
2310 osd->reply_op_error(op, -EAGAIN);
2311 return;
9f95a23c 2312 }
f67539c2
TL
2313 dout(20) << __func__ << ": serving replica read on oid " << oid
2314 << dendl;
9f95a23c
TL
2315 }
2316
7c673cae
FG
  // Resolve the object context; on failure a missing clone may be
  // reported back through missing_oid.
2317 int r = find_object_context(
2318 oid, &obc, can_create,
2319 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2320 &missing_oid);
2321
11fdf7f2
TL
2322 // LIST_SNAPS needs the ssc too
2323 if (obc &&
2324 m->get_snapid() == CEPH_SNAPDIR &&
2325 !obc->ssc) {
2326 obc->ssc = get_snapset_context(oid, true);
2327 }
2328
7c673cae
FG
2329 if (r == -EAGAIN) {
2330 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2331 // we have to wait for the object.
2332 if (is_primary()) {
2333 // missing the specific snap we need; requeue and wait.
11fdf7f2 2334 ceph_assert(!op->may_write()); // only happens on a read/cache
7c673cae
FG
2335 wait_for_unreadable_object(missing_oid, op);
2336 return;
2337 }
2338 } else if (r == 0) {
2339 if (is_unreadable_object(obc->obs.oi.soid)) {
2340 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2341 << " is unreadable, waiting" << dendl;
2342 wait_for_unreadable_object(obc->obs.oi.soid, op);
2343 return;
2344 }
2345
2346 // degraded object? (the check above was for head; this could be a clone)
2347 if (write_ordered &&
2348 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2349 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2350 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2351 << " is degraded, waiting" << dendl;
2352 wait_for_degraded_object(obc->obs.oi.soid, op);
2353 return;
2354 }
2355 }
2356
  // Hit-set accounting (cache tiering): record whether the object was
  // already tracked, insert it, and persist the hit set when full or
  // when the configured period elapses.
2357 bool in_hit_set = false;
2358 if (hit_set) {
2359 if (obc.get()) {
2360 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2361 in_hit_set = true;
2362 } else {
2363 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2364 in_hit_set = true;
2365 }
2366 if (!op->hitset_inserted) {
2367 hit_set->insert(oid);
2368 op->hitset_inserted = true;
2369 if (hit_set->is_full() ||
2370 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2371 hit_set_persist();
2372 }
2373 }
2374 }
2375
2376 if (agent_state) {
2377 if (agent_choose_mode(false, op))
2378 return;
2379 }
2380
20effc67 2381 if (obc.get() && obc->obs.exists) {
f67539c2
TL
2382 if (recover_adjacent_clones(obc, op)) {
2383 return;
2384 }
31f18b77
FG
2385 if (maybe_handle_manifest(op,
2386 write_ordered,
2387 obc))
2388 return;
2389 }
2390
7c673cae
FG
2391 if (maybe_handle_cache(op,
2392 write_ordered,
2393 obc,
2394 r,
2395 missing_oid,
2396 false,
2397 in_hit_set))
2398 return;
2399
2400 if (r && (r != -ENOENT || !obc)) {
2401 // copy the reqids for copy get on ENOENT
2402 if (r == -ENOENT &&
2403 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2404 fill_in_copy_get_noent(op, oid, m->ops[0]);
2405 return;
2406 }
224ce89b 2407 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
7c673cae 2408 if (op->may_write() &&
9f95a23c 2409 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
2410 record_write_error(op, oid, nullptr, r);
2411 } else {
2412 osd->reply_op_error(op, r);
2413 }
2414 return;
2415 }
2416
2417 // make sure locator is consistent
2418 object_locator_t oloc(obc->obs.oi.soid);
2419 if (m->get_object_locator() != oloc) {
f67539c2 2420 dout(10) << " provided locator " << m->get_object_locator()
7c673cae 2421 << " != object's " << obc->obs.oi.soid << dendl;
f67539c2 2422 osd->clog->warn() << "bad locator " << m->get_object_locator()
7c673cae
FG
2423 << " on object " << oloc
2424 << " op " << *m;
2425 }
2426
2427 // io blocked on obc?
2428 if (obc->is_blocked() &&
2429 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2430 wait_for_blocked_object(obc->obs.oi.soid, op);
2431 return;
2432 }
2433
2434 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2435
  // From here on we own an OpContext: every early-return path below must
  // release it via close_op_ctx() or reply_ctx().
c07f9fc5 2436 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
7c673cae 2437
7c673cae
FG
2438 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2439 dout(20) << __func__ << ": skipping rw locks" << dendl;
2440 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2441 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2442
2443 // verify there is in fact a flush in progress
2444 // FIXME: we could make this a stronger test.
2445 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2446 if (p == flush_ops.end()) {
2447 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2448 reply_ctx(ctx, -EINVAL);
2449 return;
2450 }
2451 } else if (!get_rw_locks(write_ordered, ctx)) {
2452 dout(20) << __func__ << " waiting for rw locks " << dendl;
2453 op->mark_delayed("waiting for rw locks");
2454 close_op_ctx(ctx);
2455 return;
2456 }
2457 dout(20) << __func__ << " obc " << *obc << dendl;
2458
2459 if (r) {
2460 dout(20) << __func__ << " returned an error: " << r << dendl;
7c673cae 2461 if (op->may_write() &&
9f95a23c
TL
2462 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2463 record_write_error(op, oid, nullptr, r,
2464 ctx->op->allows_returnvec() ? ctx : nullptr);
7c673cae
FG
2465 } else {
2466 osd->reply_op_error(op, r);
2467 }
9f95a23c 2468 close_op_ctx(ctx);
7c673cae
FG
2469 return;
2470 }
2471
2472 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2473 ctx->ignore_cache = true;
2474 }
2475
2476 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2477 // This object is lost. Reading from it returns an error.
2478 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2479 << " is lost" << dendl;
2480 reply_ctx(ctx, -ENFILE);
2481 return;
2482 }
2483 if (!op->may_write() &&
2484 !op->may_cache() &&
2485 (!obc->obs.exists ||
2486 ((m->get_snapid() != CEPH_SNAPDIR) &&
2487 obc->obs.oi.is_whiteout()))) {
2488 // copy the reqids for copy get on ENOENT
2489 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2490 fill_in_copy_get_noent(op, oid, m->ops[0]);
2491 close_op_ctx(ctx);
2492 return;
2493 }
2494 reply_ctx(ctx, -ENOENT);
2495 return;
2496 }
2497
2498 op->mark_started();
2499
  // All gating passed: execute the op, then record op-preparation latency
  // (overall plus per read/write/rw class).
2500 execute_ctx(ctx);
2501 utime_t prepare_latency = ceph_clock_now();
2502 prepare_latency -= op->get_dequeued_time();
2503 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2504 if (op->may_read() && op->may_write()) {
2505 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2506 } else if (op->may_read()) {
2507 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2508 } else if (op->may_write() || op->may_cache()) {
2509 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2510 }
2511
2512 // force recovery of the oldest missing object if too many logs
2513 maybe_force_recovery();
2514}
b32b8144 2515
31f18b77
FG
/*
 * Decide how to service an op that targets a manifest object (redirect or
 * chunked): proxy it, block it behind recovery/scrub/promotion, or return
 * NOOP so the regular do_op path continues.  Ops that manipulate the
 * manifest itself (SET_REDIRECT, SET_CHUNK, TIER_*, etc.) always fall
 * through as NOOP.
 */
2516PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2517 OpRequestRef op,
2518 bool write_ordered,
2519 ObjectContextRef obc)
2520{
20effc67
TL
2521 if (!obc) {
2522 dout(20) << __func__ << ": no obc " << dendl;
2523 return cache_result_t::NOOP;
2524 }
2525
2526 if (!obc->obs.oi.has_manifest()) {
2527 dout(20) << __func__ << ": " << obc->obs.oi.soid
2528 << " is not manifest object " << dendl;
2529 return cache_result_t::NOOP;
2530 }
9f95a23c 2531 if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
31f18b77
FG
2532 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2533 return cache_result_t::NOOP;
2534 }
2535
31f18b77 2536 // if it is write-ordered and blocked, stop now
11fdf7f2 2537 if (obc->is_blocked() && write_ordered) {
31f18b77
FG
2538 // we're already doing something with this object
2539 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2540 return cache_result_t::NOOP;
2541 }
2542
9f95a23c 2543 vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
31f18b77
FG
2544 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2545 OSDOp& osd_op = *p;
  // NOTE: the local 'op' below shadows the OpRequestRef parameter for the
  // remainder of this loop body; it is the raw ceph_osd_op.
2546 ceph_osd_op& op = osd_op.op;
11fdf7f2 2547 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
f67539c2 2548 op.op == CEPH_OSD_OP_SET_CHUNK ||
9f95a23c 2549 op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
f67539c2
TL
2550 op.op == CEPH_OSD_OP_TIER_PROMOTE ||
2551 op.op == CEPH_OSD_OP_TIER_FLUSH ||
20effc67
TL
2552 op.op == CEPH_OSD_OP_TIER_EVICT ||
2553 op.op == CEPH_OSD_OP_ISDIRTY) {
31f18b77
FG
2554 return cache_result_t::NOOP;
2555 }
2556 }
2557
  // Dispatch on the manifest type.
2558 switch (obc->obs.oi.manifest.type) {
2559 case object_manifest_t::TYPE_REDIRECT:
2560 if (op->may_write() || write_ordered) {
11fdf7f2 2561 do_proxy_write(op, obc);
31f18b77 2562 } else {
f67539c2 2563 // promoted object
11fdf7f2
TL
2564 if (obc->obs.oi.size != 0) {
2565 return cache_result_t::NOOP;
2566 }
31f18b77
FG
2567 do_proxy_read(op, obc);
2568 }
2569 return cache_result_t::HANDLED_PROXY;
f67539c2 2570 case object_manifest_t::TYPE_CHUNKED:
11fdf7f2
TL
2571 {
  // Prefer proxying chunked reads when possible; an in-flight flush on
  // the object forces the proxied op into "write" mode (true).
2572 if (can_proxy_chunked_read(op, obc)) {
2573 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2574 if (p != flush_ops.end()) {
2575 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2576 return cache_result_t::HANDLED_PROXY;
2577 }
2578 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2579 return cache_result_t::HANDLED_PROXY;
2580 }
2581
2582 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2583 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2584 hobject_t head = m->get_hobj();
2585
2586 if (is_degraded_or_backfilling_object(head)) {
2587 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2588 wait_for_degraded_object(head, op);
2589 return cache_result_t::BLOCKED_RECOVERY;
2590 }
2591
f67539c2 2592 if (m_scrubber->write_blocked_by_scrub(head)) {
11fdf7f2
TL
2593 dout(20) << __func__ << ": waiting for scrub" << dendl;
2594 waiting_for_scrub.push_back(op);
2595 op->mark_delayed("waiting for scrub");
2596 return cache_result_t::BLOCKED_RECOVERY;
2597 }
9f95a23c
TL
2598 if (!check_laggy_requeue(op)) {
2599 return cache_result_t::BLOCKED_RECOVERY;
2600 }
f67539c2 2601
11fdf7f2
TL
  // Any missing chunk forces a full promotion before the op can run.
2602 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2603 if (p.second.is_missing()) {
9f95a23c 2604 auto m = op->get_req<MOSDOp>();
11fdf7f2
TL
2605 const object_locator_t oloc = m->get_object_locator();
2606 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2607 return cache_result_t::BLOCKED_PROMOTE;
2608 }
2609 }
11fdf7f2
TL
2610 return cache_result_t::NOOP;
2611 }
31f18b77 2612 default:
11fdf7f2 2613 ceph_abort_msg("unrecognized manifest type");
31f18b77
FG
2614 }
2615
2616 return cache_result_t::NOOP;
2617}
7c673cae
FG
2618
/*
 * Record a failed write (error code r) as an ERROR entry in the PG log so
 * that a duplicate/resent request finds the result, then — once the log
 * entries are durable — deliver the reply to the client via the OnComplete
 * functor.  ctx_for_op_returns, when non-null, supplies per-op return
 * values (returnvec) to embed in the log entry.  Ownership of orig_reply's
 * reference is taken over by OnComplete; callers may pass nullptr
 * (do_op does) — presumably the completion path builds a reply in that
 * case; TODO confirm against submit_log_entries/OnComplete usage.
 */
2619void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
9f95a23c
TL
2620 MOSDOpReply *orig_reply, int r,
2621 OpContext *ctx_for_op_returns)
7c673cae
FG
2622{
2623 dout(20) << __func__ << " r=" << r << dendl;
11fdf7f2 2624 ceph_assert(op->may_write());
9f95a23c 2625 const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
31f18b77 2626 mempool::osd_pglog::list<pg_log_entry_t> entries;
7c673cae
FG
2627 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2628 get_next_version(), eversion_t(), 0,
2629 reqid, utime_t(), r));
9f95a23c
TL
2630 if (ctx_for_op_returns) {
2631 entries.back().set_op_returns(*ctx_for_op_returns->ops);
2632 dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2633 }
7c673cae
FG
2634
  // Completion callback: runs after the log entries commit and sends the
  // (adopted) reply back over the op's connection.
2635 struct OnComplete {
2636 PrimaryLogPG *pg;
2637 OpRequestRef op;
2638 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2639 int r;
2640 OnComplete(
2641 PrimaryLogPG *pg,
2642 OpRequestRef op,
2643 MOSDOpReply *orig_reply,
2644 int r)
2645 : pg(pg), op(op),
2646 orig_reply(orig_reply, false /* take over ref */), r(r)
2647 {}
2648 void operator()() {
2649 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
9f95a23c 2650 auto m = op->get_req<MOSDOp>();
7c673cae 2651 MOSDOpReply *reply = orig_reply.detach();
7c673cae
FG
2652 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2653 pg->osd->send_message_osd_client(reply, m->get_connection());
2654 }
2655 };
2656
2657 ObcLockManager lock_manager;
2658 submit_log_entries(
2659 entries,
2660 std::move(lock_manager),
9f95a23c 2661 std::optional<std::function<void(void)> >(
7c673cae
FG
2662 OnComplete(this, op, orig_reply, r)),
2663 op,
2664 r);
2665}
2666
/*
 * Cache-tier dispatch for an incoming op.  Based on pool.info.cache_mode
 * and the current object/agent state, decide whether the op is handled
 * here (proxied to the base tier, redirected, blocked on promotion or on
 * a full cache) or should continue down the normal do_op path (NOOP).
 *
 * r / missing_oid are the result of the caller's find_object_context();
 * must_promote forces promotion; promote_obc, if non-null, receives the
 * obc of an object whose promotion this op ends up blocked on.
 */
2667PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2668 OpRequestRef op,
2669 bool write_ordered,
2670 ObjectContextRef obc,
2671 int r, hobject_t missing_oid,
2672 bool must_promote,
2673 bool in_hit_set,
2674 ObjectContextRef *promote_obc)
2675{
b32b8144
FG
2676 // return quickly if caching is not enabled
2677 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2678 return cache_result_t::NOOP;
2679
7c673cae
FG
2680 if (op &&
2681 op->get_req() &&
2682 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
9f95a23c 2683 (op->get_req<MOSDOp>()->get_flags() &
7c673cae
FG
2684 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2685 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2686 return cache_result_t::NOOP;
2687 }
7c673cae
FG
2688
2689 must_promote = must_promote || op->need_promote();
2690
2691 if (obc)
2692 dout(25) << __func__ << " " << obc->obs.oi << " "
2693 << (obc->obs.exists ? "exists" : "DNE")
2694 << " missing_oid " << missing_oid
2695 << " must_promote " << (int)must_promote
2696 << " in_hit_set " << (int)in_hit_set
2697 << dendl;
2698 else
2699 dout(25) << __func__ << " (no obc)"
2700 << " missing_oid " << missing_oid
2701 << " must_promote " << (int)must_promote
2702 << " in_hit_set " << (int)in_hit_set
2703 << dendl;
2704
2705 // if it is write-ordered and blocked, stop now
2706 if (obc.get() && obc->is_blocked() && write_ordered) {
2707 // we're already doing something with this object
2708 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2709 return cache_result_t::NOOP;
2710 }
2711
2712 if (r == -ENOENT && missing_oid == hobject_t()) {
2713 // we know this object is logically absent (e.g., an undefined clone)
2714 return cache_result_t::NOOP;
2715 }
2716
  // Object is present in the cache tier: a plain cache hit, nothing to do.
2717 if (obc.get() && obc->obs.exists) {
2718 osd->logger->inc(l_osd_op_cache_hit);
2719 return cache_result_t::NOOP;
2720 }
b32b8144
FG
2721 if (!is_primary()) {
2722 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2723 osd->reply_op_error(op, -EAGAIN);
2724 return cache_result_t::REPLIED_WITH_EAGAIN;
2725 }
7c673cae
FG
2726
2727 if (missing_oid == hobject_t() && obc.get()) {
2728 missing_oid = obc->obs.oi.soid;
2729 }
2730
9f95a23c 2731 auto m = op->get_req<MOSDOp>();
7c673cae
FG
2732 const object_locator_t oloc = m->get_object_locator();
2733
2734 if (op->need_skip_handle_cache()) {
2735 return cache_result_t::NOOP;
2736 }
2737
7c673cae
FG
2738 OpRequestRef promote_op;
2739
  // Per-cache-mode policy for a cache miss.
2740 switch (pool.info.cache_mode) {
2741 case pg_pool_t::CACHEMODE_WRITEBACK:
  // When the agent reports the cache is full, reads are proxied and
  // writes are blocked until space is available.
2742 if (agent_state &&
2743 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2744 if (!op->may_write() && !op->may_cache() &&
2745 !write_ordered && !must_promote) {
2746 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2747 do_proxy_read(op);
2748 return cache_result_t::HANDLED_PROXY;
2749 }
2750 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2751 block_write_on_full_cache(missing_oid, op);
2752 return cache_result_t::BLOCKED_FULL;
2753 }
2754
2755 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2756 promote_object(obc, missing_oid, oloc, op, promote_obc);
2757 return cache_result_t::BLOCKED_PROMOTE;
2758 }
2759
  // Otherwise proxy the op, optionally kicking off a background
  // promotion subject to the pool's recency thresholds.
2760 if (op->may_write() || op->may_cache()) {
11fdf7f2 2761 do_proxy_write(op);
7c673cae
FG
2762
2763 // Promote too?
f67539c2 2764 if (!op->need_skip_promote() &&
7c673cae
FG
2765 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2766 pool.info.min_write_recency_for_promote,
2767 OpRequestRef(),
2768 promote_obc)) {
2769 return cache_result_t::BLOCKED_PROMOTE;
2770 }
2771 return cache_result_t::HANDLED_PROXY;
2772 } else {
2773 do_proxy_read(op);
2774
2775 // Avoid duplicate promotion
2776 if (obc.get() && obc->is_blocked()) {
2777 if (promote_obc)
2778 *promote_obc = obc;
2779 return cache_result_t::BLOCKED_PROMOTE;
2780 }
2781
2782 // Promote too?
2783 if (!op->need_skip_promote()) {
2784 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2785 pool.info.min_read_recency_for_promote,
2786 promote_op, promote_obc);
2787 }
2788
2789 return cache_result_t::HANDLED_PROXY;
2790 }
11fdf7f2 2791 ceph_abort_msg("unreachable");
7c673cae
FG
2792 return cache_result_t::NOOP;
2793
7c673cae
FG
2794 case pg_pool_t::CACHEMODE_READONLY:
2795 // TODO: clean this case up
2796 if (!obc.get() && r == -ENOENT) {
2797 // we don't have the object and op's a read
2798 promote_object(obc, missing_oid, oloc, op, promote_obc);
2799 return cache_result_t::BLOCKED_PROMOTE;
2800 }
2801 if (!r) { // it must be a write
2802 do_cache_redirect(op);
2803 return cache_result_t::HANDLED_REDIRECT;
2804 }
2805 // crap, there was a failure of some kind
2806 return cache_result_t::NOOP;
2807
9f95a23c
TL
2808 case pg_pool_t::CACHEMODE_FORWARD:
2809 // this mode is deprecated; proxy instead
7c673cae
FG
2810 case pg_pool_t::CACHEMODE_PROXY:
2811 if (!must_promote) {
2812 if (op->may_write() || op->may_cache() || write_ordered) {
11fdf7f2
TL
2813 do_proxy_write(op);
2814 return cache_result_t::HANDLED_PROXY;
7c673cae
FG
2815 } else {
2816 do_proxy_read(op);
2817 return cache_result_t::HANDLED_PROXY;
2818 }
2819 }
2820 // ugh, we're forced to promote.
2821 if (agent_state &&
2822 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2823 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2824 block_write_on_full_cache(missing_oid, op);
2825 return cache_result_t::BLOCKED_FULL;
2826 }
2827 promote_object(obc, missing_oid, oloc, op, promote_obc);
2828 return cache_result_t::BLOCKED_PROMOTE;
2829
9f95a23c
TL
2830 case pg_pool_t::CACHEMODE_READFORWARD:
2831 // this mode is deprecated; proxy instead
7c673cae
FG
2832 case pg_pool_t::CACHEMODE_READPROXY:
2833 // Do writeback to the cache tier for writes
2834 if (op->may_write() || write_ordered || must_promote) {
2835 if (agent_state &&
2836 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2837 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2838 block_write_on_full_cache(missing_oid, op);
2839 return cache_result_t::BLOCKED_FULL;
2840 }
2841 promote_object(obc, missing_oid, oloc, op, promote_obc);
2842 return cache_result_t::BLOCKED_PROMOTE;
2843 }
2844
2845 // If it is a read, we can read, we need to proxy it
2846 do_proxy_read(op);
2847 return cache_result_t::HANDLED_PROXY;
2848
2849 default:
11fdf7f2 2850 ceph_abort_msg("unrecognized cache_mode");
7c673cae
FG
2851 }
2852 return cache_result_t::NOOP;
2853}
2854
2855bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2856 const hobject_t& missing_oid,
2857 const object_locator_t& oloc,
2858 bool in_hit_set,
2859 uint32_t recency,
2860 OpRequestRef promote_op,
2861 ObjectContextRef *promote_obc)
2862{
2863 dout(20) << __func__ << " missing_oid " << missing_oid
2864 << " in_hit_set " << in_hit_set << dendl;
2865
2866 switch (recency) {
2867 case 0:
2868 break;
2869 case 1:
2870 // Check if in the current hit set
2871 if (in_hit_set) {
2872 break;
2873 } else {
2874 // not promoting
2875 return false;
2876 }
2877 break;
2878 default:
2879 {
2880 unsigned count = (int)in_hit_set;
2881 if (count) {
2882 // Check if in other hit sets
2883 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2884 for (map<time_t,HitSetRef>::reverse_iterator itor =
2885 agent_state->hit_set_map.rbegin();
2886 itor != agent_state->hit_set_map.rend();
2887 ++itor) {
2888 if (!itor->second->contains(oid)) {
2889 break;
2890 }
2891 ++count;
2892 if (count >= recency) {
2893 break;
2894 }
2895 }
2896 }
2897 if (count >= recency) {
2898 break;
2899 }
2900 return false; // not promoting
2901 }
2902 break;
2903 }
2904
2905 if (osd->promote_throttle()) {
2906 dout(10) << __func__ << " promote throttled" << dendl;
2907 return false;
2908 }
2909 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2910 return true;
2911}
2912
2913void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2914{
9f95a23c 2915 auto m = op->get_req<MOSDOp>();
7c673cae 2916 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
11fdf7f2
TL
2917 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2918 flags, false);
7c673cae
FG
2919 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2920 reply->set_redirect(redir);
2921 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
1e59de90 2922 << *op->get_req() << dendl;
7c673cae
FG
2923 m->get_connection()->send_message(reply);
2924 return;
2925}
2926
2927struct C_ProxyRead : public Context {
2928 PrimaryLogPGRef pg;
2929 hobject_t oid;
2930 epoch_t last_peering_reset;
2931 ceph_tid_t tid;
2932 PrimaryLogPG::ProxyReadOpRef prdop;
2933 utime_t start;
2934 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2935 const PrimaryLogPG::ProxyReadOpRef& prd)
2936 : pg(p), oid(o), last_peering_reset(lpr),
2937 tid(0), prdop(prd), start(ceph_clock_now())
2938 {}
2939 void finish(int r) override {
2940 if (prdop->canceled)
2941 return;
9f95a23c 2942 std::scoped_lock locker{*pg};
7c673cae 2943 if (prdop->canceled) {
7c673cae
FG
2944 return;
2945 }
2946 if (last_peering_reset == pg->get_last_peering_reset()) {
2947 pg->finish_proxy_read(oid, tid, r);
2948 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2949 }
7c673cae
FG
2950 }
2951};
2952
11fdf7f2
TL
2953struct C_ProxyChunkRead : public Context {
2954 PrimaryLogPGRef pg;
2955 hobject_t oid;
2956 epoch_t last_peering_reset;
2957 ceph_tid_t tid;
2958 PrimaryLogPG::ProxyReadOpRef prdop;
2959 utime_t start;
2960 ObjectOperation *obj_op;
2961 int op_index = 0;
2962 uint64_t req_offset = 0;
2963 ObjectContextRef obc;
2964 uint64_t req_total_len = 0;
2965 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2966 const PrimaryLogPG::ProxyReadOpRef& prd)
2967 : pg(p), oid(o), last_peering_reset(lpr),
2968 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
2969 {}
2970 void finish(int r) override {
2971 if (prdop->canceled)
2972 return;
9f95a23c 2973 std::scoped_lock locker{*pg};
11fdf7f2 2974 if (prdop->canceled) {
11fdf7f2
TL
2975 return;
2976 }
2977 if (last_peering_reset == pg->get_last_peering_reset()) {
2978 if (r >= 0) {
2979 if (!prdop->ops[op_index].outdata.length()) {
2980 ceph_assert(req_total_len);
2981 bufferlist list;
2982 bufferptr bptr(req_total_len);
2983 list.push_back(std::move(bptr));
2984 prdop->ops[op_index].outdata.append(list);
2985 }
2986 ceph_assert(obj_op);
2987 uint64_t copy_offset;
2988 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
2989 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
2990 } else {
2991 copy_offset = 0;
2992 }
9f95a23c
TL
2993 prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
2994 obj_op->ops[0].outdata.length(),
2995 obj_op->ops[0].outdata.c_str());
f67539c2
TL
2996 }
2997
11fdf7f2
TL
2998 pg->finish_proxy_read(oid, tid, r);
2999 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3000 if (obj_op) {
3001 delete obj_op;
3002 }
3003 }
11fdf7f2
TL
3004 }
3005};
3006
31f18b77 3007void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
7c673cae
FG
3008{
3009 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3010 // stash the result in the request's OSDOp vector
3011 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
31f18b77
FG
3012 object_locator_t oloc;
3013 hobject_t soid;
3014 /* extensible tier */
3015 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3016 switch (obc->obs.oi.manifest.type) {
3017 case object_manifest_t::TYPE_REDIRECT:
3018 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
f67539c2 3019 soid = obc->obs.oi.manifest.redirect_target;
31f18b77 3020 break;
31f18b77 3021 default:
11fdf7f2 3022 ceph_abort_msg("unrecognized manifest type");
31f18b77
FG
3023 }
3024 } else {
3025 /* proxy */
3026 soid = m->get_hobj();
3027 oloc = object_locator_t(m->get_object_locator());
3028 oloc.pool = pool.info.tier_of;
3029 }
7c673cae
FG
3030 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3031
3032 // pass through some original flags that make sense.
3033 // - leave out redirection and balancing flags since we are
3034 // already proxying through the primary
3035 // - leave off read/write/exec flags that are derived from the op
3036 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3037 CEPH_OSD_FLAG_ORDERSNAP |
3038 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3039 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3040
3041 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3042
3043 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3044
3045 ObjectOperation obj_op;
3046 obj_op.dup(prdop->ops);
3047
3048 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3049 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3050 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3051 ceph_osd_op op = obj_op.ops[i].op;
3052 switch (op.op) {
3053 case CEPH_OSD_OP_READ:
3054 case CEPH_OSD_OP_SYNC_READ:
3055 case CEPH_OSD_OP_SPARSE_READ:
3056 case CEPH_OSD_OP_CHECKSUM:
c07f9fc5 3057 case CEPH_OSD_OP_CMPEXT:
7c673cae
FG
3058 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3059 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3060 }
3061 }
3062 }
3063
3064 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3065 prdop);
3066 ceph_tid_t tid = osd->objecter->read(
3067 soid.oid, oloc, obj_op,
3068 m->get_snapid(), NULL,
9f95a23c 3069 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
7c673cae
FG
3070 &prdop->user_version,
3071 &prdop->data_offset,
3072 m->get_features());
3073 fin->tid = tid;
3074 prdop->objecter_tid = tid;
3075 proxyread_ops[tid] = prdop;
3076 in_progress_proxy_ops[soid].push_back(op);
3077}
3078
// Completion path for a proxied read: validate that the objecter reply still
// matches an outstanding ProxyReadOp, unlink the request from the
// in-progress bookkeeping, and reply to the client via a read-only OpContext.
void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;

  // The op may have been cancelled (and erased) while the reply was in
  // flight; each of these early returns is a stale-reply guard.
  map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
  if (p == proxyread_ops.end()) {
    dout(10) << __func__ << " no proxyread_op found" << dendl;
    return;
  }
  ProxyReadOpRef prdop = p->second;
  if (tid != prdop->objecter_tid) {
    dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
	     << " tid " << prdop->objecter_tid << dendl;
    return;
  }
  if (oid != prdop->soid) {
    dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
	     << " soid " << prdop->soid << dendl;
    return;
  }
  proxyread_ops.erase(tid);

  map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
  if (q == in_progress_proxy_ops.end()) {
    dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
    return;
  }
  ceph_assert(q->second.size());
  list<OpRequestRef>::iterator it = std::find(q->second.begin(),
					      q->second.end(),
					      prdop->op);
  ceph_assert(it != q->second.end());
  // Remove exactly one occurrence of this request from the per-object list.
  OpRequestRef op = *it;
  q->second.erase(it);
  if (q->second.size() == 0) {
    in_progress_proxy_ops.erase(oid);
  } else if (std::find(q->second.begin(),
		       q->second.end(),
		       prdop->op) != q->second.end()) {
    /* multiple read case: the same request is still queued for another
     * chunk read on this object, so don't reply to the client yet */
    dout(20) << __func__ << " " << oid << " is not completed " << dendl;
    return;
  }

  osd->logger->inc(l_osd_tier_proxy_read);

  // Build a reply-only OpContext; complete_read_ctx() takes ownership and
  // sends the MOSDOpReply populated from the proxied results.
  auto m = op->get_req<MOSDOp>();
  OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
  ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
  ctx->user_at_version = prdop->user_version;
  ctx->data_off = prdop->data_offset;
  ctx->ignore_log_op_stats = true;
  complete_read_ctx(r, ctx);
}
3134
3135void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3136{
3137 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3138 if (p == in_progress_proxy_ops.end())
3139 return;
3140
3141 list<OpRequestRef>& ls = p->second;
3142 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3143 requeue_ops(ls);
3144 in_progress_proxy_ops.erase(p);
3145}
3146
94b18763
FG
3147void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3148 vector<ceph_tid_t> *tids)
7c673cae
FG
3149{
3150 dout(10) << __func__ << " " << prdop->soid << dendl;
3151 prdop->canceled = true;
3152
3153 // cancel objecter op, if we can
3154 if (prdop->objecter_tid) {
94b18763 3155 tids->push_back(prdop->objecter_tid);
7c673cae
FG
3156 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3157 prdop->ops[i].outdata.clear();
3158 }
3159 proxyread_ops.erase(prdop->objecter_tid);
3160 prdop->objecter_tid = 0;
3161 }
3162}
3163
94b18763 3164void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
3165{
3166 dout(10) << __func__ << dendl;
3167
3168 // cancel proxy reads
3169 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3170 while (p != proxyread_ops.end()) {
94b18763 3171 cancel_proxy_read((p++)->second, tids);
7c673cae
FG
3172 }
3173
3174 // cancel proxy writes
3175 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3176 while (q != proxywrite_ops.end()) {
94b18763 3177 cancel_proxy_write((q++)->second, tids);
7c673cae
FG
3178 }
3179
3180 if (requeue) {
3181 map<hobject_t, list<OpRequestRef>>::iterator p =
3182 in_progress_proxy_ops.begin();
3183 while (p != in_progress_proxy_ops.end()) {
3184 list<OpRequestRef>& ls = p->second;
3185 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3186 << " requests" << dendl;
3187 requeue_ops(ls);
3188 in_progress_proxy_ops.erase(p++);
3189 }
3190 } else {
3191 in_progress_proxy_ops.clear();
3192 }
3193}
3194
3195struct C_ProxyWrite_Commit : public Context {
3196 PrimaryLogPGRef pg;
3197 hobject_t oid;
3198 epoch_t last_peering_reset;
3199 ceph_tid_t tid;
3200 PrimaryLogPG::ProxyWriteOpRef pwop;
3201 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3202 const PrimaryLogPG::ProxyWriteOpRef& pw)
3203 : pg(p), oid(o), last_peering_reset(lpr),
3204 tid(0), pwop(pw)
3205 {}
3206 void finish(int r) override {
3207 if (pwop->canceled)
3208 return;
9f95a23c 3209 std::scoped_lock locker{*pg};
7c673cae 3210 if (pwop->canceled) {
7c673cae
FG
3211 return;
3212 }
3213 if (last_peering_reset == pg->get_last_peering_reset()) {
3214 pg->finish_proxy_write(oid, tid, r);
3215 }
7c673cae
FG
3216 }
3217};
3218
// Proxy a mutation through to the base tier (or a manifest redirect target)
// without promoting the object into the cache tier.
void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
{
  // NOTE: non-const because ProxyWriteOp takes a mutable ref
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  object_locator_t oloc;
  SnapContext snapc(m->get_snap_seq(), m->get_snaps());
  hobject_t soid;
  /* extensible tier: a redirect manifest sends the write to its target */
  if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
    switch (obc->obs.oi.manifest.type) {
      case object_manifest_t::TYPE_REDIRECT:
	  oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
	  soid = obc->obs.oi.manifest.redirect_target;
	  break;
      default:
	ceph_abort_msg("unrecognized manifest type");
    }
  } else {
    /* plain cache-tier proxy: same object, base pool */
    soid = m->get_hobj();
    oloc = object_locator_t(m->get_object_locator());
    oloc.pool = pool.info.tier_of;
  }

  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
  // NOTE(review): RWORDERED is added only when the op is *not* a
  // write/cache op (i.e. a write-ordered read being proxied down this
  // path) — looks intentional but worth confirming against callers.
  if (!(op->may_write() || op->may_cache())) {
    flags |= CEPH_OSD_FLAG_RWORDERED;
  }
  // Preserve per-op return vectors if the client asked for them.
  if (op->allows_returnvec()) {
    flags |= CEPH_OSD_FLAG_RETURNVEC;
  }

  dout(10) << __func__ << " Start proxy write for " << *m << dendl;

  ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
  // The OpContext is used later, when the proxied write commits, to reply
  // to the client.
  pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
  pwop->mtime = m->get_mtime();

  ObjectOperation obj_op;
  obj_op.dup(pwop->ops);

  C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
    this, soid, get_last_peering_reset(), pwop);
  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, oloc, obj_op, snapc,
    ceph::real_clock::from_ceph_timespec(pwop->mtime),
    flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
    &pwop->user_version, pwop->reqid);
  fin->tid = tid;
  pwop->objecter_tid = tid;
  // Track the in-flight proxy write so it can be completed or cancelled.
  proxywrite_ops[tid] = pwop;
  in_progress_proxy_ops[soid].push_back(op);
}
3272
// Proxy a read of a chunked (manifest) object: split each extent op into
// per-chunk reads against the chunk targets recorded in the manifest's
// chunk_map, and issue one proxied read per chunk.
void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
				       ObjectContextRef obc, bool write_ordered)
{
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  OSDOp *osd_op = NULL;
  for (unsigned int i = 0; i < m->ops.size(); i++) {
    osd_op = &m->ops[i];
    uint64_t cursor = osd_op->op.extent.offset;
    uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
    uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
    object_manifest_t *manifest = &obc->obs.oi.manifest;
    // cursor -> { chunk start offset -> length to read from that chunk }
    map <uint64_t, map<uint64_t, uint64_t>> chunk_read;

    // Walk the requested extent, mapping each position onto the chunk that
    // covers it.
    while (cursor < op_length) {
      chunk_index = 0;
      chunk_length = 0;
      /* find the right chunk position for cursor */
      for (auto &p : manifest->chunk_map) {
	if (p.first <= cursor && p.first + p.second.length > cursor) {
	  chunk_length = p.second.length;
	  chunk_index = p.first;
	  break;
	}
      }
      /* no index: cursor is past the last mapped chunk */
      if (!chunk_index && !chunk_length) {
	if (cursor == osd_op->op.extent.offset) {
	  // Nothing at all is mapped for this extent: reply immediately
	  // with an empty (zero-length) read.
	  OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
	  ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
	  ctx->data_off = osd_op->op.extent.offset;
	  ctx->ignore_log_op_stats = true;
	  complete_read_ctx(0, ctx);
	}
	break;
      }
      // Clip the per-chunk read length to both the op's extent and the
      // chunk's own boundary.
      uint64_t next_length = chunk_length;
      /* the size to read -> | op length | */
      /*                     | a chunk | */
      if (cursor + next_length > op_length) {
	next_length = op_length - cursor;
      }
      /* the size to read -> | op length   | */
      /*                     |    a chunk | */
      if (cursor + next_length > chunk_index + chunk_length) {
	next_length = chunk_index + chunk_length - cursor;
      }

      chunk_read[cursor] = {{chunk_index, next_length}};
      cursor += next_length;
    }

    // Total bytes actually covered by mapped chunks for this op.
    req_len = cursor - osd_op->op.extent.offset;
    for (auto &p : chunk_read) {
      auto chunks = p.second.begin();
      dout(20) << __func__ << " chunk_index: " << chunks->first
	       << " next_length: " << chunks->second << " cursor: "
	       << p.first << dendl;
      do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
    }
  }
}
3334
3335struct RefCountCallback : public Context {
3336public:
11fdf7f2
TL
3337 PrimaryLogPG::OpContext *ctx;
3338 OSDOp& osd_op;
9f95a23c 3339 bool requeue = false;
f67539c2 3340
9f95a23c
TL
3341 RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3342 : ctx(ctx), osd_op(osd_op) {}
11fdf7f2 3343 void finish(int r) override {
9f95a23c
TL
3344 // NB: caller must already have pg->lock held
3345 ctx->obc->stop_block();
3346 ctx->pg->kick_object_context_blocked(ctx->obc);
3347 if (r >= 0) {
3348 osd_op.rval = 0;
3349 ctx->pg->execute_ctx(ctx);
3350 } else {
3351 // on cancel simply toss op out,
3352 // or requeue as requested
3353 if (r != -ECANCELED) {
3354 if (ctx->op)
3355 ctx->pg->osd->reply_op_error(ctx->op, r);
3356 } else if (requeue) {
3357 if (ctx->op)
3358 ctx->pg->requeue_op(ctx->op);
11fdf7f2 3359 }
9f95a23c 3360 ctx->pg->close_op_ctx(ctx);
11fdf7f2 3361 }
9f95a23c
TL
3362 }
3363 void set_requeue(bool rq) {
3364 requeue = rq;
11fdf7f2
TL
3365 }
3366};
3367
// OpFinisher for set-manifest ops: the real work happens asynchronously via
// RefCountCallback, which stores the outcome in osd_op.rval; execute() just
// reports that stored result when the op is re-run.
struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
  OSDOp& osd_op;

  explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
  }

  int execute() override {
    return osd_op.rval;
  }
};
3378
// Completion for one chunk's INCREMENT_REF during set-chunk: forwards the
// result (keyed by tid and chunk offset) back to the PG under its lock.
struct C_SetManifestRefCountDone : public Context {
  PrimaryLogPGRef pg;
  hobject_t soid;      // source (manifest) object
  uint64_t offset;     // chunk offset this refcount op belongs to
  ceph_tid_t tid = 0;  // objecter tid, filled in by the submitter
  C_SetManifestRefCountDone(PrimaryLogPG *p,
			    hobject_t soid, uint64_t offset) :
    pg(p), soid(soid), offset(offset) {}
  void finish(int r) override {
    // -ECANCELED means cancel_manifest_ops() already cleaned up.
    if (r == -ECANCELED)
      return;
    std::scoped_lock locker{*pg};
    pg->finish_set_manifest_refcount(soid, r, tid, offset);
  }
};
3394
3395struct C_SetDedupChunks : public Context {
3396 PrimaryLogPGRef pg;
3397 hobject_t oid;
3398 epoch_t last_peering_reset;
3399 ceph_tid_t tid;
3400 uint64_t offset;
3401
3402 C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
3403 : pg(p), oid(o), last_peering_reset(lpr),
3404 tid(0), offset(offset)
3405 {}
3406 void finish(int r) override {
3407 if (r == -ECANCELED)
3408 return;
3409 std::scoped_lock locker{*pg};
3410 if (last_peering_reset != pg->get_last_peering_reset()) {
3411 return;
3412 }
3413 pg->finish_set_dedup(oid, r, tid, offset);
9f95a23c
TL
3414 }
3415};
3416
// Cancel all outstanding manifest (refcount/dedup) operations, collecting
// their objecter tids for the caller to cancel, and complete any pending
// callbacks with -ECANCELED (optionally requeueing the originating ops).
void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << dendl;
  auto p = manifest_ops.begin();
  while (p != manifest_ops.end()) {
    auto mop = p->second;
    // cancel objecter op, if we can
    if (mop->objecter_tid) {
      tids->push_back(mop->objecter_tid);
      mop->objecter_tid = 0;
    } else if (!mop->tids.empty()) {
      // multi-chunk case: one tid per chunk offset
      for (auto &p : mop->tids) {
	tids->push_back(p.second);
      }
    }
    if (mop->cb) {
      // Tell the callback whether to requeue, then fire it as cancelled.
      mop->cb->set_requeue(requeue);
      mop->cb->complete(-ECANCELED);
    }
    // Post-increment keeps the iterator valid across the erase.
    manifest_ops.erase(p++);
  }
}
3439
// Count how many references this object (head plus all clones) holds on the
// chunk target named `fp_oid`.
// Returns the count, -EBUSY if a relevant snap is being removed, or -EAGAIN
// if a clone is unreadable/unrecovered (the op is queued to wait).
int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op)
{
  int cnt = 0;
  // head: count direct chunk_map references to fp_oid
  for (auto &p : obc->obs.oi.manifest.chunk_map) {
    if (p.second.oid.oid.name == fp_oid) {
      cnt++;
    }
  }
  // snap: walk clones newest-first and accumulate the references each clone
  // would add on set (relative to its adjacent newer clone)
  SnapSet& ss = obc->ssc->snapset;
  const OSDMapRef& osdmap = get_osdmap();
  for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
      p != ss.clones.rend();
      ++p) {
    object_ref_delta_t refs;
    ObjectContextRef obc_l = nullptr;
    ObjectContextRef obc_g = nullptr;
    hobject_t clone_oid = obc->obs.oi.soid;
    clone_oid.snap = *p;
    // A snap queued for removal makes the count unstable; report busy.
    if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
      return -EBUSY;
    }
    if (is_unreadable_object(clone_oid)) {
      dout(10) << __func__ << ": " << clone_oid
	       << " is unreadable. Need to wait for recovery" << dendl;
      wait_for_unreadable_object(clone_oid, op);
      return -EAGAIN;
    }
    ObjectContextRef clone_obc = get_object_context(clone_oid, false);
    if (!clone_obc) {
      break;
    }
    // Adjacent clones must be readable before computing ref deltas.
    if (recover_adjacent_clones(clone_obc, op)) {
      return -EAGAIN;
    }
    get_adjacent_clones(clone_obc, obc_l, obc_g);
    clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
      obc_g ? &(obc_g->obs.oi.manifest) : nullptr ,
      nullptr,
      refs);
    for (auto p = refs.begin(); p != refs.end(); ++p) {
      if (p->first.oid.name == fp_oid && p->second > 0) {
	cnt += p->second;
      }
    }
  }

  return cnt;
}
3490
// Ensure the clones adjacent to `obc` are readable before a manifest
// operation that needs them (chunked object, or an op containing SET_CHUNK).
// Returns true if a neighbor is unreadable and `op` has been queued to wait;
// false if no waiting is needed.
bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
{
  // No clones at all -> nothing adjacent to recover.
  if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
    return false;
  }
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  bool has_manifest_op = std::any_of(
    begin(m->ops),
    end(m->ops),
    [](const auto& osd_op) {
       return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
    });
  // Only manifest-related work cares about adjacent-clone readability.
  if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
    return false;
  }
  ceph_assert(op);

  const SnapSet& snapset = obc->ssc->snapset;
  auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
  // Check readability of the clone at `iter`; end() denotes the head.
  // Queues `op` behind recovery if the object is unreadable.
  auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
    hobject_t cid = obc->obs.oi.soid;
    cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
    if (is_unreadable_object(cid)) {
      dout(10) << __func__ << ": clone " << cid
	       << " is unreadable, waiting" << dendl;
      wait_for_unreadable_object(cid, op);
      return true;
    }
    return false;
  };
  // Older neighbor (previous clone), if any.
  if (s != snapset.clones.begin()) {
    if (is_unreadable_snap(s - 1)) {
      return true;
    }
  }
  // Newer neighbor (next clone, or the head when s == end()).
  if (s != snapset.clones.end()) {
    if (is_unreadable_snap(s + 1)) {
      return true;
    }
  }
  return false;
}
3533
3534ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
3535{
3536 auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
3537 obc->obs.oi.soid.snap);
3538 if (s != obc->ssc->snapset.clones.begin()) {
3539 auto s_iter = s - 1;
3540 hobject_t cid = obc->obs.oi.soid;
3541 object_ref_delta_t refs;
3542 cid.snap = *s_iter;
3543 ObjectContextRef cobc = get_object_context(cid, false, NULL);
3544 ceph_assert(cobc);
3545 return cobc;
3546 }
3547 return nullptr;
3548}
3549
3550void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
3551{
3552 for (auto p = refs.begin(); p != refs.end(); ++p) {
3553 int dec_ref_count = p->second;
3554 ceph_assert(dec_ref_count < 0);
3555 while (dec_ref_count < 0) {
3556 dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
3557 refcount_manifest(soid, p->first,
3558 refcount_t::DECREMENT_REF, NULL, std::nullopt);
3559 dec_ref_count++;
3560 }
3561 }
3562}
3563
3564
3565void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
3566 ObjectContextRef& _l, ObjectContextRef& _g)
3567{
3568 const SnapSet& snapset = src_obc->ssc->snapset;
3569 const object_info_t& oi = src_obc->obs.oi;
3570
3571 auto get_context = [this, &oi, &snapset](auto iter)
3572 -> ObjectContextRef {
3573 hobject_t cid = oi.soid;
3574 cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3575 ObjectContextRef obc = get_object_context(cid, false, NULL);
3576 ceph_assert(obc);
3577 return obc;
3578 };
3579
3580 // check adjacent clones
3581 auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
3582
3583 // We *must* find the clone iff it's not head,
3584 // let s == snapset.clones.end() mean head
3585 ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
3586
3587 if (s != snapset.clones.begin()) {
3588 _l = get_context(s - 1);
3589 }
3590
3591 if (s != snapset.clones.end()) {
3592 _g = get_context(s + 1);
3593 }
3594}
3595
// For a SET_CHUNK op, increment the refcount on every chunk target that
// gains a reference (relative to adjacent clones) and schedule decrements
// for targets that lose one.  Returns true if an async increment was issued
// (the op is then blocked until the RefCountCallback fires).
bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
				       OSDOp& osd_op)
{
  object_ref_delta_t refs;
  ObjectContextRef obc_l, obc_g;
  get_adjacent_clones(ctx->obc, obc_l, obc_g);
  // Compute per-target reference deltas this set-chunk introduces.
  set_chunk.calc_refs_to_inc_on_set(
    obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
    obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
    refs);
  bool need_inc_ref = false;
  if (!refs.is_empty()) {
    ManifestOpRef mop(std::make_shared<ManifestOp>(ctx->obc, nullptr));
    for (auto c : set_chunk.chunk_map) {
      auto p = refs.find(c.second.oid);
      if (p == refs.end()) {
	continue;
      }

      int inc_ref_count = p->second;
      if (inc_ref_count > 0) {
	/*
	 * In set-chunk case, the first thing we should do is to increment
	 * the reference the target object has prior to update object_manifest in object_info_t.
	 * So, call directly refcount_manifest.
	 */
	auto target_oid = p->first;
	auto offset = c.first;
	auto length = c.second.length;
	auto* fin = new C_SetManifestRefCountDone(this, ctx->obs->oi.soid, offset);
	ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, target_oid,
					    refcount_t::INCREMENT_REF, fin, std::nullopt);
	fin->tid = tid;
	// Track each pending chunk increment by offset -> tid.
	mop->chunks[target_oid] = make_pair(offset, length);
	mop->num_chunks++;
	mop->tids[offset] = tid;

	// Block the object until all refcount increments complete.
	if (!ctx->obc->is_blocked()) {
	  dout(15) << fmt::format("{}: blocking object on rc: tid:{}", __func__, tid) << dendl;
	  ctx->obc->start_block();
	}
	need_inc_ref = true;
      } else if (inc_ref_count < 0) {
	// Lost references are safe to drop only once the op commits.
	hobject_t src = ctx->obs->oi.soid;
	hobject_t tgt = p->first;
	ctx->register_on_commit(
	  [src, tgt, this](){
	    refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
	  });
      }
    }
    if (mop->tids.size()) {
      // The callback resumes (or fails) the original op when all
      // increments have completed.
      mop->cb = new RefCountCallback(ctx, osd_op);
      manifest_ops[ctx->obs->oi.soid] = mop;
      manifest_ops[ctx->obs->oi.soid]->op = ctx->op;
    }
  }

  return need_inc_ref;
}
3656
20effc67
TL
// Drop chunk_map entries whose backing region was dirtied by this op; if no
// chunks remain, the object stops being a manifest object altogether.
void PrimaryLogPG::update_chunk_map_by_dirty(OpContext* ctx) {
  /*
   * We should consider two cases here:
   * 1) just modification: This created dirty regions, but didn't update chunk_map.
   * 2) rollback: In rollback, head will be converted to the clone the rollback targets.
   * Also, rollback already updated chunk_map.
   * So, what we should do here is to check whether chunk_map is updated and the clean_region has dirty regions.
   * In case of the rollback, chunk_map doesn't need to be cleared.
   */
  // Iterate the *old* state's chunk_map while erasing from the *new* state,
  // so iteration is not invalidated by the erases.
  for (auto &p : ctx->obs->oi.manifest.chunk_map) {
    if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
      // This chunk's extent was overwritten: it no longer describes the data.
      ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
      if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
	// Last chunk gone: demote the object from manifest status.
	ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
	ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
	ctx->delta_stats.num_objects_manifest--;
      }
    }
  }
}
3677
3678void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
3679{
3680 object_ref_delta_t refs;
3681 ObjectContextRef cobc = nullptr;
3682 ObjectContextRef obc = ctx->obc;
f67539c2
TL
3683 // Look over previous snapshot, then figure out whether updated chunk needs to be deleted
3684 cobc = get_prev_clone_obc(obc);
3685 obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
3686 cobc ? &cobc->obs.oi.manifest : nullptr,
3687 ctx->clean_regions,
3688 refs);
3689 if (!refs.is_empty()) {
3690 hobject_t soid = obc->obs.oi.soid;
3691 ctx->register_on_commit(
3692 [soid, this, refs](){
3693 dec_refcount(soid, refs);
3694 });
3695 }
3696}
3697
// Schedule (at commit time) the refcount decrements needed when a manifest
// object described by `oi` is removed: per-chunk deltas for chunked
// manifests, or a single target decrement for counted redirects.
void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
{
  ceph_assert(oi.has_manifest());
  ceph_assert(ctx->obc->ssc);

  if (oi.manifest.is_chunked()) {
    object_ref_delta_t refs;
    ObjectContextRef obc_l, obc_g, obc;
    /* in trim_object, oi and ctx can have different oid */
    obc = get_object_context(oi.soid, false, NULL);
    ceph_assert(obc);
    get_adjacent_clones(obc, obc_l, obc_g);
    // Only references not shared with an adjacent clone are dropped.
    oi.manifest.calc_refs_to_drop_on_removal(
      obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
      obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
      refs);

    if (!refs.is_empty()) {
      /* dec_refcount will use head object anyway */
      hobject_t soid = ctx->obc->obs.oi.soid;
      ctx->register_on_commit(
	[soid, this, refs](){
	  dec_refcount(soid, refs);
	});
    }
  } else if (oi.manifest.is_redirect() &&
	     oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
    // Redirect with a counted reference: drop it on the target at commit.
    ctx->register_on_commit(
      [oi, this](){
	refcount_manifest(oi.soid, oi.manifest.redirect_target,
			  refcount_t::DECREMENT_REF, NULL, std::nullopt);
      });
  }
}
3732
// Send a cls_cas refcount mutation (get/put/create-or-get) from src_soid to
// the chunk/redirect target tgt_soid via the objecter.
// `cb` (optional) is completed when the mutation finishes; `chunk` carries
// the chunk payload for CREATE_OR_GET_REF.  Returns the objecter tid.
ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
					   Context *cb, std::optional<bufferlist> chunk)
{
  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
                   CEPH_OSD_FLAG_RWORDERED;

  dout(10) << __func__ << " Start refcount from " << src_soid
	   << " to " << tgt_soid << dendl;

  // The reference is always attributed to the head object of the source.
  ObjectOperation obj_op;
  bufferlist in;
  if (type == refcount_t::INCREMENT_REF) {
    cls_cas_chunk_get_ref_op call;
    call.source = src_soid.get_head();
    ::encode(call, in);
    obj_op.call("cas", "chunk_get_ref", in);
  } else if (type == refcount_t::DECREMENT_REF) {
    cls_cas_chunk_put_ref_op call;
    call.source = src_soid.get_head();
    ::encode(call, in);
    obj_op.call("cas", "chunk_put_ref", in);
  } else if (type == refcount_t::CREATE_OR_GET_REF) {
    // Creates the chunk object with `chunk` as its data if it doesn't
    // exist yet, otherwise just takes a reference.
    cls_cas_chunk_create_or_get_ref_op get_call;
    get_call.source = src_soid.get_head();
    ceph_assert(chunk);
    get_call.data = std::move(*chunk);
    ::encode(get_call, in);
    obj_op.call("cas", "chunk_create_or_get_ref", in);
  } else {
    ceph_assert(0 == "unrecognized type");
  }

  Context *c = nullptr;
  if (cb) {
    // Dispatch the user callback through the objecter finisher thread.
    c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
  }

  object_locator_t oloc(tgt_soid);
  ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
  ceph_assert(src_obc);
  auto tid = osd->objecter->mutate(
    tgt_soid.oid, oloc, obj_op, SnapContext(),
    ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
    flags, c);
  return tid;
}
11fdf7f2
TL
3779
// Proxy a single READ/SYNC_READ op against one chunk of a chunked-manifest
// object: translate the client's logical extent into an extent on the backing
// chunk object and issue it through the Objecter.  The reply is stitched back
// together by C_ProxyChunkRead.
//
// op          - the client op being proxied
// obc         - context of the (chunked-manifest) head object
// op_index    - index of the read op inside m->ops
// chunk_index - logical offset key of the chunk in manifest.chunk_map
// req_offset/req_length - the sub-extent of this chunk being read
// req_total_len - total length of the client read (for reply assembly)
// write_ordered - if true, order this read behind in-flight writes
void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
					 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
					 uint64_t req_total_len, bool write_ordered)
{
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  object_manifest_t *manifest = &obc->obs.oi.manifest;
  // silently drop requests for chunks the manifest doesn't know about
  if (!manifest->chunk_map.count(chunk_index)) {
    return;
  }
  uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
  hobject_t soid = manifest->chunk_map[chunk_index].oid;
  hobject_t ori_soid = m->get_hobj();
  object_locator_t oloc(soid);
  unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
  if (write_ordered) {
    flags |= CEPH_OSD_FLAG_RWORDERED;
  }

  // a zero-length or unnamed chunk cannot be proxied
  if (!chunk_length || soid == hobject_t()) {
    return;
  }

  /* same as do_proxy_read() */
  flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
			     CEPH_OSD_FLAG_ORDERSNAP |
			     CEPH_OSD_FLAG_ENFORCE_SNAPC |
			     CEPH_OSD_FLAG_MAP_SNAP_CLONE);

  dout(10) << __func__ << " Start do chunk proxy read for " << *m
	   << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
	   << " req_length: " << req_length << dendl;

  ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));

  // pobj_op ownership passes to the C_ProxyChunkRead completion below
  ObjectOperation *pobj_op = new ObjectOperation;
  OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);

  if (chunk_index <= req_offset) {
    // map the logical offset into the chunk object:
    // chunk-local delta (req_offset - chunk_index) plus the chunk's own offset
    osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
  } else {
    ceph_abort_msg("chunk_index > req_offset");
  }
  osd_op.op.extent.length = req_length;

  ObjectOperation obj_op;
  obj_op.dup(pobj_op->ops);

  C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
					       prdop);
  fin->obj_op = pobj_op;
  fin->op_index = op_index;
  fin->req_offset = req_offset;
  fin->obc = obc;
  fin->req_total_len = req_total_len;

  ceph_tid_t tid = osd->objecter->read(
    soid.oid, oloc, obj_op,
    m->get_snapid(), NULL,
    flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
    &prdop->user_version,
    &prdop->data_offset,
    m->get_features());
  // track the in-flight proxy read so it can be matched up / canceled later
  fin->tid = tid;
  prdop->objecter_tid = tid;
  proxyread_ops[tid] = prdop;
  in_progress_proxy_ops[ori_soid].push_back(op);
}
3847
// Decide whether a client request on a chunked-manifest object can be served
// by proxying reads to the backing chunk objects.  Returns true only when
// every op in the message is a READ/SYNC_READ whose whole extent is covered
// by chunk_map entries that are flagged missing locally (i.e. must be
// fetched from the chunk tier anyway).  Any other op type, or any extent not
// fully covered, disqualifies the request.
bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
{
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  OSDOp *osd_op = NULL;
  bool ret = true;
  for (unsigned int i = 0; i < m->ops.size(); i++) {
    osd_op = &m->ops[i];
    ceph_osd_op op = osd_op->op;
    switch (op.op) {
      case CEPH_OSD_OP_READ:
      case CEPH_OSD_OP_SYNC_READ: {
	// cursor walks the logical offset space; remain counts bytes of the
	// request extent not yet matched to a chunk
	uint64_t cursor = osd_op->op.extent.offset;
	uint64_t remain = osd_op->op.extent.length;

	/* requested chunks exist in chunk_map ? */
	// chunk_map is keyed by logical offset (std::map, so iterated in
	// ascending offset order)
	for (auto &p : obc->obs.oi.manifest.chunk_map) {
	  if (p.first <= cursor && p.first + p.second.length > cursor) {
	    // chunk data present locally -> no point proxying this read
	    if (!p.second.is_missing()) {
	      return false;
	    }
	    if (p.second.length >= remain) {
	      remain = 0;
	      break;
	    } else {
	      remain = remain - p.second.length;
	    }
	    cursor += p.second.length;
	  }
	}

	if (remain) {
	  dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
	  return false;
	}
	continue;
      }
      default:
	// any non-read op makes the whole message non-proxyable
	return false;
    }
  }
  return ret;
}
3890
7c673cae
FG
// Completion handler for a proxied write: called when the Objecter op with
// the given tid against `oid` finishes with result `r`.  Sends the commit
// reply to the client (once) and tears down the proxy-write bookkeeping.
void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;

  map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
  if (p == proxywrite_ops.end()) {
    // op was canceled (e.g. peering reset) before completion; nothing to do
    dout(10) << __func__ << " no proxywrite_op found" << dendl;
    return;
  }
  ProxyWriteOpRef pwop = p->second;
  ceph_assert(tid == pwop->objecter_tid);
  ceph_assert(oid == pwop->soid);

  proxywrite_ops.erase(tid);

  map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
  if (q == in_progress_proxy_ops.end()) {
    // tracking entry already gone; just free the op context
    dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
    delete pwop->ctx;
    pwop->ctx = NULL;
    return;
  }
  list<OpRequestRef>& in_progress_op = q->second;
  ceph_assert(in_progress_op.size());
  list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
					      in_progress_op.end(),
					      pwop->op);
  ceph_assert(it != in_progress_op.end());
  in_progress_op.erase(it);
  if (in_progress_op.size() == 0) {
    in_progress_proxy_ops.erase(oid);
  } else if (std::find(in_progress_op.begin(),
		       in_progress_op.end(),
		       pwop->op) != in_progress_op.end()) {
    // the same client op is still queued again for this object
    // (presumably a retried/duplicated proxy attempt — TODO confirm);
    // free the ctx but don't reply yet
    if (pwop->ctx)
      delete pwop->ctx;
    pwop->ctx = NULL;
    dout(20) << __func__ << " " << oid << " tid " << tid
	     << " in_progress_op size: "
	     << in_progress_op.size() << dendl;
    return;
  }

  osd->logger->inc(l_osd_tier_proxy_write);

  auto m = pwop->op->get_req<MOSDOp>();
  ceph_assert(m != NULL);

  if (!pwop->sent_reply) {
    // send commit.
    assert(pwop->ctx->reply == nullptr);
    MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
					 true /* we claim it below */);
    // no applicable pg version for a proxied write; report user_version only
    reply->set_reply_versions(eversion_t(), pwop->user_version);
    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
    reply->claim_op_out_data(pwop->ops);
    dout(10) << " sending commit on " << pwop << " " << reply << dendl;
    osd->send_message_osd_client(reply, m->get_connection());
    pwop->sent_reply = true;
    pwop->ctx->op->mark_commit_sent();
  }

  delete pwop->ctx;
  pwop->ctx = NULL;
}
3957
94b18763
FG
3958void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3959 vector<ceph_tid_t> *tids)
7c673cae
FG
3960{
3961 dout(10) << __func__ << " " << pwop->soid << dendl;
3962 pwop->canceled = true;
3963
3964 // cancel objecter op, if we can
3965 if (pwop->objecter_tid) {
94b18763 3966 tids->push_back(pwop->objecter_tid);
7c673cae
FG
3967 delete pwop->ctx;
3968 pwop->ctx = NULL;
3969 proxywrite_ops.erase(pwop->objecter_tid);
3970 pwop->objecter_tid = 0;
3971 }
3972}
3973
3974class PromoteCallback: public PrimaryLogPG::CopyCallback {
3975 ObjectContextRef obc;
3976 PrimaryLogPG *pg;
3977 utime_t start;
3978public:
3979 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3980 : obc(obc_),
3981 pg(pg_),
3982 start(ceph_clock_now()) {}
3983
3984 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3985 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3986 int r = results.get<0>();
20effc67
TL
3987 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
3988 pg->finish_promote_manifest(r, results_data, obc);
3989 } else {
3990 pg->finish_promote(r, results_data, obc);
3991 }
7c673cae
FG
3992 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3993 }
3994};
3995
11fdf7f2
TL
// Completion for a manifest promote that must resume an op context: on
// success it re-executes the waiting OpContext; on failure it replies with
// the error (or requeues the op if the copy was canceled and asked for a
// requeue) and closes the context.  PromoteFinisher (friend) later drives
// the actual finish_promote*() call using the saved promote_results.
class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
  ObjectContextRef obc;
  PrimaryLogPG *pg;
  utime_t start;
  PrimaryLogPG::OpContext *ctx;
  PrimaryLogPG::CopyCallbackResults promote_results;  // saved for PromoteFinisher
public:
  PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx)
    : obc(obc_),
      pg(pg_),
      start(ceph_clock_now()), ctx(ctx) {}

  void finish(PrimaryLogPG::CopyCallbackResults results) override {
    PrimaryLogPG::CopyResults *results_data = results.get<1>();
    int r = results.get<0>();
    promote_results = results;
    // for redirect manifests the promoted object's user version becomes the
    // version the resumed op operates at
    if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
      ctx->user_at_version = results_data->user_version;
    }
    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) {
	if (ctx->op)
	  ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results_data->should_requeue) {
	// copy canceled (e.g. interval change): requeue rather than error out
	if (ctx->op)
	  ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
    pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
  }
  friend struct PromoteFinisher;
};
4031
// OpFinisher that completes a manifest promote from within op execution:
// dispatches to finish_promote()/finish_promote_manifest() based on the
// manifest type, using the results stashed in the PromoteManifestCallback.
struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
  PromoteManifestCallback *promote_callback;

  explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
    : promote_callback(promote_callback) {
  }

  int execute() override {
    if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
      promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
						promote_callback->promote_results.get<1>(),
						promote_callback->obc);
    } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
      promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
							 promote_callback->promote_results.get<1>(),
							 promote_callback->obc);
    } else {
      ceph_abort_msg("unrecognized manifest type");
    }
    return 0;
  }
};
4054
7c673cae
FG
// Promote an object from the base/chunk tier into this (cache) pool by
// starting an async copy.  Exactly one of obc / missing_oid identifies the
// object.  `op`, if present, is blocked on the object until the promote
// completes (or is queued behind scrub).  On return (unless blocked by
// scrub/laggy), *promote_obc is set and the obc is blocked while the copy
// runs; PromoteCallback finishes the promote.
void PrimaryLogPG::promote_object(ObjectContextRef obc,
				  const hobject_t& missing_oid,
				  const object_locator_t& oloc,
				  OpRequestRef op,
				  ObjectContextRef *promote_obc)
{
  hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
  ceph_assert(hoid != hobject_t());
  if (m_scrubber->write_blocked_by_scrub(hoid)) {
    dout(10) << __func__ << " " << hoid
	     << " blocked by scrub" << dendl;
    if (op) {
      waiting_for_scrub.push_back(op);
      op->mark_delayed("waiting for scrub");
      dout(10) << __func__ << " " << hoid
	       << " placing op in waiting_for_scrub" << dendl;
    } else {
      // internally-triggered promote with no client op: drop; it will be
      // retried by whatever triggered it
      dout(10) << __func__ << " " << hoid
	       << " no op, dropping on the floor" << dendl;
    }
    return;
  }
  if (op && !check_laggy_requeue(op)) {
    return;
  }
  if (!obc) { // we need to create an ObjectContext
    ceph_assert(missing_oid != hobject_t());
    obc = get_object_context(missing_oid, true);
  }
  if (promote_obc)
    *promote_obc = obc;

  /*
   * Before promote complete, if there are proxy-reads for the object,
   * for this case we don't use DONTNEED.
   */
  unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
  map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
  if (q == in_progress_proxy_ops.end()) {
    src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
  }

  // pick the copy source based on the object's manifest type:
  //  - no manifest: copy same oid from the base tier (tier_of)
  //  - chunked:     copy the object itself; chunks resolved by copy machinery
  //  - redirect:    copy from the redirect target
  CopyCallback *cb;
  object_locator_t my_oloc;
  hobject_t src_hoid;
  if (!obc->obs.oi.has_manifest()) {
    my_oloc = oloc;
    my_oloc.pool = pool.info.tier_of;
    src_hoid = obc->obs.oi.soid;
    cb = new PromoteCallback(obc, this);
  } else {
    if (obc->obs.oi.manifest.is_chunked()) {
      src_hoid = obc->obs.oi.soid;
      cb = new PromoteCallback(obc, this);
    } else if (obc->obs.oi.manifest.is_redirect()) {
      object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
      my_oloc = src_oloc;
      src_hoid = obc->obs.oi.manifest.redirect_target;
      cb = new PromoteCallback(obc, this);
    } else {
      ceph_abort_msg("unrecognized manifest type");
    }
  }

  unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
                   CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
                   CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
                   CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
  // mirror_snapset only when promoting a head object
  start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
	     obc->obs.oi.soid.snap == CEPH_NOSNAP,
	     src_fadvise_flags, 0);

  ceph_assert(obc->is_blocked());

  if (op)
    wait_for_blocked_object(obc->obs.oi.soid, op);

  recovery_state.update_stats(
    [](auto &history, auto &stats) {
      stats.stats.sum.num_promote++;
      return false;
    });
}
4138
// Execute a prepared OpContext: build the transaction via
// prepare_transaction(), then either complete immediately (pure read or
// error), record an update-log-only entry, or submit a RepGather to the
// replicas.  Must be idempotent — it can be re-run (e.g. by
// finish_copyfrom() or after async reads) before the transaction is applied.
// Ownership of ctx passes to the repop on the write path; otherwise ctx is
// closed here (or parked on the async-read list).
void PrimaryLogPG::execute_ctx(OpContext *ctx)
{
  FUNCTRACE(cct);
  dout(10) << __func__ << " " << ctx << dendl;
  ctx->reset_obs(ctx->obc);
  ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
  OpRequestRef op = ctx->op;
  auto m = op->get_req<MOSDOp>();
  ObjectContextRef obc = ctx->obc;
  const hobject_t& soid = obc->obs.oi.soid;

  // this method must be idempotent since we may call it several times
  // before we finally apply the resulting transaction.
  ctx->op_t.reset(new PGTransaction);

  if (op->may_write() || op->may_cache()) {
    // snap
    if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
	pool.info.is_pool_snaps_mode()) {
      // use pool's snapc
      ctx->snapc = pool.snapc;
    } else {
      // client specified snapc
      ctx->snapc.seq = m->get_snap_seq();
      ctx->snapc.snaps = m->get_snaps();
      filter_snapc(ctx->snapc.snaps);
    }
    // ORDERSNAP: refuse writes whose snap context is older than what the
    // object has already seen
    if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
	ctx->snapc.seq < obc->ssc->snapset.seq) {
      dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
	       << " < snapset seq " << obc->ssc->snapset.seq
	       << " on " << obc->obs.oi.soid << dendl;
      reply_ctx(ctx, -EOLDSNAPC);
      return;
    }

    // version
    ctx->at_version = get_next_version();
    ctx->mtime = m->get_mtime();

    dout(10) << __func__ << " " << soid << " " << *ctx->ops
	     << " ov " << obc->obs.oi.version << " av " << ctx->at_version
	     << " snapc " << ctx->snapc
	     << " snapset " << obc->ssc->snapset
	     << dendl;
  } else {
    dout(10) << __func__ << " " << soid << " " << *ctx->ops
	     << " ov " << obc->obs.oi.version
	     << dendl;
  }

  if (!ctx->user_at_version)
    ctx->user_at_version = obc->obs.oi.user_version;
  dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = ctx->op->get_reqid();
#endif
    tracepoint(osd, prepare_tx_enter, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  int result = prepare_transaction(ctx);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = ctx->op->get_reqid();
#endif
    tracepoint(osd, prepare_tx_exit, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  bool pending_async_reads = !ctx->pending_async_reads.empty();
  if (result == -EINPROGRESS || pending_async_reads) {
    // come back later.
    if (pending_async_reads) {
      // only EC pools issue async reads
      ceph_assert(pool.info.is_erasure());
      in_progress_async_reads.push_back(make_pair(op, ctx));
      ctx->start_async_reads(this);
    }
    return;
  }

  if (result == -EAGAIN) {
    // clean up after the ctx
    close_op_ctx(ctx);
    return;
  }

  bool ignore_out_data = false;
  if (!ctx->op_t->empty() &&
      op->may_write() &&
      result >= 0) {
    // successful update
    if (ctx->op->allows_returnvec()) {
      // enforce reasonable bound on the return buffer sizes
      for (auto& i : *ctx->ops) {
	if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
	  dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
	  result = -EOVERFLOW;  // overall result is overflow
	  i.rval = -EOVERFLOW;
	  i.outdata.clear();
	}
      }
    } else {
      // legacy behavior -- zero result and return data etc.
      ignore_out_data = true;
      result = 0;
    }
  }

  // prepare the reply
  ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
			       ignore_out_data);
  dout(20) << __func__ << " alloc reply " << ctx->reply
	   << " result " << result << dendl;

  // read or error?
  if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
    // finish side-effects
    if (result >= 0)
      do_osd_op_effects(ctx, m->get_connection());

    complete_read_ctx(result, ctx);
    return;
  }

  ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);

  ceph_assert(op->may_write() || op->may_cache());

  // trim log?
  recovery_state.update_trim_to();

  // verify that we are doing this in order?
  // (debug-only check that a client's tids arrive in increasing order)
  if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
      !pool.info.is_tier() && !pool.info.has_tiers()) {
    map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
    ceph_tid_t t = m->get_tid();
    client_t n = m->get_source().num();
    map<client_t,ceph_tid_t>::iterator p = cm.find(n);
    if (p == cm.end()) {
      dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
      cm[n] = t;
    } else {
      dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
      if (p->second > t) {
	derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
	ceph_abort_msg("out of order op");
      }
      p->second = t;
    }
  }

  if (ctx->update_log_only) {
    // nothing to mutate on disk; just log the (error) result for dup
    // detection and reply directly
    if (result >= 0)
      do_osd_op_effects(ctx, m->get_connection());

    dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
    // save just what we need from ctx
    MOSDOpReply *reply = ctx->reply;
    ctx->reply = nullptr;
    reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);

    if (result == -ENOENT) {
      reply->set_enoent_reply_versions(info.last_update,
				       info.last_user_version);
    }
    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
    // append to pg log for dup detection - don't save buffers for now
    record_write_error(op, soid, reply, result,
		       ctx->op->allows_returnvec() ? ctx : nullptr);
    close_op_ctx(ctx);
    return;
  }

  // no need to capture PG ref, repop cancel will handle that
  // Can capture the ctx by pointer, it's owned by the repop
  ctx->register_on_commit(
    [m, ctx, this](){
      if (ctx->op)
	log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);

      if (m && !ctx->sent_reply) {
	MOSDOpReply *reply = ctx->reply;
	ctx->reply = nullptr;
	reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
	dout(10) << " sending reply on " << *m << " " << reply << dendl;
	osd->send_message_osd_client(reply, m->get_connection());
	ctx->sent_reply = true;
	ctx->op->mark_commit_sent();
      }
    });
  ctx->register_on_success(
    [ctx, this]() {
      do_osd_op_effects(
	ctx,
	ctx->op ? ctx->op->get_req()->get_connection() :
	ConnectionRef());
    });
  ctx->register_on_finish(
    [ctx]() {
      delete ctx;
    });

  // issue replica writes
  ceph_tid_t rep_tid = osd->get_tid();

  RepGather *repop = new_repop(ctx, rep_tid);

  issue_repop(repop, ctx);
  eval_repop(repop);
  repop->put();
}
4355
c07f9fc5
FG
4356void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4357 release_object_locks(ctx->lock_manager);
4358
4359 ctx->op_t.reset();
4360
4361 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4362 ctx->on_finish.erase(p++)) {
4363 (*p)();
4364 }
4365 delete ctx;
4366}
4367
7c673cae
FG
4368void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4369{
4370 if (ctx->op)
4371 osd->reply_op_error(ctx->op, r);
4372 close_op_ctx(ctx);
4373}
4374
11fdf7f2
TL
// Record per-op perf counters and dynamic (per-client/rbd-image) stats for a
// completed client op.
//
// op   - the completed request
// inb  - bytes written by the op (input bytes)
// outb - bytes read/returned by the op (output bytes)
void PrimaryLogPG::log_op_stats(const OpRequest& op,
				const uint64_t inb,
				const uint64_t outb)
{
  auto m = op.get_req<MOSDOp>();
  const utime_t now = ceph_clock_now();

  // latency: since receipt off the wire; process_latency: since dequeue
  const utime_t latency = now - m->get_recv_stamp();
  const utime_t process_latency = now - op.get_dequeued_time();

  osd->logger->inc(l_osd_op);

  osd->logger->inc(l_osd_op_outb, outb);
  osd->logger->inc(l_osd_op_inb, inb);
  osd->logger->tinc(l_osd_op_lat, latency);
  osd->logger->tinc(l_osd_op_process_lat, process_latency);

  // classify as rw / read-only / write(or cache); anything else is a bug
  if (op.may_read() && op.may_write()) {
    osd->logger->inc(l_osd_op_rw);
    osd->logger->inc(l_osd_op_rw_inb, inb);
    osd->logger->inc(l_osd_op_rw_outb, outb);
    osd->logger->tinc(l_osd_op_rw_lat, latency);
    osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
    osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
    osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
  } else if (op.may_read()) {
    osd->logger->inc(l_osd_op_r);
    osd->logger->inc(l_osd_op_r_outb, outb);
    osd->logger->tinc(l_osd_op_r_lat, latency);
    osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
    osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
  } else if (op.may_write() || op.may_cache()) {
    osd->logger->inc(l_osd_op_w);
    osd->logger->inc(l_osd_op_w_inb, inb);
    osd->logger->tinc(l_osd_op_w_lat, latency);
    osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
    osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
  } else {
    ceph_abort();
  }

  dout(15) << "log_op_stats " << *m
	   << " inb " << inb
	   << " outb " << outb
	   << " lat " << latency << dendl;

  if (m_dynamic_perf_stats.is_enabled()) {
    m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
  }
}
7c673cae 4425
11fdf7f2
TL
// Install the set of dynamic perf-stat queries (e.g. from the mgr) that this
// PG should collect for.
void PrimaryLogPG::set_dynamic_perf_stats_queries(
    const std::list<OSDPerfMetricQuery> &queries)
{
  m_dynamic_perf_stats.set_queries(queries);
}
4431
11fdf7f2 4432void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
7c673cae 4433{
11fdf7f2 4434 std::swap(m_dynamic_perf_stats, *stats);
7c673cae
FG
4435}
4436
// Handle an MOSDPGScan message during backfill.
//  - OP_SCAN_GET_DIGEST (on the backfill target): scan a range of local
//    objects and send the digest back to the primary — unless we are too
//    full, in which case cancel backfill via a BackfillTooFull event.
//  - OP_SCAN_DIGEST (on the primary): record the peer's interval digest and,
//    once all targets have answered, finish the recovery op.
void PrimaryLogPG::do_scan(
  OpRequestRef op,
  ThreadPool::TPHandle &handle)
{
  auto m = op->get_req<MOSDPGScan>();
  ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
  dout(10) << "do_scan " << *m << dendl;

  op->mark_started();

  switch (m->op) {
  case MOSDPGScan::OP_SCAN_GET_DIGEST:
    {
      auto dpp = get_dpp();
      if (osd->check_backfill_full(dpp)) {
	dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
	queue_peering_event(
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      get_osdmap_epoch(),
	      get_osdmap_epoch(),
	      PeeringState::BackfillTooFull())));
	return;
      }

      BackfillInterval bi;
      bi.begin = m->begin;
      // No need to flush, there won't be any in progress writes occurring
      // past m->begin
      scan_range(
	cct->_conf->osd_backfill_scan_min,
	cct->_conf->osd_backfill_scan_max,
	&bi,
	handle);
      MOSDPGScan *reply = new MOSDPGScan(
	MOSDPGScan::OP_SCAN_DIGEST,
	pg_whoami,
	get_osdmap_epoch(), m->query_epoch,
	spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
      encode(bi.objects, reply->get_data());
      osd->send_message_osd_cluster(reply, m->get_connection());
    }
    break;

  case MOSDPGScan::OP_SCAN_DIGEST:
    {
      pg_shard_t from = m->from;

      // Check that from is in backfill_targets vector
      ceph_assert(is_backfill_target(from));

      BackfillInterval& bi = peer_backfill_info[from];
      bi.begin = m->begin;
      bi.end = m->end;
      auto p = m->get_data().cbegin();

      // take care to preserve ordering!
      bi.clear_objects();
      decode_noclear(bi.objects, p);
      dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
	       << " bi.objects.size()=" << bi.objects.size() << dendl;

      if (waiting_on_backfill.erase(from)) {
	if (waiting_on_backfill.empty()) {
	  // all peers have replied with their digests
	  ceph_assert(
	    peer_backfill_info.size() ==
	    get_backfill_targets().size());
	  finish_recovery_op(hobject_t::get_max());
	}
      } else {
	// we canceled backfill for a while due to a too full, and this
	// is an extra response from a non-too-full peer
	dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
      }
    }
    break;
  }
}
4515
// Handle an MOSDPGBackfill message.
//  - OP_BACKFILL_FINISH (target): ack back to the primary, queue a
//    RecoveryDone peering event, then fall through to also persist the final
//    progress update.
//  - OP_BACKFILL_PROGRESS (target): persist last_backfill/stats progress.
//  - OP_BACKFILL_FINISH_ACK (primary): backfill fully acknowledged; finish
//    the recovery op.
// The osd_kill_backfill_at asserts are test hooks for injecting crashes.
void PrimaryLogPG::do_backfill(OpRequestRef op)
{
  auto m = op->get_req<MOSDPGBackfill>();
  ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
  dout(10) << "do_backfill " << *m << dendl;

  op->mark_started();

  switch (m->op) {
  case MOSDPGBackfill::OP_BACKFILL_FINISH:
    {
      ceph_assert(cct->_conf->osd_kill_backfill_at != 1);

      MOSDPGBackfill *reply = new MOSDPGBackfill(
	MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
	get_osdmap_epoch(),
	m->query_epoch,
	spg_t(info.pgid.pgid, get_primary().shard));
      reply->set_priority(recovery_state.get_recovery_op_priority());
      osd->send_message_osd_cluster(reply, m->get_connection());
      queue_peering_event(
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
	    RecoveryDone())));
    }
    // fall-thru

  case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
    {
      ceph_assert(cct->_conf->osd_kill_backfill_at != 2);

      ObjectStore::Transaction t;
      recovery_state.update_backfill_progress(
	m->last_backfill,
	m->stats,
	m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
	t);

      int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
      ceph_assert(tr == 0);
    }
    break;

  case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
    {
      ceph_assert(is_primary());
      ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
      finish_recovery_op(hobject_t::get_max());
    }
    break;
  }
}
4570
// Handle an MOSDPGBackfillRemove: delete the listed objects locally (via the
// snap mapper) in one transaction.  While remote backfilling, also subtract
// the removed objects' sizes from the local/num_bytes accounting first; for
// EC pools the logical (user) size is reconstructed from the stored
// object_info times the number of data chunks.
void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
{
  const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
    op->get_req());
  ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
  dout(7) << __func__ << " " << m->ls << dendl;

  op->mark_started();

  ObjectStore::Transaction t;
  for (auto& p : m->ls) {
    if (is_remote_backfilling()) {
      struct stat st;
      int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
                               pg_whoami.shard) , &st);
      if (r == 0) {
	// on-disk (shard) bytes
	sub_local_num_bytes(st.st_size);
	int64_t usersize;
	if (pool.info.is_erasure()) {
	  // shard size != user size for EC; read the object info to get the
	  // logical size
	  bufferlist bv;
	  int r = osd->store->getattr(
	      ch,
	      ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
	      OI_ATTR,
	      bv);
	  if (r >= 0) {
	    object_info_t oi(bv);
	    usersize = oi.size * pgbackend->get_ec_data_chunk_count();
	  } else {
	    dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
	            << " can't get object info" << dendl;
	    usersize = 0;
	  }
	} else {
	  usersize = st.st_size;
	}
	sub_num_bytes(usersize);
	dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
		 << " sub actual data by " << st.st_size
		 << " sub num_bytes by " << usersize
		 << dendl;
      }
    }
    remove_snap_mapped_object(t, p.first);
  }
  int r = osd->store->queue_transaction(ch, std::move(t), NULL);
  ceph_assert(r == 0);
}
4619
224ce89b 4620int PrimaryLogPG::trim_object(
9f95a23c
TL
4621 bool first, const hobject_t &coid, snapid_t snap_to_trim,
4622 PrimaryLogPG::OpContextUPtr *ctxp)
7c673cae 4623{
224ce89b 4624 *ctxp = NULL;
11fdf7f2 4625
7c673cae
FG
4626 // load clone info
4627 bufferlist bl;
4628 ObjectContextRef obc = get_object_context(coid, false, NULL);
224ce89b
WB
4629 if (!obc || !obc->ssc || !obc->ssc->exists) {
4630 osd->clog->error() << __func__ << ": Can not trim " << coid
4631 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4632 return -ENOENT;
7c673cae 4633 }
7c673cae 4634
11fdf7f2
TL
4635 hobject_t head_oid = coid.get_head();
4636 ObjectContextRef head_obc = get_object_context(head_oid, false);
4637 if (!head_obc) {
224ce89b 4638 osd->clog->error() << __func__ << ": Can not trim " << coid
11fdf7f2 4639 << " repair needed, no snapset obc for " << head_oid;
224ce89b
WB
4640 return -ENOENT;
4641 }
7c673cae
FG
4642
4643 SnapSet& snapset = obc->ssc->snapset;
4644
7c673cae 4645 object_info_t &coi = obc->obs.oi;
11fdf7f2
TL
4646 auto citer = snapset.clone_snaps.find(coid.snap);
4647 if (citer == snapset.clone_snaps.end()) {
4648 osd->clog->error() << "No clone_snaps in snapset " << snapset
4649 << " for object " << coid << "\n";
4650 return -ENOENT;
7c673cae 4651 }
11fdf7f2 4652 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
7c673cae 4653 if (old_snaps.empty()) {
c07f9fc5 4654 osd->clog->error() << "No object info snaps for object " << coid;
224ce89b 4655 return -ENOENT;
7c673cae
FG
4656 }
4657
4658 dout(10) << coid << " old_snaps " << old_snaps
4659 << " old snapset " << snapset << dendl;
4660 if (snapset.seq == 0) {
c07f9fc5 4661 osd->clog->error() << "No snapset.seq for object " << coid;
224ce89b 4662 return -ENOENT;
7c673cae
FG
4663 }
4664
4665 set<snapid_t> new_snaps;
9f95a23c 4666 const OSDMapRef& osdmap = get_osdmap();
7c673cae
FG
4667 for (set<snapid_t>::iterator i = old_snaps.begin();
4668 i != old_snaps.end();
4669 ++i) {
9f95a23c
TL
4670 if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4671 *i != snap_to_trim) {
7c673cae 4672 new_snaps.insert(*i);
9f95a23c 4673 }
7c673cae
FG
4674 }
4675
4676 vector<snapid_t>::iterator p = snapset.clones.end();
4677
4678 if (new_snaps.empty()) {
4679 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4680 if (p == snapset.clones.end()) {
c07f9fc5 4681 osd->clog->error() << "Snap " << coid.snap << " not in clones";
224ce89b 4682 return -ENOENT;
7c673cae
FG
4683 }
4684 }
4685
4686 OpContextUPtr ctx = simple_opc_create(obc);
11fdf7f2 4687 ctx->head_obc = head_obc;
7c673cae
FG
4688
4689 if (!ctx->lock_manager.get_snaptrimmer_write(
4690 coid,
4691 obc,
4692 first)) {
4693 close_op_ctx(ctx.release());
4694 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
224ce89b 4695 return -ENOLCK;
7c673cae
FG
4696 }
4697
4698 if (!ctx->lock_manager.get_snaptrimmer_write(
11fdf7f2
TL
4699 head_oid,
4700 head_obc,
7c673cae
FG
4701 first)) {
4702 close_op_ctx(ctx.release());
11fdf7f2 4703 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
224ce89b 4704 return -ENOLCK;
7c673cae
FG
4705 }
4706
4707 ctx->at_version = get_next_version();
4708
4709 PGTransaction *t = ctx->op_t.get();
f67539c2 4710
1d09f67e
TL
4711 int64_t num_objects_before_trim = ctx->delta_stats.num_objects;
4712
7c673cae
FG
4713 if (new_snaps.empty()) {
4714 // remove clone
4715 dout(10) << coid << " snaps " << old_snaps << " -> "
4716 << new_snaps << " ... deleting" << dendl;
4717
4718 // ...from snapset
11fdf7f2 4719 ceph_assert(p != snapset.clones.end());
f67539c2 4720
7c673cae
FG
4721 snapid_t last = coid.snap;
4722 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4723
4724 if (p != snapset.clones.begin()) {
4725 // not the oldest... merge overlap into next older clone
4726 vector<snapid_t>::iterator n = p - 1;
4727 hobject_t prev_coid = coid;
4728 prev_coid.snap = *n;
4729 bool adjust_prev_bytes = is_present_clone(prev_coid);
4730
4731 if (adjust_prev_bytes)
4732 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4733
4734 snapset.clone_overlap[*n].intersection_of(
4735 snapset.clone_overlap[*p]);
4736
4737 if (adjust_prev_bytes)
4738 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4739 }
4740 ctx->delta_stats.num_objects--;
4741 if (coi.is_dirty())
4742 ctx->delta_stats.num_objects_dirty--;
4743 if (coi.is_omap())
4744 ctx->delta_stats.num_objects_omap--;
4745 if (coi.is_whiteout()) {
4746 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4747 ctx->delta_stats.num_whiteouts--;
4748 }
4749 ctx->delta_stats.num_object_clones--;
4750 if (coi.is_cache_pinned())
4751 ctx->delta_stats.num_objects_pinned--;
f67539c2
TL
4752 if (coi.has_manifest()) {
4753 dec_all_refcount_manifest(coi, ctx.get());
11fdf7f2 4754 ctx->delta_stats.num_objects_manifest--;
f67539c2 4755 }
7c673cae
FG
4756 obc->obs.exists = false;
4757
4758 snapset.clones.erase(p);
4759 snapset.clone_overlap.erase(last);
4760 snapset.clone_size.erase(last);
4761 snapset.clone_snaps.erase(last);
f67539c2 4762
7c673cae
FG
4763 ctx->log.push_back(
4764 pg_log_entry_t(
4765 pg_log_entry_t::DELETE,
4766 coid,
4767 ctx->at_version,
4768 ctx->obs->oi.version,
4769 0,
4770 osd_reqid_t(),
4771 ctx->mtime,
4772 0)
4773 );
4774 t->remove(coid);
4775 t->update_snaps(
4776 coid,
4777 old_snaps,
4778 new_snaps);
31f18b77
FG
4779
4780 coi = object_info_t(coid);
4781
7c673cae
FG
4782 ctx->at_version.version++;
4783 } else {
4784 // save adjusted snaps for this object
4785 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
11fdf7f2
TL
4786 snapset.clone_snaps[coid.snap] =
4787 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4788 // we still do a 'modify' event on this object just to trigger a
4789 // snapmapper.update ... :(
7c673cae
FG
4790
4791 coi.prior_version = coi.version;
4792 coi.version = ctx->at_version;
4793 bl.clear();
11fdf7f2 4794 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7c673cae
FG
4795 t->setattr(coid, OI_ATTR, bl);
4796
4797 ctx->log.push_back(
4798 pg_log_entry_t(
4799 pg_log_entry_t::MODIFY,
4800 coid,
4801 coi.version,
4802 coi.prior_version,
4803 0,
4804 osd_reqid_t(),
4805 ctx->mtime,
4806 0)
4807 );
4808 ctx->at_version.version++;
4809
4810 t->update_snaps(
4811 coid,
4812 old_snaps,
4813 new_snaps);
4814 }
4815
4816 // save head snapset
4817 dout(10) << coid << " new snapset " << snapset << " on "
11fdf7f2 4818 << head_obc->obs.oi << dendl;
7c673cae 4819 if (snapset.clones.empty() &&
11fdf7f2
TL
4820 (head_obc->obs.oi.is_whiteout() &&
4821 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4822 !head_obc->obs.oi.is_cache_pinned())) {
7c673cae
FG
4823 // NOTE: this arguably constitutes minor interference with the
4824 // tiering agent if this is a cache tier since a snap trim event
4825 // is effectively evicting a whiteout we might otherwise want to
4826 // keep around.
11fdf7f2 4827 dout(10) << coid << " removing " << head_oid << dendl;
7c673cae
FG
4828 ctx->log.push_back(
4829 pg_log_entry_t(
4830 pg_log_entry_t::DELETE,
11fdf7f2 4831 head_oid,
7c673cae 4832 ctx->at_version,
11fdf7f2 4833 head_obc->obs.oi.version,
7c673cae
FG
4834 0,
4835 osd_reqid_t(),
4836 ctx->mtime,
4837 0)
4838 );
522d829b 4839 dout(10) << "removing snap head" << dendl;
11fdf7f2
TL
4840 object_info_t& oi = head_obc->obs.oi;
4841 ctx->delta_stats.num_objects--;
4842 if (oi.is_dirty()) {
4843 ctx->delta_stats.num_objects_dirty--;
4844 }
4845 if (oi.is_omap())
4846 ctx->delta_stats.num_objects_omap--;
4847 if (oi.is_whiteout()) {
4848 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4849 ctx->delta_stats.num_whiteouts--;
4850 }
4851 if (oi.is_cache_pinned()) {
4852 ctx->delta_stats.num_objects_pinned--;
4853 }
f67539c2 4854 if (oi.has_manifest()) {
11fdf7f2 4855 ctx->delta_stats.num_objects_manifest--;
f67539c2
TL
4856 dec_all_refcount_manifest(oi, ctx.get());
4857 }
11fdf7f2
TL
4858 head_obc->obs.exists = false;
4859 head_obc->obs.oi = object_info_t(head_oid);
4860 t->remove(head_oid);
7c673cae 4861 } else {
9f95a23c
TL
4862 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4863 // filter SnapSet::snaps for the benefit of pre-octopus
4864 // peers. This is perhaps overly conservative in that I'm not
4865 // certain they need this, but let's be conservative here.
4866 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4867 snapset.filter(pool.info);
4868 } else {
4869 snapset.snaps.clear();
4870 }
11fdf7f2 4871 dout(10) << coid << " writing updated snapset on " << head_oid
7c673cae
FG
4872 << ", snapset is " << snapset << dendl;
4873 ctx->log.push_back(
4874 pg_log_entry_t(
4875 pg_log_entry_t::MODIFY,
11fdf7f2 4876 head_oid,
7c673cae 4877 ctx->at_version,
11fdf7f2 4878 head_obc->obs.oi.version,
7c673cae
FG
4879 0,
4880 osd_reqid_t(),
4881 ctx->mtime,
4882 0)
4883 );
4884
11fdf7f2
TL
4885 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4886 head_obc->obs.oi.version = ctx->at_version;
7c673cae 4887
20effc67 4888 map <string, bufferlist, less<>> attrs;
7c673cae 4889 bl.clear();
11fdf7f2 4890 encode(snapset, bl);
f67539c2 4891 attrs[SS_ATTR] = std::move(bl);
7c673cae
FG
4892
4893 bl.clear();
11fdf7f2 4894 encode(head_obc->obs.oi, bl,
7c673cae 4895 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
f67539c2 4896 attrs[OI_ATTR] = std::move(bl);
11fdf7f2 4897 t->setattrs(head_oid, attrs);
7c673cae
FG
4898 }
4899
1d09f67e
TL
4900 // Stats reporting - Set number of objects trimmed
4901 if (num_objects_before_trim > ctx->delta_stats.num_objects) {
4902 int64_t num_objects_trimmed =
4903 num_objects_before_trim - ctx->delta_stats.num_objects;
4904 add_objects_trimmed_count(num_objects_trimmed);
4905 }
4906
224ce89b
WB
4907 *ctxp = std::move(ctx);
4908 return 0;
7c673cae
FG
4909}
4910
4911void PrimaryLogPG::kick_snap_trim()
4912{
11fdf7f2
TL
4913 ceph_assert(is_active());
4914 ceph_assert(is_primary());
4915 if (is_clean() &&
4916 !state_test(PG_STATE_PREMERGE) &&
4917 !snap_trimq.empty()) {
4918 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4919 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4920 } else {
4921 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
1d09f67e
TL
4922 reset_objects_trimmed();
4923 set_snaptrim_begin_stamp();
11fdf7f2
TL
4924 snap_trimmer_machine.process_event(KickTrim());
4925 }
7c673cae
FG
4926 }
4927}
4928
4929void PrimaryLogPG::snap_trimmer_scrub_complete()
4930{
1d09f67e
TL
4931 if (is_primary() && is_active() && is_clean() && !snap_trimq.empty()) {
4932 dout(10) << "scrub finished - requeuing snap_trimmer" << dendl;
7c673cae
FG
4933 snap_trimmer_machine.process_event(ScrubComplete());
4934 }
4935}
4936
4937void PrimaryLogPG::snap_trimmer(epoch_t queued)
4938{
9f95a23c 4939 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
7c673cae
FG
4940 return;
4941 }
4942
11fdf7f2 4943 ceph_assert(is_primary());
7c673cae
FG
4944
4945 dout(10) << "snap_trimmer posting" << dendl;
4946 snap_trimmer_machine.process_event(DoSnapWork());
4947 dout(10) << "snap_trimmer complete" << dendl;
4948 return;
4949}
4950
20effc67 4951namespace {
7c673cae 4952
20effc67
TL
4953template<typename U, typename V>
4954int do_cmp_xattr(int op, const U& lhs, const V& rhs)
4955{
7c673cae
FG
4956 switch (op) {
4957 case CEPH_OSD_CMPXATTR_OP_EQ:
20effc67 4958 return lhs == rhs;
7c673cae 4959 case CEPH_OSD_CMPXATTR_OP_NE:
20effc67 4960 return lhs != rhs;
7c673cae 4961 case CEPH_OSD_CMPXATTR_OP_GT:
20effc67 4962 return lhs > rhs;
7c673cae 4963 case CEPH_OSD_CMPXATTR_OP_GTE:
20effc67 4964 return lhs >= rhs;
7c673cae 4965 case CEPH_OSD_CMPXATTR_OP_LT:
20effc67 4966 return lhs < rhs;
7c673cae 4967 case CEPH_OSD_CMPXATTR_OP_LTE:
20effc67 4968 return lhs <= rhs;
7c673cae
FG
4969 default:
4970 return -EINVAL;
4971 }
4972}
4973
20effc67 4974} // anonymous namespace
7c673cae 4975
20effc67
TL
4976int PrimaryLogPG::do_xattr_cmp_u64(int op, uint64_t v1, bufferlist& xattr)
4977{
4978 uint64_t v2;
7c673cae 4979
20effc67
TL
4980 if (xattr.length()) {
4981 const char* first = xattr.c_str();
4982 if (auto [p, ec] = std::from_chars(first, first + xattr.length(), v2);
4983 ec != std::errc()) {
4984 return -EINVAL;
4985 }
4986 } else {
4987 v2 = 0;
7c673cae 4988 }
20effc67
TL
4989 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4990 return do_cmp_xattr(op, v1, v2);
4991}
4992
4993int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4994{
4995 string_view v2s(xattr.c_str(), xattr.length());
4996 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4997 return do_cmp_xattr(op, v1s, v2s);
7c673cae
FG
4998}
4999
7c673cae
FG
5000int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
5001{
5002 ceph_osd_op& op = osd_op.op;
5003 vector<OSDOp> write_ops(1);
5004 OSDOp& write_op = write_ops[0];
5005 uint64_t write_length = op.writesame.length;
5006 int result = 0;
5007
5008 if (!write_length)
5009 return 0;
5010
5011 if (!op.writesame.data_length || write_length % op.writesame.data_length)
5012 return -EINVAL;
5013
5014 if (op.writesame.data_length != osd_op.indata.length()) {
5015 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
5016 return -EINVAL;
5017 }
5018
5019 while (write_length) {
5020 write_op.indata.append(osd_op.indata);
5021 write_length -= op.writesame.data_length;
5022 }
5023
5024 write_op.op.op = CEPH_OSD_OP_WRITE;
5025 write_op.op.extent.offset = op.writesame.offset;
5026 write_op.op.extent.length = op.writesame.length;
5027 result = do_osd_ops(ctx, write_ops);
5028 if (result < 0)
5029 derr << "do_writesame do_osd_ops failed " << result << dendl;
5030
5031 return result;
5032}
5033
5034// ========================================================================
5035// low level osd ops
5036
5037int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
5038{
5039 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
5040 bufferlist header, vals;
5041 int r = _get_tmap(ctx, &header, &vals);
5042 if (r < 0) {
5043 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
5044 r = 0;
5045 return r;
5046 }
5047
5048 vector<OSDOp> ops(3);
5049
5050 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
5051 ops[0].op.extent.offset = 0;
5052 ops[0].op.extent.length = 0;
5053
5054 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
f67539c2 5055 ops[1].indata = std::move(header);
7c673cae
FG
5056
5057 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
f67539c2 5058 ops[2].indata = std::move(vals);
7c673cae
FG
5059
5060 return do_osd_ops(ctx, ops);
5061}
5062
11fdf7f2
TL
/**
 * Slow-path TMAPUP: fully decode the object's current tmap (header +
 * key/value map in bl), apply the update stream from bp, then re-encode
 * and rewrite the whole object with WRITEFULL.
 *
 * Used as a fallback by do_tmapup() when the update stream's keys are
 * not sorted, so the streaming merge cannot be applied.
 *
 * @param ctx     op context for the target object
 * @param bp      update command stream (sequence of tmap op codes)
 * @param osd_op  originating op (unused here beyond the signature)
 * @param bl      current encoded tmap contents (header + map)
 * @return 0 on success; -ENOENT if a TMAP_RM key is absent; -EINVAL on
 *         an unknown op code
 */
int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
				 OSDOp& osd_op, bufferlist& bl)
{
  // decode the existing tmap: header blob followed by the key/value map
  bufferlist header;
  map<string, bufferlist> m;
  if (bl.length()) {
    auto p = bl.cbegin();
    decode(header, p);
    decode(m, p);
    ceph_assert(p.end());
  }

  // do the update(s): each record is an op byte, then op-dependent payload
  while (!bp.end()) {
    __u8 op;
    string key;
    decode(op, bp);

    switch (op) {
    case CEPH_OSD_TMAP_SET: // insert key (or overwrite existing value)
      {
	decode(key, bp);
	bufferlist data;
	decode(data, bp);
	m[key] = data;
      }
      break;
    case CEPH_OSD_TMAP_RM: // remove key; error if it does not exist
      decode(key, bp);
      if (!m.count(key)) {
	return -ENOENT;
      }
      m.erase(key);
      break;
    case CEPH_OSD_TMAP_RMSLOPPY: // remove key, ignoring absence
      decode(key, bp);
      m.erase(key);
      break;
    case CEPH_OSD_TMAP_HDR: // replace the header blob
      {
	decode(header, bp);
      }
      break;
    default:
      return -EINVAL;
    }
  }

  // reencode the updated header + map
  bufferlist obl;
  encode(header, obl);
  encode(m, obl);

  // write it out, replacing the whole object payload
  vector<OSDOp> nops(1);
  OSDOp& newop = nops[0];
  newop.op.op = CEPH_OSD_OP_WRITEFULL;
  newop.op.extent.offset = 0;
  newop.op.extent.length = obl.length();
  newop.indata = obl;
  do_osd_ops(ctx, nops);
  return 0;
}
5127
/**
 * TMAPUP: apply an update stream (bp) to a tmap object via a streaming
 * merge.  The object's current contents are read once, then the sorted
 * update stream is merged against the sorted on-disk key stream without
 * decoding the whole map; the result is rewritten with WRITEFULL.
 *
 * If the update stream turns out NOT to be sorted, we rewind bp and fall
 * back to do_tmapup_slow(), which decodes everything into a std::map.
 *
 * @return 0 on success; -EEXIST for TMAP_CREATE of an existing key;
 *         -ENOENT for TMAP_RM of a missing key; -EINVAL on decode error
 *         or unknown op code.
 */
int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
{
  // keep a copy so we can rewind for the slow (unsorted) path
  bufferlist::const_iterator orig_bp = bp;
  int result = 0;
  if (bp.end()) {
    dout(10) << "tmapup is a no-op" << dendl;
  } else {
    // read the whole object (offset 0, length 0 == entire object)
    vector<OSDOp> nops(1);
    OSDOp& newop = nops[0];
    newop.op.op = CEPH_OSD_OP_READ;
    newop.op.extent.offset = 0;
    newop.op.extent.length = 0;
    result = do_osd_ops(ctx, nops);

    dout(10) << "tmapup read " << newop.outdata.length() << dendl;

    dout(30) << " starting is \n";
    newop.outdata.hexdump(*_dout);
    *_dout << dendl;

    auto ip = newop.outdata.cbegin();   // input: current object contents
    bufferlist obl;                     // output: new object contents

    dout(30) << "the update command is: \n";
    osd_op.indata.hexdump(*_dout);
    *_dout << dendl;

    // header: decode the existing header blob and key count, if any
    bufferlist header;
    __u32 nkeys = 0;
    if (newop.outdata.length()) {
      decode(header, ip);
      decode(nkeys, ip);
    }
    dout(10) << "tmapup header " << header.length() << dendl;

    // an optional leading TMAP_HDR record replaces the header
    if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
      ++bp;
      decode(header, bp);
      dout(10) << "tmapup new header " << header.length() << dendl;
    }

    encode(header, obl);

    dout(20) << "tmapup initial nkeys " << nkeys << dendl;

    // update keys: merge the sorted update stream (bp) against the
    // sorted existing key stream (ip), emitting into newkeydata
    bufferlist newkeydata;
    string nextkey, last_in_key;
    bufferlist nextval;
    bool have_next = false;   // is (nextkey, nextval) a valid lookahead?
    if (!ip.end()) {
      have_next = true;
      decode(nextkey, ip);
      decode(nextval, ip);
    }
    while (!bp.end() && !result) {
      __u8 op;
      string key;
      try {
	decode(op, bp);
	decode(key, bp);
      }
      catch (ceph::buffer::error& e) {
	return -EINVAL;
      }
      // the streaming merge requires the update keys to be sorted;
      // if not, rewind and redo everything the slow way
      if (key < last_in_key) {
	dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
		<< "', falling back to an inefficient (unsorted) update" << dendl;
	bp = orig_bp;
	return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
      }
      last_in_key = key;

      dout(10) << "tmapup op " << (int)op << " key " << key << dendl;

      // skip existing intervening keys: copy through every on-disk key
      // that sorts before this update key; detect an exact match
      bool key_exists = false;
      while (have_next && !key_exists) {
	dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
	if (nextkey > key)
	  break;
	if (nextkey < key) {
	  // copy untouched.
	  encode(nextkey, newkeydata);
	  encode(nextval, newkeydata);
	  dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
	} else {
	  // don't copy; discard old value. and stop.
	  dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
	  key_exists = true;
	  nkeys--;
	}
	// advance the on-disk lookahead
	if (!ip.end()) {
	  decode(nextkey, ip);
	  decode(nextval, ip);
	} else {
	  have_next = false;
	}
      }

      if (op == CEPH_OSD_TMAP_SET) {
	// set: emit the new value regardless of prior existence
	bufferlist val;
	try {
	  decode(val, bp);
	}
	catch (ceph::buffer::error& e) {
	  return -EINVAL;
	}
	encode(key, newkeydata);
	encode(val, newkeydata);
	dout(20) << " set " << key << " " << val.length() << dendl;
	nkeys++;
      } else if (op == CEPH_OSD_TMAP_CREATE) {
	// create: like set, but the key must not already exist
	if (key_exists) {
	  return -EEXIST;
	}
	bufferlist val;
	try {
	  decode(val, bp);
	}
	catch (ceph::buffer::error& e) {
	  return -EINVAL;
	}
	encode(key, newkeydata);
	encode(val, newkeydata);
	dout(20) << " create " << key << " " << val.length() << dendl;
	nkeys++;
      } else if (op == CEPH_OSD_TMAP_RM) {
	// do nothing (the matched key was already dropped above),
	// but the key must have existed
	if (!key_exists) {
	  return -ENOENT;
	}
      } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
	// do nothing; absence is not an error
      } else {
	dout(10) << " invalid tmap op " << (int)op << dendl;
	return -EINVAL;
      }
    }

    // copy remaining: flush the lookahead pair, if any
    if (have_next) {
      encode(nextkey, newkeydata);
      encode(nextval, newkeydata);
      dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
    }
    // then copy any still-encoded trailing keys verbatim
    if (!ip.end()) {
      bufferlist rest;
      rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
      dout(20) << " keep trailing " << rest.length()
	       << " at " << newkeydata.length() << dendl;
      newkeydata.claim_append(rest);
    }

    // encode final key count + key data
    dout(20) << "tmapup final nkeys " << nkeys << dendl;
    encode(nkeys, obl);
    obl.claim_append(newkeydata);

    if (0) {
      // debug-only re-decode of the result (disabled)
      dout(30) << " final is \n";
      obl.hexdump(*_dout);
      *_dout << dendl;

      // sanity check
      auto tp = obl.cbegin();
      bufferlist h;
      decode(h, tp);
      map<string,bufferlist> d;
      decode(d, tp);
      ceph_assert(tp.end());
      dout(0) << " **** debug sanity check, looks ok ****" << dendl;
    }

    // write it out, replacing the whole object payload
    if (!result) {
      dout(20) << "tmapput write " << obl.length() << dendl;
      newop.op.op = CEPH_OSD_OP_WRITEFULL;
      newop.op.extent.offset = 0;
      newop.op.extent.length = obl.length();
      newop.indata = obl;
      do_osd_ops(ctx, nops);
    }
  }
  return result;
}
5316
11fdf7f2
TL
5317static int check_offset_and_length(uint64_t offset, uint64_t length,
5318 uint64_t max, DoutPrefixProvider *dpp)
7c673cae
FG
5319{
5320 if (offset >= max ||
5321 length > max ||
11fdf7f2
TL
5322 offset + length > max) {
5323 ldpp_dout(dpp, 10) << __func__ << " "
5324 << "osd_max_object_size: " << max
5325 << "; Hard limit of object size is 4GB." << dendl;
7c673cae 5326 return -EFBIG;
11fdf7f2 5327 }
7c673cae
FG
5328
5329 return 0;
5330}
5331
// Completion for a read that fills in caller-owned result slots and,
// when the whole object was read, optionally verifies its crc32c
// against a recorded data digest.
//
// The caller retains ownership of *r (bytes read), *rval (op result
// code) and *outdatap (the read buffer); this context only writes to
// them from finish().
struct FillInVerifyExtent : public Context {
  ceph_le64 *r;                        // out: number of bytes read
  int32_t *rval;                       // out: op result code
  bufferlist *outdatap;                // read payload (caller-owned)
  std::optional<uint32_t> maybe_crc;   // expected full-object crc, if known
  uint64_t size;                       // full object size
  OSDService *osd;
  hobject_t soid;
  uint32_t flags;                      // CEPH_OSD_OP_FLAG_* of the read op
  FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
		     std::optional<uint32_t> mc, uint64_t size,
		     OSDService *osd, hobject_t soid, uint32_t flags) :
    r(r), rval(rv), outdatap(blp), maybe_crc(mc),
    size(size), osd(osd), soid(soid), flags(flags) {}
  void finish(int len) override {
    // propagate read errors unchanged
    if (len < 0) {
      *rval = len;
      return;
    }
    *r = len;
    *rval = 0;

    // whole object? can we verify the checksum?
    if (maybe_crc && *r == size) {
      uint32_t crc = outdatap->crc32c(-1);
      if (maybe_crc != crc) {
	osd->clog->error() << std::hex << " full-object read crc 0x" << crc
			   << " != expected 0x" << *maybe_crc
			   << std::dec << " on " << soid;
	// unless the client asked for best-effort (FAILOK), surface EIO
	if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
	  *rval = -EIO;
	  *r = 0;
	}
      }
    }
  }
};
5369
// Completion that converts a plain read result into sparse-read wire
// format: the data buffer is rewritten as an encoded extent map
// {data_offset: bytes_read} followed by the data itself.
//
// *result, *data_bl and *len are caller-owned; finish() writes to them.
struct ToSparseReadResult : public Context {
  int* result;           // out: 0 on success, else the read error
  bufferlist* data_bl;   // in: raw data; out: encoded sparse-read payload
  uint64_t data_offset;  // offset of the read within the object
  ceph_le64* len;        // out: number of bytes read
  ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
		     ceph_le64* len)
    : result(result), data_bl(bl), data_offset(offset), len(len) {}
  void finish(int r) override {
    // propagate read errors unchanged
    if (r < 0) {
      *result = r;
      return;
    }
    *result = 0;
    *len = r;
    // build: encoded extent map, then the data (moved, not copied)
    bufferlist outdata;
    map<uint64_t, uint64_t> extents = {{data_offset, r}};
    encode(extents, outdata);
    encode_destructively(*data_bl, outdata);
    data_bl->swap(outdata);
  }
};
5392
// Render the keys of a map as a single comma-separated string
// (in the map's iteration order, i.e. sorted).
template<typename V>
static string list_keys(const map<string, V>& m) {
  string joined;
  for (const auto& kv : m) {
    if (!joined.empty()) {
      joined.push_back(',');
    }
    joined.append(kv.first);
  }
  return joined;
}
5404
// Render the elements of any string container as a single
// comma-separated string, in iteration order.
template<typename T>
static string list_entries(const T& m) {
  string joined;
  for (const auto& entry : m) {
    if (!joined.empty()) {
      joined.push_back(',');
    }
    joined.append(entry);
  }
  return joined;
}
5416
5417void PrimaryLogPG::maybe_create_new_object(
5418 OpContext *ctx,
5419 bool ignore_transaction)
5420{
5421 ObjectState& obs = ctx->new_obs;
5422 if (!obs.exists) {
5423 ctx->delta_stats.num_objects++;
5424 obs.exists = true;
11fdf7f2 5425 ceph_assert(!obs.oi.is_whiteout());
7c673cae
FG
5426 obs.oi.new_object();
5427 if (!ignore_transaction)
5428 ctx->op_t->create(obs.oi.soid);
5429 } else if (obs.oi.is_whiteout()) {
5430 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5431 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5432 --ctx->delta_stats.num_whiteouts;
5433 }
5434}
5435
c07f9fc5
FG
// OpFinisher for async reads: when the op is re-executed after the
// read completes, simply report the rval that the read completion
// already stored on the OSDOp.
struct ReadFinisher : public PrimaryLogPG::OpFinisher {
  OSDOp& osd_op;

  explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
  }

  int execute() override {
    return osd_op.rval;
  }
};
5446
7c673cae
FG
// Completion for the async-read path of do_checksum(): first runs a
// FillInVerifyExtent over the read result (length/rval bookkeeping and
// optional full-object crc verification), then, on success, computes
// the requested checksum over read_bl via finish_checksum().
struct C_ChecksumRead : public Context {
  PrimaryLogPG *primary_log_pg;
  OSDOp &osd_op;
  Checksummer::CSumType csum_type;
  bufferlist init_value_bl;   // client-supplied checksum seed value
  ceph_le64 read_length;
  bufferlist read_bl;         // filled by the async read
  Context *fill_extent_ctx;   // owned until complete()d (see finish)

  C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
		 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
		 std::optional<uint32_t> maybe_crc, uint64_t size,
		 OSDService *osd, hobject_t soid, uint32_t flags)
    : primary_log_pg(primary_log_pg), osd_op(osd_op),
      csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
      fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
					     &read_bl, maybe_crc, size,
					     osd, soid, flags)) {
  }
  ~C_ChecksumRead() override {
    // only reached if finish() never ran (it nulls the pointer)
    delete fill_extent_ctx;
  }

  void finish(int r) override {
    // complete() consumes (deletes) fill_extent_ctx; null it so the
    // destructor does not double-delete
    fill_extent_ctx->complete(r);
    fill_extent_ctx = nullptr;

    if (osd_op.rval >= 0) {
      bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
      osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
						    &init_value_bl_it, read_bl);
    }
  }
};
5481
/**
 * CEPH_OSD_OP_CHECKSUM: compute a client-requested checksum
 * (xxhash32/64 or crc32c, optionally chunked) over an extent of the
 * object.
 *
 * Validates chunking parameters, clamps the range to the object size,
 * extracts the client-supplied init value from bl_it, then either:
 *  - erasure pools: queues an async read and returns -EINPROGRESS (the
 *    C_ChecksumRead completion finishes the checksum later), or
 *  - replicated pools: performs a sync read and computes the checksum
 *    inline via finish_checksum().
 *
 * @param bl_it  input iterator positioned at the init value; advanced
 *               past it on success
 * @return 0/positive on success, -EINPROGRESS for the async path, or a
 *         negative error code
 */
int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
			      bufferlist::const_iterator *bl_it)
{
  dout(20) << __func__ << dendl;

  auto& op = osd_op.op;
  if (op.checksum.chunk_size > 0) {
    // chunked mode requires an explicit, chunk-aligned length
    if (op.checksum.length == 0) {
      dout(10) << __func__ << ": length required when chunk size provided"
	       << dendl;
      return -EINVAL;
    }
    if (op.checksum.length % op.checksum.chunk_size != 0) {
      dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
      return -EINVAL;
    }
  }

  auto& oi = ctx->new_obs.oi;
  if (op.checksum.offset == 0 && op.checksum.length == 0) {
    // zeroed offset+length implies checksum whole object
    op.checksum.length = oi.size;
  } else if (op.checksum.offset >= oi.size) {
    // read size was trimmed to zero, do nothing
    // see PrimaryLogPG::do_read
    return 0;
  } else if (op.extent.offset + op.extent.length > oi.size) {
    // NOTE(review): op.extent and op.checksum are arms of the
    // ceph_osd_op union whose offset/length fields overlay each other,
    // so this trims op.checksum.length too — presumably intentional;
    // confirm against the ceph_osd_op layout in include/rados.h.
    op.extent.length = oi.size - op.extent.offset;
    if (op.checksum.chunk_size > 0 &&
	op.checksum.length % op.checksum.chunk_size != 0) {
      dout(10) << __func__ << ": length (trimmed to 0x"
	       << std::hex << op.checksum.length
	       << ") not aligned to chunk size 0x"
	       << op.checksum.chunk_size << std::dec
	       << dendl;
      return -EINVAL;
    }
  }

  // map the wire checksum type onto the internal Checksummer type
  Checksummer::CSumType csum_type;
  switch (op.checksum.type) {
  case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
    csum_type = Checksummer::CSUM_XXHASH32;
    break;
  case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
    csum_type = Checksummer::CSUM_XXHASH64;
    break;
  case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
    csum_type = Checksummer::CSUM_CRC32C;
    break;
  default:
    dout(10) << __func__ << ": unknown crc type ("
	     << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
    return -EINVAL;
  }

  // the client must supply an init value sized for the checksum type
  size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
  if (bl_it->get_remaining() < csum_init_value_size) {
    dout(10) << __func__ << ": init value not provided" << dendl;
    return -EINVAL;
  }

  bufferlist init_value_bl;
  init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
			  csum_init_value_size);
  *bl_it += csum_init_value_size;

  if (pool.info.is_erasure() && op.checksum.length > 0) {
    // If there is a data digest and it is possible we are reading
    // entire object, pass the digest.
    std::optional<uint32_t> maybe_crc;
    if (oi.is_data_digest() && op.checksum.offset == 0 &&
	op.checksum.length >= oi.size) {
      maybe_crc = oi.data_digest;
    }

    // async read; the completion computes the checksum when data arrives
    auto& soid = oi.soid;
    auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
					   std::move(init_value_bl), maybe_crc,
					   oi.size, osd, soid, op.flags);

    ctx->pending_async_reads.push_back({
      {op.checksum.offset, op.checksum.length, op.flags},
      {&checksum_ctx->read_bl, checksum_ctx}});

    dout(10) << __func__ << ": async_read noted for " << soid << dendl;
    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
    return -EINPROGRESS;
  }

  // sync read (skipped entirely for a zero-length range)
  std::vector<OSDOp> read_ops(1);
  auto& read_op = read_ops[0];
  if (op.checksum.length > 0) {
    read_op.op.op = CEPH_OSD_OP_READ;
    read_op.op.flags = op.flags;
    read_op.op.extent.offset = op.checksum.offset;
    read_op.op.extent.length = op.checksum.length;
    read_op.op.extent.truncate_size = 0;
    read_op.op.extent.truncate_seq = 0;

    int r = do_osd_ops(ctx, read_ops);
    if (r < 0) {
      derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
  return finish_checksum(osd_op, csum_type, &init_value_bl_it,
			 read_op.outdata);
}
5596
/**
 * Compute the requested checksum(s) over read_bl and encode the result
 * into osd_op.outdata as: csum_count (u32) followed by csum_count
 * checksum values of the type's value size.
 *
 * With a nonzero chunk_size the data is checksummed per chunk; with
 * chunk_size == 0 a single checksum covers the whole buffer.  The
 * client-supplied init value is decoded from *init_value_bl_it once per
 * call (each chunk restarts from the same seed per Checksummer's API).
 *
 * @return 0 on success, -EINVAL if the read returned a different number
 *         of bytes than requested
 */
int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
				  Checksummer::CSumType csum_type,
				  bufferlist::const_iterator *init_value_bl_it,
				  const bufferlist &read_bl) {
  dout(20) << __func__ << dendl;

  auto& op = osd_op.op;

  // a short read means the object changed size under us; bail out
  if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
    derr << __func__ << ": bytes read " << read_bl.length() << " != "
	 << op.checksum.length << dendl;
    return -EINVAL;
  }

  // chunk_size == 0 -> one chunk spanning the whole buffer
  size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
			      op.checksum.chunk_size : read_bl.length());
  uint32_t csum_count = (csum_chunk_size > 0 ?
			   read_bl.length() / csum_chunk_size : 0);

  bufferlist csum;
  bufferptr csum_data;
  if (csum_count > 0) {
    size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
    // one contiguous buffer holding all per-chunk checksum values
    csum_data = ceph::buffer::create(csum_value_size * csum_count);
    csum_data.zero();
    csum.append(csum_data);

    switch (csum_type) {
    case Checksummer::CSUM_XXHASH32:
      {
	Checksummer::xxhash32::init_value_t init_value;
	decode(init_value, *init_value_bl_it);
	Checksummer::calculate<Checksummer::xxhash32>(
	  init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
	  &csum_data);
      }
      break;
    case Checksummer::CSUM_XXHASH64:
      {
	Checksummer::xxhash64::init_value_t init_value;
	decode(init_value, *init_value_bl_it);
	Checksummer::calculate<Checksummer::xxhash64>(
	  init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
	  &csum_data);
      }
      break;
    case Checksummer::CSUM_CRC32C:
      {
	Checksummer::crc32c::init_value_t init_value;
	decode(init_value, *init_value_bl_it);
	Checksummer::calculate<Checksummer::crc32c>(
	  init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
	  &csum_data);
      }
      break;
    default:
      // unreachable: do_checksum validated the type already
      break;
    }
  }

  // reply payload: count followed by the checksum values
  encode(csum_count, osd_op.outdata);
  osd_op.outdata.claim_append(csum);
  return 0;
}
5661
c07f9fc5
FG
// Completion for the async-read path of do_extent_cmp(): runs a
// FillInVerifyExtent over the read result, then compares the data
// against the client payload via finish_extent_cmp().
//
// A read of -ENOENT is treated as an empty object (compare against
// zeros), not an error.
struct C_ExtentCmpRead : public Context {
  PrimaryLogPG *primary_log_pg;
  OSDOp &osd_op;
  ceph_le64 read_length{};
  bufferlist read_bl;         // filled by the async read
  Context *fill_extent_ctx;   // owned until consumed/deleted in finish()

  C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
		  std::optional<uint32_t> maybe_crc, uint64_t size,
		  OSDService *osd, hobject_t soid, uint32_t flags)
    : primary_log_pg(primary_log_pg), osd_op(osd_op),
      fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
					     &read_bl, maybe_crc, size,
					     osd, soid, flags)) {
  }
  ~C_ExtentCmpRead() override {
    // only reached if finish() never ran (it nulls the pointer)
    delete fill_extent_ctx;
  }

  void finish(int r) override {
    if (r == -ENOENT) {
      // missing object: compare against an empty buffer; the unused
      // extent context must be deleted directly (never complete()d)
      osd_op.rval = 0;
      read_bl.clear();
      delete fill_extent_ctx;
    } else {
      // complete() runs and deletes fill_extent_ctx
      fill_extent_ctx->complete(r);
    }
    fill_extent_ctx = nullptr;

    if (osd_op.rval >= 0) {
      osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
    }
  }
};
5696
/**
 * CEPH_OSD_OP_CMPEXT: compare an extent of the object against the
 * client-supplied data in osd_op.indata.
 *
 * The extent is clamped by the truncate state and object size.  A
 * zero-length (or missing/whiteout) object compares against an empty
 * buffer.  Erasure pools take an async read path (-EINPROGRESS with a
 * C_ExtentCmpRead completion); replicated pools read synchronously and
 * compare inline via finish_extent_cmp().
 *
 * @return 0 if equal, (-MAX_ERRNO - idx) at the first mismatching byte,
 *         -EINPROGRESS for the async path, or a negative read error
 */
int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
{
  dout(20) << __func__ << dendl;
  ceph_osd_op& op = osd_op.op;

  auto& oi = ctx->new_obs.oi;
  uint64_t size = oi.size;
  // an in-flight truncate with a newer seq caps the readable size
  if ((oi.truncate_seq < op.extent.truncate_seq) &&
      (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
    size = op.extent.truncate_size;
  }

  // clamp the compare range to the (possibly truncated) object size
  if (op.extent.offset >= size) {
    op.extent.length = 0;
  } else if (op.extent.offset + op.extent.length > size) {
    op.extent.length = size - op.extent.offset;
  }

  if (op.extent.length == 0) {
    dout(20) << __func__ << " zero length extent" << dendl;
    return finish_extent_cmp(osd_op, bufferlist{});
  } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
    dout(20) << __func__ << " object DNE" << dendl;
    return finish_extent_cmp(osd_op, {});
  } else if (pool.info.is_erasure()) {
    // If there is a data digest and it is possible we are reading
    // entire object, pass the digest.
    // NOTE(review): op.checksum here reads the same union storage as
    // op.extent (offset/length overlay in ceph_osd_op) — presumably
    // deliberate reuse; confirm against include/rados.h.
    std::optional<uint32_t> maybe_crc;
    if (oi.is_data_digest() && op.checksum.offset == 0 &&
	op.checksum.length >= oi.size) {
      maybe_crc = oi.data_digest;
    }

    // async read; the completion performs the comparison on arrival
    auto& soid = oi.soid;
    auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
					      osd, soid, op.flags);
    ctx->pending_async_reads.push_back({
      {op.extent.offset, op.extent.length, op.flags},
      {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});

    dout(10) << __func__ << ": async_read noted for " << soid << dendl;

    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
    return -EINPROGRESS;
  }

  // sync read of the extent, honoring the op's truncate parameters
  vector<OSDOp> read_ops(1);
  OSDOp& read_op = read_ops[0];

  read_op.op.op = CEPH_OSD_OP_SYNC_READ;
  read_op.op.extent.offset = op.extent.offset;
  read_op.op.extent.length = op.extent.length;
  read_op.op.extent.truncate_seq = op.extent.truncate_seq;
  read_op.op.extent.truncate_size = op.extent.truncate_size;

  int result = do_osd_ops(ctx, read_ops);
  if (result < 0) {
    derr << __func__ << " failed " << result << dendl;
    return result;
  }
  return finish_extent_cmp(osd_op, read_op.outdata);
}
5762
5763int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5764{
5765 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5766 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5767 if (osd_op.indata[idx] != read_byte) {
5768 return (-MAX_ERRNO - idx);
5769 }
5770 }
5771
5772 return 0;
5773}
5774
/**
 * CEPH_OSD_OP_READ / CEPH_OSD_OP_SYNC_READ: read an extent of the
 * object into osd_op.outdata.
 *
 * The requested extent is clipped against the effective object size
 * (taking a racing truncate into account; length 0 means "whole
 * object").  On erasure-coded pools the read is queued asynchronously
 * and a ReadFinisher is registered; on replicated pools the read is
 * synchronous, with an optional full-object data-digest check.
 *
 * @return 0 on success (or successful async dispatch), -EAGAIN when a
 *         repair was kicked off and the op should be retried, or a
 *         negative error code.
 */
int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;
  __u32 seq = oi.truncate_seq;
  uint64_t size = oi.size;
  // distinguishes "length clipped to 0" from "client asked for 0 bytes"
  bool trimmed_read = false;

  dout(30) << __func__ << " oi.size: " << oi.size << dendl;
  dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
  dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
  dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;

  // are we beyond truncate_size?  If the client's truncate_seq is newer
  // than ours, honor its (smaller) truncate_size as the effective size.
  if ( (seq < op.extent.truncate_seq) &&
       (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
       (size > op.extent.truncate_size) )
    size = op.extent.truncate_size;

  if (op.extent.length == 0) //length is zero mean read the whole object
    op.extent.length = size;

  // clip the extent to the effective object size
  if (op.extent.offset >= size) {
    op.extent.length = 0;
    trimmed_read = true;
  } else if (op.extent.offset + op.extent.length > size) {
    op.extent.length = size - op.extent.offset;
    trimmed_read = true;
  }

  dout(30) << __func__ << "op.extent.length is now " << op.extent.length << dendl;

  // read into a buffer
  int result = 0;
  if (trimmed_read && op.extent.length == 0) {
    // read size was trimmed to zero and it is expected to do nothing
    // a read operation of 0 bytes does *not* do nothing, this is why
    // the trimmed_read boolean is needed
  } else if (pool.info.is_erasure()) {
    // The initialisation below is required to silence a false positive
    // -Wmaybe-uninitialized warning
    std::optional<uint32_t> maybe_crc;
    // If there is a data digest and it is possible we are reading
    // entire object, pass the digest.  FillInVerifyExtent will
    // will check the oi.size again.
    if (oi.is_data_digest() && op.extent.offset == 0 &&
        op.extent.length >= oi.size)
      maybe_crc = oi.data_digest;
    // queue the async read; FillInVerifyExtent fixes up the result
    // length / rval and (optionally) verifies the digest on completion
    ctx->pending_async_reads.push_back(
      make_pair(
        boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
        make_pair(&osd_op.outdata,
                  new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
                                         &osd_op.outdata, maybe_crc, oi.size,
                                         osd, soid, op.flags))));
    dout(10) << " async_read noted for " << soid << dendl;

    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
  } else {
    // synchronous (replicated-pool) read
    int r = pgbackend->objects_read_sync(
      soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
    // whole object?  can we verify the checksum?
    if (r >= 0 && op.extent.offset == 0 &&
        (uint64_t)r == oi.size && oi.is_data_digest()) {
      uint32_t crc = osd_op.outdata.crc32c(-1);
      if (oi.data_digest != crc) {
        osd->clog->error() << info.pgid << std::hex
                           << " full-object read crc 0x" << crc
                           << " != expected 0x" << oi.data_digest
                           << std::dec << " on " << soid;
        r = -EIO; // try repair later
      }
    }
    if (r == -EIO) {
      // local read failure/corruption: try to repair the primary copy.
      // NOTE(review): presumably returns -EAGAIN so the client retries
      // after recovery -- confirm against rep_repair_primary_object().
      r = rep_repair_primary_object(soid, ctx);
    }
    if (r >= 0)
      op.extent.length = r;  // report bytes actually read
    else if (r == -EAGAIN) {
      result = -EAGAIN;
    } else {
      result = r;
      op.extent.length = 0;
    }
    dout(10) << " read got " << r << " / " << op.extent.length
             << " bytes from obj " << soid << dendl;
  }
  if (result >= 0) {
    // read accounting (async path accounts the clipped request length)
    ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
    ctx->delta_stats.num_rd++;
  }
  return result;
}
5870
/**
 * CEPH_OSD_OP_SPARSE_READ: read the allocated extents of an object and
 * return an encoded extent map followed by the corresponding data.
 *
 * On erasure-coded pools (no native sparse read) the request is
 * translated into a plain async read whose output is converted by
 * ToSparseReadResult.  On replicated pools, fiemap supplies the extent
 * map and objects_readv_sync reads the data, with an optional
 * full-object data-digest check.
 *
 * @return 0 on success (or successful async dispatch), else a negative
 *         error code.
 */
int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;
  uint64_t size = oi.size;
  uint64_t offset = op.extent.offset;
  uint64_t length = op.extent.length;

  // are we beyond truncate_size?  A newer client truncate_seq means its
  // (smaller) truncate_size is the effective object size.
  if ((oi.truncate_seq < op.extent.truncate_seq) &&
      (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
      (size > op.extent.truncate_size)) {
    size = op.extent.truncate_size;
  }

  // clip the requested range to the effective size
  if (offset > size) {
    length = 0;
  } else if (offset + length > size) {
    length = size - offset;
  }

  ++ctx->num_read;
  if (pool.info.is_erasure()) {
    // translate sparse read to a normal one if not supported

    if (length > 0) {
      // ToSparseReadResult rewrites the flat read into
      // (extent map, data) form and fixes up op.extent.length and
      // osd_op.rval when the async read completes
      ctx->pending_async_reads.push_back(
        make_pair(
          boost::make_tuple(offset, length, op.flags),
          make_pair(
            &osd_op.outdata,
            new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
                                   &op.extent.length))));
      dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;

      ctx->op_finishers[ctx->current_osd_subop_num].reset(
        new ReadFinisher(osd_op));
    } else {
      // nothing to read: return an empty extent map immediately
      dout(10) << " sparse read ended up empty for " << soid << dendl;
      map<uint64_t, uint64_t> extents;
      encode(extents, osd_op.outdata);
    }
  } else {
    // read into a buffer
    map<uint64_t, uint64_t> m;
    // discover the allocated extents within [offset, offset+length)
    int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
                                              info.pgid.shard),
                               offset, length, m);
    if (r < 0) {
      return r;
    }

    bufferlist data_bl;
    // NOTE(review): m is passed via std::move but re-encoded below;
    // objects_readv_sync presumably leaves the (possibly trimmed) extent
    // map in m -- confirm against the PGBackend signature.
    r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
    if (r == -EIO) {
      // local corruption: attempt repair of the primary copy
      r = rep_repair_primary_object(soid, ctx);
    }
    if (r < 0) {
      return r;
    }

    // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
    // Maybe at first, there is no much whole objects. With continued use, more
    // and more whole object exist. So from this point, for spare-read add
    // checksum make sense.
    if ((uint64_t)r == oi.size && oi.is_data_digest()) {
      uint32_t crc = data_bl.crc32c(-1);
      if (oi.data_digest != crc) {
        osd->clog->error() << info.pgid << std::hex
                           << " full-object read crc 0x" << crc
                           << " != expected 0x" << oi.data_digest
                           << std::dec << " on " << soid;
        r = rep_repair_primary_object(soid, ctx);
        if (r < 0) {
          return r;
        }
      }
    }

    // report the number of data bytes actually read
    op.extent.length = r;

    encode(m, osd_op.outdata); // re-encode since it might be modified
    ::encode_destructively(data_bl, osd_op.outdata);

    dout(10) << " sparse_read got " << r << " bytes from object "
             << soid << dendl;
  }

  // read accounting (async path accounts the requested/clipped length)
  ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
  ctx->delta_stats.num_rd++;
  return 0;
}
5964
7c673cae
FG
5965int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5966{
5967 int result = 0;
5968 SnapSetContext *ssc = ctx->obc->ssc;
5969 ObjectState& obs = ctx->new_obs;
5970 object_info_t& oi = obs.oi;
5971 const hobject_t& soid = oi.soid;
11fdf7f2
TL
5972 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5973 osd->osd_skip_data_digest;
7c673cae 5974
7c673cae
FG
5975 PGTransaction* t = ctx->op_t.get();
5976
5977 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5978
c07f9fc5 5979 ctx->current_osd_subop_num = 0;
b32b8144 5980 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
7c673cae
FG
5981 OSDOp& osd_op = *p;
5982 ceph_osd_op& op = osd_op.op;
5983
c07f9fc5
FG
5984 OpFinisher* op_finisher = nullptr;
5985 {
5986 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5987 if (op_finisher_it != ctx->op_finishers.end()) {
5988 op_finisher = op_finisher_it->second.get();
5989 }
5990 }
5991
9f95a23c 5992 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
7c673cae
FG
5993 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5994 // but the code in this function seems to treat them as native-endian. What should the
5995 // tracepoints do?
5996 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5997
5998 dout(10) << "do_osd_op " << osd_op << dendl;
5999
11fdf7f2 6000 auto bp = osd_op.indata.cbegin();
7c673cae
FG
6001
6002 // user-visible modifcation?
6003 switch (op.op) {
6004 // non user-visible modifications
6005 case CEPH_OSD_OP_WATCH:
6006 case CEPH_OSD_OP_CACHE_EVICT:
6007 case CEPH_OSD_OP_CACHE_FLUSH:
6008 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6009 case CEPH_OSD_OP_UNDIRTY:
6010 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
9f95a23c 6011 case CEPH_OSD_OP_COPY_FROM2:
7c673cae
FG
6012 case CEPH_OSD_OP_CACHE_PIN:
6013 case CEPH_OSD_OP_CACHE_UNPIN:
31f18b77 6014 case CEPH_OSD_OP_SET_REDIRECT:
f67539c2 6015 case CEPH_OSD_OP_SET_CHUNK:
11fdf7f2 6016 case CEPH_OSD_OP_TIER_PROMOTE:
9f95a23c 6017 case CEPH_OSD_OP_TIER_FLUSH:
f67539c2 6018 case CEPH_OSD_OP_TIER_EVICT:
7c673cae
FG
6019 break;
6020 default:
6021 if (op.op & CEPH_OSD_OP_MODE_WR)
6022 ctx->user_modify = true;
6023 }
6024
6025 // munge -1 truncate to 0 truncate
6026 if (ceph_osd_op_uses_extent(op.op) &&
6027 op.extent.truncate_seq == 1 &&
6028 op.extent.truncate_size == (-1ULL)) {
6029 op.extent.truncate_size = 0;
6030 op.extent.truncate_seq = 0;
6031 }
6032
6033 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
6034 if (op.op == CEPH_OSD_OP_ZERO &&
11fdf7f2
TL
6035 obs.exists &&
6036 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
6037 op.extent.length >= 1 &&
6038 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
7c673cae
FG
6039 op.extent.offset + op.extent.length >= oi.size) {
6040 if (op.extent.offset >= oi.size) {
6041 // no-op
6042 goto fail;
6043 }
6044 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
6045 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
6046 op.op = CEPH_OSD_OP_TRUNCATE;
6047 }
6048
6049 switch (op.op) {
f67539c2 6050
7c673cae
FG
6051 // --- READS ---
6052
6053 case CEPH_OSD_OP_CMPEXT:
6054 ++ctx->num_read;
c07f9fc5
FG
6055 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
6056 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6057 op.extent.length, op.extent.truncate_size,
6058 op.extent.truncate_seq);
6059
6060 if (op_finisher == nullptr) {
6061 result = do_extent_cmp(ctx, osd_op);
6062 } else {
6063 result = op_finisher->execute();
6064 }
7c673cae
FG
6065 break;
6066
6067 case CEPH_OSD_OP_SYNC_READ:
11fdf7f2 6068 if (pool.info.is_erasure()) {
7c673cae
FG
6069 result = -EOPNOTSUPP;
6070 break;
6071 }
6072 // fall through
6073 case CEPH_OSD_OP_READ:
6074 ++ctx->num_read;
c07f9fc5
FG
6075 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
6076 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6077 op.extent.length, op.extent.truncate_size,
6078 op.extent.truncate_seq);
6079 if (op_finisher == nullptr) {
6080 if (!ctx->data_off) {
7c673cae
FG
6081 ctx->data_off = op.extent.offset;
6082 }
c07f9fc5
FG
6083 result = do_read(ctx, osd_op);
6084 } else {
6085 result = op_finisher->execute();
7c673cae
FG
6086 }
6087 break;
6088
6089 case CEPH_OSD_OP_CHECKSUM:
6090 ++ctx->num_read;
6091 {
6092 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
6093 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
6094 op.checksum.offset, op.checksum.length,
6095 op.checksum.chunk_size);
6096
c07f9fc5
FG
6097 if (op_finisher == nullptr) {
6098 result = do_checksum(ctx, osd_op, &bp);
6099 } else {
6100 result = op_finisher->execute();
7c673cae
FG
6101 }
6102 }
6103 break;
6104
6105 /* map extents */
6106 case CEPH_OSD_OP_MAPEXT:
6107 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
11fdf7f2 6108 if (pool.info.is_erasure()) {
7c673cae
FG
6109 result = -EOPNOTSUPP;
6110 break;
6111 }
6112 ++ctx->num_read;
6113 {
6114 // read into a buffer
6115 bufferlist bl;
6116 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
6117 info.pgid.shard),
6118 op.extent.offset, op.extent.length, bl);
f67539c2 6119 osd_op.outdata = std::move(bl);
7c673cae
FG
6120 if (r < 0)
6121 result = r;
6122 else
11fdf7f2 6123 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
7c673cae
FG
6124 ctx->delta_stats.num_rd++;
6125 dout(10) << " map_extents done on object " << soid << dendl;
6126 }
6127 break;
6128
6129 /* map extents */
6130 case CEPH_OSD_OP_SPARSE_READ:
c07f9fc5
FG
6131 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
6132 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6133 op.extent.length, op.extent.truncate_size,
6134 op.extent.truncate_seq);
6135 if (op_finisher == nullptr) {
6136 result = do_sparse_read(ctx, osd_op);
7c673cae 6137 } else {
c07f9fc5 6138 result = op_finisher->execute();
7c673cae 6139 }
7c673cae
FG
6140 break;
6141
6142 case CEPH_OSD_OP_CALL:
6143 {
6144 string cname, mname;
6145 bufferlist indata;
6146 try {
6147 bp.copy(op.cls.class_len, cname);
6148 bp.copy(op.cls.method_len, mname);
6149 bp.copy(op.cls.indata_len, indata);
f67539c2 6150 } catch (ceph::buffer::error& e) {
7c673cae
FG
6151 dout(10) << "call unable to decode class + method + indata" << dendl;
6152 dout(30) << "in dump: ";
6153 osd_op.indata.hexdump(*_dout);
6154 *_dout << dendl;
6155 result = -EINVAL;
6156 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
6157 break;
6158 }
6159 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
6160
6161 ClassHandler::ClassData *cls;
9f95a23c 6162 result = ClassHandler::get_instance().open_class(cname, &cls);
11fdf7f2 6163 ceph_assert(result == 0); // init_op_flags() already verified this works.
7c673cae 6164
9f95a23c 6165 ClassHandler::ClassMethod *method = cls->get_method(mname);
7c673cae
FG
6166 if (!method) {
6167 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
6168 result = -EOPNOTSUPP;
6169 break;
6170 }
6171
6172 int flags = method->get_flags();
6173 if (flags & CLS_METHOD_WR)
6174 ctx->user_modify = true;
6175
6176 bufferlist outdata;
6177 dout(10) << "call method " << cname << "." << mname << dendl;
6178 int prev_rd = ctx->num_read;
6179 int prev_wr = ctx->num_write;
6180 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
6181
6182 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
6183 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
6184 result = -EIO;
6185 break;
6186 }
6187 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
6188 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
6189 result = -EIO;
6190 break;
6191 }
6192
6193 dout(10) << "method called response length=" << outdata.length() << dendl;
6194 op.extent.length = outdata.length();
6195 osd_op.outdata.claim_append(outdata);
6196 dout(30) << "out dump: ";
6197 osd_op.outdata.hexdump(*_dout);
6198 *_dout << dendl;
6199 }
6200 break;
6201
6202 case CEPH_OSD_OP_STAT:
6203 // note: stat does not require RD
6204 {
6205 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6206
6207 if (obs.exists && !oi.is_whiteout()) {
11fdf7f2
TL
6208 encode(oi.size, osd_op.outdata);
6209 encode(oi.mtime, osd_op.outdata);
7c673cae
FG
6210 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6211 } else {
6212 result = -ENOENT;
6213 dout(10) << "stat oi object does not exist" << dendl;
6214 }
6215
6216 ctx->delta_stats.num_rd++;
6217 }
6218 break;
6219
6220 case CEPH_OSD_OP_ISDIRTY:
6221 ++ctx->num_read;
6222 {
6223 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6224 bool is_dirty = obs.oi.is_dirty();
11fdf7f2 6225 encode(is_dirty, osd_op.outdata);
7c673cae
FG
6226 ctx->delta_stats.num_rd++;
6227 result = 0;
6228 }
6229 break;
6230
6231 case CEPH_OSD_OP_UNDIRTY:
6232 ++ctx->num_write;
9f95a23c 6233 result = 0;
7c673cae
FG
6234 {
6235 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6236 if (oi.is_dirty()) {
6237 ctx->undirty = true; // see make_writeable()
6238 ctx->modify = true;
6239 ctx->delta_stats.num_wr++;
6240 }
7c673cae
FG
6241 }
6242 break;
6243
6244 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6245 ++ctx->num_write;
9f95a23c 6246 result = 0;
7c673cae
FG
6247 {
6248 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
9f95a23c 6249 if (ctx->lock_type != RWState::RWNONE) {
7c673cae
FG
6250 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6251 result = -EINVAL;
6252 break;
6253 }
f67539c2 6254 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
7c673cae
FG
6255 result = -EINVAL;
6256 break;
6257 }
6258 if (!obs.exists) {
6259 result = 0;
6260 break;
6261 }
6262 if (oi.is_cache_pinned()) {
6263 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
6264 result = -EPERM;
6265 break;
6266 }
6267 if (oi.is_dirty()) {
9f95a23c 6268 result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
7c673cae
FG
6269 if (result == -EINPROGRESS)
6270 result = -EAGAIN;
6271 } else {
6272 result = 0;
6273 }
6274 }
6275 break;
6276
6277 case CEPH_OSD_OP_CACHE_FLUSH:
6278 ++ctx->num_write;
9f95a23c 6279 result = 0;
7c673cae
FG
6280 {
6281 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
9f95a23c 6282 if (ctx->lock_type == RWState::RWNONE) {
7c673cae
FG
6283 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6284 result = -EINVAL;
6285 break;
6286 }
f67539c2 6287 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
7c673cae
FG
6288 result = -EINVAL;
6289 break;
6290 }
6291 if (!obs.exists) {
6292 result = 0;
6293 break;
6294 }
6295 if (oi.is_cache_pinned()) {
6296 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
6297 result = -EPERM;
6298 break;
6299 }
6300 hobject_t missing;
6301 if (oi.is_dirty()) {
9f95a23c 6302 result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
7c673cae
FG
6303 if (result == -EINPROGRESS)
6304 result = -EAGAIN;
6305 } else {
6306 result = 0;
6307 }
6308 // Check special return value which has set missing_return
6309 if (result == -ENOENT) {
6310 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
11fdf7f2 6311 ceph_assert(!missing.is_min());
7c673cae
FG
6312 wait_for_unreadable_object(missing, ctx->op);
6313 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6314 result = -EAGAIN;
6315 }
6316 }
6317 break;
6318
6319 case CEPH_OSD_OP_CACHE_EVICT:
6320 ++ctx->num_write;
9f95a23c 6321 result = 0;
7c673cae
FG
6322 {
6323 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
f67539c2 6324 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
7c673cae
FG
6325 result = -EINVAL;
6326 break;
6327 }
6328 if (!obs.exists) {
6329 result = 0;
6330 break;
6331 }
6332 if (oi.is_cache_pinned()) {
6333 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
6334 result = -EPERM;
6335 break;
6336 }
6337 if (oi.is_dirty()) {
6338 result = -EBUSY;
6339 break;
6340 }
6341 if (!oi.watchers.empty()) {
6342 result = -EBUSY;
6343 break;
6344 }
6345 if (soid.snap == CEPH_NOSNAP) {
6346 result = _verify_no_head_clones(soid, ssc->snapset);
6347 if (result < 0)
6348 break;
6349 }
6350 result = _delete_oid(ctx, true, false);
6351 if (result >= 0) {
6352 // mark that this is a cache eviction to avoid triggering normal
11fdf7f2 6353 // make_writeable() clone creation in finish_ctx()
f67539c2 6354 ctx->cache_operation = true;
7c673cae
FG
6355 }
6356 osd->logger->inc(l_osd_tier_evict);
6357 }
6358 break;
6359
6360 case CEPH_OSD_OP_GETXATTR:
6361 ++ctx->num_read;
6362 {
6363 string aname;
6364 bp.copy(op.xattr.name_len, aname);
6365 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6366 string name = "_" + aname;
6367 int r = getattr_maybe_cache(
6368 ctx->obc,
6369 name,
6370 &(osd_op.outdata));
6371 if (r >= 0) {
6372 op.xattr.value_len = osd_op.outdata.length();
6373 result = 0;
11fdf7f2 6374 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
6375 } else
6376 result = r;
6377
6378 ctx->delta_stats.num_rd++;
6379 }
6380 break;
6381
6382 case CEPH_OSD_OP_GETXATTRS:
6383 ++ctx->num_read;
6384 {
6385 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
20effc67 6386 map<string, bufferlist,less<>> out;
7c673cae
FG
6387 result = getattrs_maybe_cache(
6388 ctx->obc,
b32b8144 6389 &out);
f67539c2 6390
7c673cae 6391 bufferlist bl;
11fdf7f2
TL
6392 encode(out, bl);
6393 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
7c673cae
FG
6394 ctx->delta_stats.num_rd++;
6395 osd_op.outdata.claim_append(bl);
6396 }
6397 break;
f67539c2 6398
7c673cae
FG
6399 case CEPH_OSD_OP_CMPXATTR:
6400 ++ctx->num_read;
6401 {
6402 string aname;
6403 bp.copy(op.xattr.name_len, aname);
6404 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6405 string name = "_" + aname;
6406 name[op.xattr.name_len + 1] = 0;
f67539c2 6407
7c673cae
FG
6408 bufferlist xattr;
6409 result = getattr_maybe_cache(
6410 ctx->obc,
6411 name,
6412 &xattr);
6413 if (result < 0 && result != -EEXIST && result != -ENODATA)
6414 break;
f67539c2 6415
7c673cae 6416 ctx->delta_stats.num_rd++;
11fdf7f2 6417 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
7c673cae
FG
6418
6419 switch (op.xattr.cmp_mode) {
6420 case CEPH_OSD_CMPXATTR_MODE_STRING:
6421 {
6422 string val;
6423 bp.copy(op.xattr.value_len, val);
6424 val[op.xattr.value_len] = 0;
6425 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6426 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6427 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6428 }
6429 break;
6430
6431 case CEPH_OSD_CMPXATTR_MODE_U64:
6432 {
6433 uint64_t u64val;
6434 try {
11fdf7f2 6435 decode(u64val, bp);
7c673cae 6436 }
f67539c2 6437 catch (ceph::buffer::error& e) {
7c673cae
FG
6438 result = -EINVAL;
6439 goto fail;
6440 }
6441 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6442 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6443 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6444 }
6445 break;
6446
6447 default:
6448 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6449 result = -EINVAL;
6450 }
6451
6452 if (!result) {
6453 dout(10) << "comparison returned false" << dendl;
6454 result = -ECANCELED;
6455 break;
6456 }
6457 if (result < 0) {
6458 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6459 break;
6460 }
6461
6462 dout(10) << "comparison returned true" << dendl;
6463 }
6464 break;
6465
6466 case CEPH_OSD_OP_ASSERT_VER:
6467 ++ctx->num_read;
6468 {
6469 uint64_t ver = op.assert_ver.ver;
6470 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
1e59de90 6471 if (!ver) {
7c673cae 6472 result = -EINVAL;
1e59de90 6473 } else if (ver < oi.user_version) {
7c673cae 6474 result = -ERANGE;
1e59de90 6475 } else if (ver > oi.user_version) {
7c673cae 6476 result = -EOVERFLOW;
1e59de90 6477 }
7c673cae
FG
6478 }
6479 break;
6480
6481 case CEPH_OSD_OP_LIST_WATCHERS:
6482 ++ctx->num_read;
6483 {
6484 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6485 obj_list_watch_response_t resp;
6486
6487 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6488 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6489 ++oi_iter) {
6490 dout(20) << "key cookie=" << oi_iter->first.first
6491 << " entity=" << oi_iter->first.second << " "
6492 << oi_iter->second << dendl;
11fdf7f2
TL
6493 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6494 ceph_assert(oi_iter->first.second.is_client());
7c673cae
FG
6495
6496 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6497 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6498 resp.entries.push_back(wi);
6499 }
6500
6501 resp.encode(osd_op.outdata, ctx->get_features());
6502 result = 0;
6503
6504 ctx->delta_stats.num_rd++;
6505 break;
6506 }
6507
6508 case CEPH_OSD_OP_LIST_SNAPS:
6509 ++ctx->num_read;
6510 {
6511 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6512 obj_list_snap_response_t resp;
6513
6514 if (!ssc) {
6515 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6516 }
11fdf7f2
TL
6517 ceph_assert(ssc);
6518 dout(20) << " snapset " << ssc->snapset << dendl;
7c673cae
FG
6519
6520 int clonecount = ssc->snapset.clones.size();
11fdf7f2 6521 clonecount++; // for head
7c673cae
FG
6522 resp.clones.reserve(clonecount);
6523 for (auto clone_iter = ssc->snapset.clones.begin();
6524 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6525 clone_info ci;
6526 ci.cloneid = *clone_iter;
6527
6528 hobject_t clone_oid = soid;
6529 clone_oid.snap = *clone_iter;
6530
11fdf7f2
TL
6531 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6532 if (p == ssc->snapset.clone_snaps.end()) {
6533 osd->clog->error() << "osd." << osd->whoami
6534 << ": inconsistent clone_snaps found for oid "
6535 << soid << " clone " << *clone_iter
6536 << " snapset " << ssc->snapset;
6537 result = -EINVAL;
6538 break;
6539 }
6540 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6541 ci.snaps.push_back(*q);
7c673cae
FG
6542 }
6543
6544 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6545
6546 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6547 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6548 if (coi == ssc->snapset.clone_overlap.end()) {
6549 osd->clog->error() << "osd." << osd->whoami
6550 << ": inconsistent clone_overlap found for oid "
6551 << soid << " clone " << *clone_iter;
6552 result = -EINVAL;
6553 break;
6554 }
6555 const interval_set<uint64_t> &o = coi->second;
6556 ci.overlap.reserve(o.num_intervals());
6557 for (interval_set<uint64_t>::const_iterator r = o.begin();
6558 r != o.end(); ++r) {
6559 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6560 r.get_len()));
6561 }
6562
6563 map<snapid_t, uint64_t>::const_iterator si;
6564 si = ssc->snapset.clone_size.find(ci.cloneid);
6565 if (si == ssc->snapset.clone_size.end()) {
6566 osd->clog->error() << "osd." << osd->whoami
6567 << ": inconsistent clone_size found for oid "
6568 << soid << " clone " << *clone_iter;
6569 result = -EINVAL;
6570 break;
6571 }
6572 ci.size = si->second;
6573
6574 resp.clones.push_back(ci);
6575 }
6576 if (result < 0) {
6577 break;
f67539c2 6578 }
11fdf7f2
TL
6579 if (!ctx->obc->obs.oi.is_whiteout()) {
6580 ceph_assert(obs.exists);
7c673cae
FG
6581 clone_info ci;
6582 ci.cloneid = CEPH_NOSNAP;
6583
6584 //Size for HEAD is oi.size
6585 ci.size = oi.size;
6586
6587 resp.clones.push_back(ci);
6588 }
6589 resp.seq = ssc->snapset.seq;
6590
6591 resp.encode(osd_op.outdata);
6592 result = 0;
6593
6594 ctx->delta_stats.num_rd++;
6595 break;
6596 }
6597
6598 case CEPH_OSD_OP_NOTIFY:
6599 ++ctx->num_read;
6600 {
6601 uint32_t timeout;
6602 bufferlist bl;
6603
6604 try {
6605 uint32_t ver; // obsolete
11fdf7f2
TL
6606 decode(ver, bp);
6607 decode(timeout, bp);
6608 decode(bl, bp);
f67539c2 6609 } catch (const ceph::buffer::error &e) {
7c673cae
FG
6610 timeout = 0;
6611 }
6612 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6613 if (!timeout)
6614 timeout = cct->_conf->osd_default_notify_timeout;
6615
6616 notify_info_t n;
6617 n.timeout = timeout;
11fdf7f2 6618 n.notify_id = osd->get_next_id(get_osdmap_epoch());
9f95a23c 6619 n.cookie = op.notify.cookie;
7c673cae
FG
6620 n.bl = bl;
6621 ctx->notifies.push_back(n);
6622
6623 // return our unique notify id to the client
11fdf7f2 6624 encode(n.notify_id, osd_op.outdata);
7c673cae
FG
6625 }
6626 break;
6627
6628 case CEPH_OSD_OP_NOTIFY_ACK:
6629 ++ctx->num_read;
6630 {
6631 try {
6632 uint64_t notify_id = 0;
6633 uint64_t watch_cookie = 0;
11fdf7f2
TL
6634 decode(notify_id, bp);
6635 decode(watch_cookie, bp);
7c673cae
FG
6636 bufferlist reply_bl;
6637 if (!bp.end()) {
11fdf7f2 6638 decode(reply_bl, bp);
7c673cae
FG
6639 }
6640 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6641 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6642 ctx->notify_acks.push_back(ack);
f67539c2 6643 } catch (const ceph::buffer::error &e) {
7c673cae
FG
6644 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6645 OpContext::NotifyAck ack(
6646 // op.watch.cookie is actually the notify_id for historical reasons
6647 op.watch.cookie
6648 );
6649 ctx->notify_acks.push_back(ack);
6650 }
6651 }
6652 break;
6653
6654 case CEPH_OSD_OP_SETALLOCHINT:
6655 ++ctx->num_write;
9f95a23c 6656 result = 0;
7c673cae
FG
6657 {
6658 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6659 maybe_create_new_object(ctx);
6660 oi.expected_object_size = op.alloc_hint.expected_object_size;
6661 oi.expected_write_size = op.alloc_hint.expected_write_size;
6662 oi.alloc_hint_flags = op.alloc_hint.flags;
6663 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6664 op.alloc_hint.expected_write_size,
6665 op.alloc_hint.flags);
7c673cae
FG
6666 }
6667 break;
6668
6669
6670 // --- WRITES ---
6671
6672 // -- object data --
6673
6674 case CEPH_OSD_OP_WRITE:
6675 ++ctx->num_write;
9f95a23c 6676 result = 0;
7c673cae
FG
6677 { // write
6678 __u32 seq = oi.truncate_seq;
6679 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6680 if (op.extent.length != osd_op.indata.length()) {
6681 result = -EINVAL;
6682 break;
6683 }
6684
6685 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6686 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6687
6688 if (pool.info.requires_aligned_append() &&
6689 (op.extent.offset % pool.info.required_alignment() != 0)) {
6690 result = -EOPNOTSUPP;
6691 break;
6692 }
6693
6694 if (!obs.exists) {
6695 if (pool.info.requires_aligned_append() && op.extent.offset) {
6696 result = -EOPNOTSUPP;
6697 break;
6698 }
6699 } else if (op.extent.offset != oi.size &&
6700 pool.info.requires_aligned_append()) {
6701 result = -EOPNOTSUPP;
6702 break;
6703 }
6704
6705 if (seq && (seq > op.extent.truncate_seq) &&
6706 (op.extent.offset + op.extent.length > oi.size)) {
6707 // old write, arrived after trimtrunc
6708 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6709 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6710 << ", adjusting write length to " << op.extent.length << dendl;
6711 bufferlist t;
6712 t.substr_of(osd_op.indata, 0, op.extent.length);
6713 osd_op.indata.swap(t);
6714 }
6715 if (op.extent.truncate_seq > seq) {
6716 // write arrives before trimtrunc
6717 if (obs.exists && !oi.is_whiteout()) {
6718 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6719 << ", truncating to " << op.extent.truncate_size << dendl;
6720 t->truncate(soid, op.extent.truncate_size);
6721 oi.truncate_seq = op.extent.truncate_seq;
6722 oi.truncate_size = op.extent.truncate_size;
11fdf7f2
TL
6723 if (oi.size > op.extent.truncate_size) {
6724 interval_set<uint64_t> trim;
6725 trim.insert(op.extent.truncate_size,
6726 oi.size - op.extent.truncate_size);
6727 ctx->modified_ranges.union_of(trim);
9f95a23c 6728 ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
b3b6e05e 6729 oi.clear_data_digest();
11fdf7f2 6730 }
7c673cae 6731 if (op.extent.truncate_size != oi.size) {
11fdf7f2
TL
6732 truncate_update_size_and_usage(ctx->delta_stats,
6733 oi,
6734 op.extent.truncate_size);
7c673cae
FG
6735 }
6736 } else {
6737 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6738 << ", but object is new" << dendl;
6739 oi.truncate_seq = op.extent.truncate_seq;
6740 oi.truncate_size = op.extent.truncate_size;
6741 }
6742 }
11fdf7f2
TL
6743 result = check_offset_and_length(
6744 op.extent.offset, op.extent.length,
6745 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
7c673cae
FG
6746 if (result < 0)
6747 break;
6748
6749 maybe_create_new_object(ctx);
6750
6751 if (op.extent.length == 0) {
6752 if (op.extent.offset > oi.size) {
6753 t->truncate(
6754 soid, op.extent.offset);
eafe8130
TL
6755 truncate_update_size_and_usage(ctx->delta_stats, oi,
6756 op.extent.offset);
7c673cae
FG
6757 } else {
6758 t->nop(soid);
6759 }
6760 } else {
6761 t->write(
6762 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6763 }
6764
28e407b8
AA
6765 if (op.extent.offset == 0 && op.extent.length >= oi.size
6766 && !skip_data_digest) {
7c673cae 6767 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
28e407b8
AA
6768 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6769 if (skip_data_digest) {
6770 obs.oi.clear_data_digest();
6771 } else {
6772 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6773 }
6774 } else {
7c673cae 6775 obs.oi.clear_data_digest();
28e407b8 6776 }
7c673cae
FG
6777 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6778 op.extent.offset, op.extent.length);
9f95a23c
TL
6779 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6780 dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
7c673cae
FG
6781 }
6782 break;
f67539c2 6783
7c673cae
FG
6784 case CEPH_OSD_OP_WRITEFULL:
6785 ++ctx->num_write;
9f95a23c 6786 result = 0;
7c673cae
FG
6787 { // write full object
6788 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6789
6790 if (op.extent.length != osd_op.indata.length()) {
6791 result = -EINVAL;
6792 break;
6793 }
11fdf7f2
TL
6794 result = check_offset_and_length(
6795 0, op.extent.length,
6796 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
7c673cae
FG
6797 if (result < 0)
6798 break;
6799
6800 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6801 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6802
6803 maybe_create_new_object(ctx);
11fdf7f2 6804 if (pool.info.is_erasure()) {
7c673cae
FG
6805 t->truncate(soid, 0);
6806 } else if (obs.exists && op.extent.length < oi.size) {
6807 t->truncate(soid, op.extent.length);
6808 }
6809 if (op.extent.length) {
6810 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6811 }
28e407b8
AA
6812 if (!skip_data_digest) {
6813 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6814 } else {
6815 obs.oi.clear_data_digest();
6816 }
9f95a23c
TL
6817 ctx->clean_regions.mark_data_region_dirty(0,
6818 std::max((uint64_t)op.extent.length, oi.size));
7c673cae
FG
6819 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6820 0, op.extent.length, true);
6821 }
6822 break;
6823
6824 case CEPH_OSD_OP_WRITESAME:
6825 ++ctx->num_write;
6826 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6827 result = do_writesame(ctx, osd_op);
6828 break;
6829
6830 case CEPH_OSD_OP_ROLLBACK :
6831 ++ctx->num_write;
6832 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
20effc67 6833 result = _rollback_to(ctx, osd_op);
7c673cae
FG
6834 break;
6835
6836 case CEPH_OSD_OP_ZERO:
6837 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6838 if (pool.info.requires_aligned_append()) {
6839 result = -EOPNOTSUPP;
6840 break;
6841 }
6842 ++ctx->num_write;
6843 { // zero
11fdf7f2
TL
6844 result = check_offset_and_length(
6845 op.extent.offset, op.extent.length,
6846 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
7c673cae
FG
6847 if (result < 0)
6848 break;
f67539c2 6849
20effc67 6850 if (op.extent.length && obs.exists && !oi.is_whiteout()) {
7c673cae
FG
6851 t->zero(soid, op.extent.offset, op.extent.length);
6852 interval_set<uint64_t> ch;
6853 ch.insert(op.extent.offset, op.extent.length);
6854 ctx->modified_ranges.union_of(ch);
9f95a23c 6855 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
7c673cae
FG
6856 ctx->delta_stats.num_wr++;
6857 oi.clear_data_digest();
6858 } else {
6859 // no-op
6860 }
6861 }
6862 break;
6863 case CEPH_OSD_OP_CREATE:
6864 ++ctx->num_write;
9f95a23c 6865 result = 0;
7c673cae
FG
6866 {
6867 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
7c673cae 6868 if (obs.exists && !oi.is_whiteout() &&
9f95a23c 6869 (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
7c673cae
FG
6870 result = -EEXIST; /* this is an exclusive create */
6871 } else {
6872 if (osd_op.indata.length()) {
11fdf7f2 6873 auto p = osd_op.indata.cbegin();
7c673cae
FG
6874 string category;
6875 try {
11fdf7f2 6876 decode(category, p);
7c673cae 6877 }
f67539c2 6878 catch (ceph::buffer::error& e) {
7c673cae
FG
6879 result = -EINVAL;
6880 goto fail;
6881 }
6882 // category is no longer implemented.
6883 }
9f95a23c
TL
6884 maybe_create_new_object(ctx);
6885 t->nop(soid);
7c673cae
FG
6886 }
6887 }
6888 break;
6889
6890 case CEPH_OSD_OP_TRIMTRUNC:
6891 op.extent.offset = op.extent.truncate_size;
6892 // falling through
6893
6894 case CEPH_OSD_OP_TRUNCATE:
6895 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6896 if (pool.info.requires_aligned_append()) {
6897 result = -EOPNOTSUPP;
6898 break;
6899 }
6900 ++ctx->num_write;
9f95a23c 6901 result = 0;
7c673cae
FG
6902 {
6903 // truncate
6904 if (!obs.exists || oi.is_whiteout()) {
6905 dout(10) << " object dne, truncate is a no-op" << dendl;
6906 break;
6907 }
6908
11fdf7f2
TL
6909 result = check_offset_and_length(
6910 op.extent.offset, op.extent.length,
6911 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6912 if (result < 0)
7c673cae 6913 break;
7c673cae
FG
6914
6915 if (op.extent.truncate_seq) {
11fdf7f2 6916 ceph_assert(op.extent.offset == op.extent.truncate_size);
7c673cae
FG
6917 if (op.extent.truncate_seq <= oi.truncate_seq) {
6918 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6919 << ", no-op" << dendl;
6920 break; // old
6921 }
6922 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6923 << ", truncating" << dendl;
6924 oi.truncate_seq = op.extent.truncate_seq;
6925 oi.truncate_size = op.extent.truncate_size;
6926 }
6927
6928 maybe_create_new_object(ctx);
6929 t->truncate(soid, op.extent.offset);
6930 if (oi.size > op.extent.offset) {
6931 interval_set<uint64_t> trim;
6932 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6933 ctx->modified_ranges.union_of(trim);
9f95a23c
TL
6934 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6935 } else if (oi.size < op.extent.offset) {
6936 ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6937 }
7c673cae 6938 if (op.extent.offset != oi.size) {
11fdf7f2
TL
6939 truncate_update_size_and_usage(ctx->delta_stats,
6940 oi,
6941 op.extent.offset);
7c673cae
FG
6942 }
6943 ctx->delta_stats.num_wr++;
6944 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6945
6946 oi.clear_data_digest();
6947 }
6948 break;
f67539c2 6949
7c673cae
FG
6950 case CEPH_OSD_OP_DELETE:
6951 ++ctx->num_write;
9f95a23c 6952 result = 0;
7c673cae
FG
6953 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6954 {
6955 result = _delete_oid(ctx, false, ctx->ignore_cache);
6956 }
6957 break;
6958
6959 case CEPH_OSD_OP_WATCH:
6960 ++ctx->num_write;
9f95a23c 6961 result = 0;
7c673cae
FG
6962 {
6963 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6964 op.watch.cookie, op.watch.op);
6965 if (!obs.exists) {
6966 result = -ENOENT;
6967 break;
6968 }
9f95a23c 6969 result = 0;
7c673cae
FG
6970 uint64_t cookie = op.watch.cookie;
6971 entity_name_t entity = ctx->reqid.name;
6972 ObjectContextRef obc = ctx->obc;
6973
6974 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6975 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6976 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6977 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6978 dout(10) << "watch: peer_addr="
6979 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6980
6981 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6982 if (op.watch.timeout != 0) {
6983 timeout = op.watch.timeout;
6984 }
6985
6986 watch_info_t w(cookie, timeout,
6987 ctx->op->get_req()->get_connection()->get_peer_addr());
6988 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6989 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6990 if (oi.watchers.count(make_pair(cookie, entity))) {
6991 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6992 } else {
6993 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6994 oi.watchers[make_pair(cookie, entity)] = w;
6995 t->nop(soid); // make sure update the object_info on disk!
6996 }
6997 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6998 ctx->watch_connects.push_back(make_pair(w, will_ping));
6999 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
7000 if (!oi.watchers.count(make_pair(cookie, entity))) {
7001 result = -ENOTCONN;
7002 break;
7003 }
7004 dout(10) << " found existing watch " << w << " by " << entity << dendl;
7005 ctx->watch_connects.push_back(make_pair(w, true));
7006 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
7007 /* Note: WATCH with PING doesn't cause may_write() to return true,
7008 * so if there is nothing else in the transaction, this is going
7009 * to run do_osd_op_effects, but not write out a log entry */
7010 if (!oi.watchers.count(make_pair(cookie, entity))) {
7011 result = -ENOTCONN;
7012 break;
7013 }
7014 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
7015 obc->watchers.find(make_pair(cookie, entity));
7016 if (p == obc->watchers.end() ||
7017 !p->second->is_connected()) {
7018 // client needs to reconnect
7019 result = -ETIMEDOUT;
7020 break;
7021 }
7022 dout(10) << " found existing watch " << w << " by " << entity << dendl;
7023 p->second->got_ping(ceph_clock_now());
7024 result = 0;
7025 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
7026 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
7027 oi.watchers.find(make_pair(cookie, entity));
7028 if (oi_iter != oi.watchers.end()) {
7029 dout(10) << " removed watch " << oi_iter->second << " by "
7030 << entity << dendl;
7031 oi.watchers.erase(oi_iter);
7032 t->nop(soid); // update oi on disk
7033 ctx->watch_disconnects.push_back(
7034 watch_disconnect_t(cookie, entity, false));
7035 } else {
7036 dout(10) << " can't remove: no watch by " << entity << dendl;
7037 }
7038 }
7039 }
7040 break;
7041
7042 case CEPH_OSD_OP_CACHE_PIN:
7043 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
7044 if ((!pool.info.is_tier() ||
7045 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
7046 result = -EINVAL;
7047 dout(10) << " pin object is only allowed on the cache tier " << dendl;
7048 break;
7049 }
7050 ++ctx->num_write;
9f95a23c 7051 result = 0;
7c673cae
FG
7052 {
7053 if (!obs.exists || oi.is_whiteout()) {
7054 result = -ENOENT;
7055 break;
7056 }
7057
7058 if (!oi.is_cache_pinned()) {
7059 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
7060 ctx->modify = true;
7061 ctx->delta_stats.num_objects_pinned++;
7062 ctx->delta_stats.num_wr++;
7063 }
7c673cae
FG
7064 }
7065 break;
7066
7067 case CEPH_OSD_OP_CACHE_UNPIN:
7068 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
7069 if ((!pool.info.is_tier() ||
7070 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
7071 result = -EINVAL;
7072 dout(10) << " pin object is only allowed on the cache tier " << dendl;
7073 break;
7074 }
7075 ++ctx->num_write;
9f95a23c 7076 result = 0;
7c673cae
FG
7077 {
7078 if (!obs.exists || oi.is_whiteout()) {
7079 result = -ENOENT;
7080 break;
7081 }
7082
7083 if (oi.is_cache_pinned()) {
7084 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
7085 ctx->modify = true;
7086 ctx->delta_stats.num_objects_pinned--;
7087 ctx->delta_stats.num_wr++;
7088 }
7c673cae
FG
7089 }
7090 break;
7091
31f18b77
FG
7092 case CEPH_OSD_OP_SET_REDIRECT:
7093 ++ctx->num_write;
9f95a23c 7094 result = 0;
31f18b77
FG
7095 {
7096 if (pool.info.is_tier()) {
7097 result = -EINVAL;
7098 break;
7099 }
7100 if (!obs.exists) {
7101 result = -ENOENT;
7102 break;
7103 }
9f95a23c 7104 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
31f18b77
FG
7105 result = -EOPNOTSUPP;
7106 break;
7107 }
7108
7109 object_t target_name;
7110 object_locator_t target_oloc;
7111 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
7112 version_t target_version = op.copy_from.src_version;
7113 try {
11fdf7f2
TL
7114 decode(target_name, bp);
7115 decode(target_oloc, bp);
7116 }
f67539c2 7117 catch (ceph::buffer::error& e) {
11fdf7f2
TL
7118 result = -EINVAL;
7119 goto fail;
7120 }
7121 pg_t raw_pg;
2a845540
TL
7122 result = get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
7123 if (result < 0) {
7124 dout(5) << " pool information is invalid: " << result << dendl;
7125 break;
7126 }
11fdf7f2
TL
7127 hobject_t target(target_name, target_oloc.key, target_snapid,
7128 raw_pg.ps(), raw_pg.pool(),
7129 target_oloc.nspace);
7130 if (target == soid) {
7131 dout(20) << " set-redirect self is invalid" << dendl;
7132 result = -EINVAL;
7133 break;
7134 }
7135
7136 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
7137 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7138 if (has_reference) {
7139 result = -EINVAL;
7140 dout(5) << " the object is already a manifest " << dendl;
7141 break;
7142 }
7143 if (op_finisher == nullptr && need_reference) {
7144 // start
7145 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7146 new SetManifestFinisher(osd_op));
1e59de90 7147 ManifestOpRef mop = std::make_shared<ManifestOp>(ctx->obc, new RefCountCallback(ctx, osd_op));
20effc67 7148 auto* fin = new C_SetManifestRefCountDone(this, soid, 0);
f67539c2
TL
7149 ceph_tid_t tid = refcount_manifest(soid, target,
7150 refcount_t::INCREMENT_REF, fin, std::nullopt);
20effc67
TL
7151 fin->tid = tid;
7152 mop->num_chunks++;
7153 mop->tids[0] = tid;
f67539c2
TL
7154 manifest_ops[soid] = mop;
7155 ctx->obc->start_block();
11fdf7f2
TL
7156 result = -EINPROGRESS;
7157 } else {
7158 // finish
7159 if (op_finisher) {
7160 result = op_finisher->execute();
7161 ceph_assert(result == 0);
7162 }
7163
7164 if (!oi.has_manifest() && !oi.manifest.is_redirect())
7165 ctx->delta_stats.num_objects_manifest++;
7166
7167 oi.set_flag(object_info_t::FLAG_MANIFEST);
7168 oi.manifest.redirect_target = target;
7169 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
7170 t->truncate(soid, 0);
9f95a23c 7171 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
11fdf7f2
TL
7172 if (oi.is_omap() && pool.info.supports_omap()) {
7173 t->omap_clear(soid);
7174 obs.oi.clear_omap_digest();
7175 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9f95a23c 7176 ctx->clean_regions.mark_omap_dirty();
11fdf7f2 7177 }
9f95a23c
TL
7178 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
7179 0, oi.size, false);
11fdf7f2
TL
7180 ctx->delta_stats.num_bytes -= oi.size;
7181 oi.size = 0;
7182 oi.new_object();
7183 oi.user_version = target_version;
7184 ctx->user_at_version = target_version;
7185 /* rm_attrs */
20effc67 7186 map<string,bufferlist,less<>> rmattrs;
11fdf7f2
TL
7187 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
7188 if (result < 0) {
eafe8130 7189 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
11fdf7f2
TL
7190 return result;
7191 }
7192 map<string, bufferlist>::iterator iter;
7193 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
7194 const string& name = iter->first;
7195 t->rmattr(soid, name);
7196 }
7197 if (!has_reference && need_reference) {
7198 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7199 }
7200 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
7201 if (op_finisher) {
7202 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7203 }
7204 }
7205 }
7206
7207 break;
7208
7209 case CEPH_OSD_OP_SET_CHUNK:
7210 ++ctx->num_write;
9f95a23c 7211 result = 0;
11fdf7f2
TL
7212 {
7213 if (pool.info.is_tier()) {
7214 result = -EINVAL;
7215 break;
7216 }
7217 if (!obs.exists) {
7218 result = -ENOENT;
7219 break;
7220 }
9f95a23c 7221 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
7222 result = -EOPNOTSUPP;
7223 break;
7224 }
f67539c2
TL
7225 if (oi.manifest.is_redirect()) {
7226 result = -EINVAL;
7227 goto fail;
7228 }
11fdf7f2
TL
7229
7230 object_locator_t tgt_oloc;
7231 uint64_t src_offset, src_length, tgt_offset;
7232 object_t tgt_name;
7233 try {
7234 decode(src_offset, bp);
7235 decode(src_length, bp);
7236 decode(tgt_oloc, bp);
7237 decode(tgt_name, bp);
7238 decode(tgt_offset, bp);
31f18b77 7239 }
f67539c2 7240 catch (ceph::buffer::error& e) {
31f18b77 7241 result = -EINVAL;
11fdf7f2
TL
7242 goto fail;
7243 }
f67539c2 7244
11fdf7f2
TL
7245 if (!src_length) {
7246 result = -EINVAL;
7247 goto fail;
7248 }
f67539c2
TL
7249 if (src_offset + src_length > oi.size) {
7250 result = -ERANGE;
7251 goto fail;
7252 }
7253 if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
7254 result = -EOPNOTSUPP;
7255 break;
7256 }
7257 if (pool.info.is_erasure()) {
7258 result = -EOPNOTSUPP;
7259 break;
7260 }
11fdf7f2
TL
7261
7262 for (auto &p : oi.manifest.chunk_map) {
f67539c2
TL
7263 interval_set<uint64_t> chunk;
7264 chunk.insert(p.first, p.second.length);
7265 if (chunk.intersects(src_offset, src_length)) {
11fdf7f2
TL
7266 dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length
7267 << " chunk_info: " << p << dendl;
7268 result = -EOPNOTSUPP;
7269 goto fail;
7270 }
7271 }
7272
11fdf7f2
TL
7273 pg_t raw_pg;
7274 chunk_info_t chunk_info;
2a845540
TL
7275 result = get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7276 if (result < 0) {
7277 dout(5) << " pool information is invalid: " << result << dendl;
7278 break;
7279 }
11fdf7f2
TL
7280 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7281 raw_pg.ps(), raw_pg.pool(),
7282 tgt_oloc.nspace);
11fdf7f2 7283 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
f67539c2 7284 (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
11fdf7f2
TL
7285 if (has_reference) {
7286 result = -EINVAL;
7287 dout(5) << " the object is already a manifest " << dendl;
7288 break;
7289 }
f67539c2
TL
7290 chunk_info.oid = target;
7291 chunk_info.offset = tgt_offset;
7292 chunk_info.length = src_length;
7293 if (op_finisher == nullptr) {
11fdf7f2
TL
7294 // start
7295 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7296 new SetManifestFinisher(osd_op));
f67539c2
TL
7297 object_manifest_t set_chunk;
7298 bool need_inc_ref = false;
7299 set_chunk.chunk_map[src_offset] = chunk_info;
7300 need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
7301 if (need_inc_ref) {
7302 result = -EINPROGRESS;
7303 break;
11fdf7f2 7304 }
f67539c2
TL
7305 }
7306 if (op_finisher) {
7307 result = op_finisher->execute();
7308 ceph_assert(result == 0);
7309 }
11fdf7f2 7310
f67539c2
TL
7311 oi.manifest.chunk_map[src_offset] = chunk_info;
7312 if (!oi.has_manifest() && !oi.manifest.is_chunked())
7313 ctx->delta_stats.num_objects_manifest++;
7314 oi.set_flag(object_info_t::FLAG_MANIFEST);
7315 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7316 if (!has_reference) {
7317 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7318 }
7319 ctx->modify = true;
7320 ctx->cache_operation = true;
11fdf7f2 7321
f67539c2
TL
7322 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7323 << " chunk_info: " << chunk_info << dendl;
7324 if (op_finisher) {
7325 ctx->op_finishers.erase(ctx->current_osd_subop_num);
11fdf7f2
TL
7326 }
7327 }
7328
7329 break;
7330
7331 case CEPH_OSD_OP_TIER_PROMOTE:
7332 ++ctx->num_write;
9f95a23c 7333 result = 0;
11fdf7f2
TL
7334 {
7335 if (pool.info.is_tier()) {
7336 result = -EINVAL;
7337 break;
7338 }
7339 if (!obs.exists) {
7340 result = -ENOENT;
7341 break;
7342 }
9f95a23c 7343 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
7344 result = -EOPNOTSUPP;
7345 break;
7346 }
7347 if (!obs.oi.has_manifest()) {
7348 result = 0;
7349 break;
7350 }
7351
7352 if (op_finisher == nullptr) {
7353 PromoteManifestCallback *cb;
7354 object_locator_t my_oloc;
7355 hobject_t src_hoid;
7356
7357 if (obs.oi.manifest.is_chunked()) {
7358 src_hoid = obs.oi.soid;
11fdf7f2
TL
7359 } else if (obs.oi.manifest.is_redirect()) {
7360 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7361 my_oloc = src_oloc;
7362 src_hoid = obs.oi.manifest.redirect_target;
11fdf7f2
TL
7363 } else {
7364 ceph_abort_msg("unrecognized manifest type");
7365 }
f67539c2 7366 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
11fdf7f2
TL
7367 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7368 new PromoteFinisher(cb));
7369 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7370 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7371 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7372 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7373 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7374 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7375 obs.oi.soid.snap == CEPH_NOSNAP,
7376 src_fadvise_flags, 0);
7377
7378 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7379 result = -EINPROGRESS;
7380 } else {
7381 result = op_finisher->execute();
7382 ceph_assert(result == 0);
7383 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7384 }
7385 }
7386
7387 break;
7388
9f95a23c
TL
7389 case CEPH_OSD_OP_TIER_FLUSH:
7390 ++ctx->num_write;
7391 result = 0;
7392 {
7393 if (pool.info.is_tier()) {
7394 result = -EINVAL;
7395 break;
7396 }
7397 if (!obs.exists) {
7398 result = -ENOENT;
7399 break;
7400 }
7401 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7402 result = -EOPNOTSUPP;
7403 break;
7404 }
9f95a23c 7405
1e59de90
TL
7406 if (oi.is_dirty() || !obs.oi.has_manifest()) {
7407 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt, true);
9f95a23c
TL
7408 if (result == -EINPROGRESS)
7409 result = -EAGAIN;
7410 } else {
7411 result = 0;
7412 }
7413 }
7414
7415 break;
7416
f67539c2
TL
7417 case CEPH_OSD_OP_TIER_EVICT:
7418 ++ctx->num_write;
7419 result = 0;
7420 {
7421 if (pool.info.is_tier()) {
7422 result = -EINVAL;
7423 break;
7424 }
7425 if (!obs.exists) {
7426 result = -ENOENT;
7427 break;
7428 }
7429 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7430 result = -EOPNOTSUPP;
7431 break;
7432 }
7433 if (!obs.oi.has_manifest()) {
7434 result = -EINVAL;
7435 break;
7436 }
7437
7438 // The chunks already has a reference, so it is just enough to invoke truncate if necessary
20effc67
TL
7439 for (auto &p : obs.oi.manifest.chunk_map) {
7440 p.second.set_flag(chunk_info_t::FLAG_MISSING);
f67539c2 7441 // punch hole
20effc67 7442 t->zero(soid, p.first, p.second.length);
1e59de90
TL
7443 interval_set<uint64_t> ch;
7444 ch.insert(p.first, p.second.length);
7445 ctx->modified_ranges.union_of(ch);
7446 ctx->clean_regions.mark_data_region_dirty(p.first, p.second.length);
f67539c2 7447 }
20effc67
TL
7448 oi.clear_data_digest();
7449 ctx->delta_stats.num_wr++;
7450 ctx->cache_operation = true;
1e59de90 7451 ctx->undirty = true;
f67539c2
TL
7452 osd->logger->inc(l_osd_tier_evict);
7453 }
7454
7455 break;
7456
11fdf7f2
TL
7457 case CEPH_OSD_OP_UNSET_MANIFEST:
7458 ++ctx->num_write;
9f95a23c 7459 result = 0;
11fdf7f2
TL
7460 {
7461 if (pool.info.is_tier()) {
7462 result = -EINVAL;
7463 break;
31f18b77 7464 }
11fdf7f2
TL
7465 if (!obs.exists) {
7466 result = -ENOENT;
31f18b77
FG
7467 break;
7468 }
11fdf7f2
TL
7469 if (!oi.has_manifest()) {
7470 result = -EOPNOTSUPP;
7471 break;
31f18b77 7472 }
9f95a23c 7473 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
11fdf7f2
TL
7474 result = -EOPNOTSUPP;
7475 break;
31f18b77 7476 }
11fdf7f2 7477
f67539c2 7478 dec_all_refcount_manifest(oi, ctx);
11fdf7f2
TL
7479
7480 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7481 oi.manifest = object_manifest_t();
7482 ctx->delta_stats.num_objects_manifest--;
7483 ctx->delta_stats.num_wr++;
7484 ctx->modify = true;
31f18b77
FG
7485 }
7486
7487 break;
7c673cae
FG
7488
7489 // -- object attrs --
f67539c2 7490
7c673cae
FG
7491 case CEPH_OSD_OP_SETXATTR:
7492 ++ctx->num_write;
9f95a23c 7493 result = 0;
7c673cae
FG
7494 {
7495 if (cct->_conf->osd_max_attr_size > 0 &&
7496 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7497 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7498 result = -EFBIG;
7499 break;
7500 }
11fdf7f2
TL
7501 unsigned max_name_len =
7502 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7503 cct->_conf->osd_max_attr_name_len);
7c673cae
FG
7504 if (op.xattr.name_len > max_name_len) {
7505 result = -ENAMETOOLONG;
7506 break;
7507 }
7508 maybe_create_new_object(ctx);
7509 string aname;
7510 bp.copy(op.xattr.name_len, aname);
7511 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7512 string name = "_" + aname;
7513 bufferlist bl;
7514 bp.copy(op.xattr.value_len, bl);
7515 t->setattr(soid, name, bl);
7516 ctx->delta_stats.num_wr++;
7517 }
7518 break;
7519
7520 case CEPH_OSD_OP_RMXATTR:
7521 ++ctx->num_write;
9f95a23c 7522 result = 0;
7c673cae
FG
7523 {
7524 string aname;
7525 bp.copy(op.xattr.name_len, aname);
7526 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7527 if (!obs.exists || oi.is_whiteout()) {
7528 result = -ENOENT;
7529 break;
7530 }
7531 string name = "_" + aname;
7532 t->rmattr(soid, name);
7533 ctx->delta_stats.num_wr++;
7534 }
7535 break;
f67539c2 7536
7c673cae
FG
7537
7538 // -- fancy writers --
7539 case CEPH_OSD_OP_APPEND:
7540 {
7541 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7542 // just do it inline; this works because we are happy to execute
7543 // fancy op on replicas as well.
7544 vector<OSDOp> nops(1);
7545 OSDOp& newop = nops[0];
7546 newop.op.op = CEPH_OSD_OP_WRITE;
7547 newop.op.extent.offset = oi.size;
7548 newop.op.extent.length = op.extent.length;
7549 newop.op.extent.truncate_seq = oi.truncate_seq;
7550 newop.indata = osd_op.indata;
7551 result = do_osd_ops(ctx, nops);
f67539c2 7552 osd_op.outdata = std::move(newop.outdata);
7c673cae
FG
7553 }
7554 break;
7555
7556 case CEPH_OSD_OP_STARTSYNC:
9f95a23c 7557 result = 0;
7c673cae
FG
7558 t->nop(soid);
7559 break;
7560
7c673cae
FG
7561 // -- trivial map --
7562 case CEPH_OSD_OP_TMAPGET:
7563 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7564 if (pool.info.is_erasure()) {
7c673cae
FG
7565 result = -EOPNOTSUPP;
7566 break;
7567 }
7568 {
7569 vector<OSDOp> nops(1);
7570 OSDOp& newop = nops[0];
7571 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7572 newop.op.extent.offset = 0;
7573 newop.op.extent.length = 0;
9f95a23c 7574 result = do_osd_ops(ctx, nops);
f67539c2 7575 osd_op.outdata = std::move(newop.outdata);
7c673cae
FG
7576 }
7577 break;
7578
7579 case CEPH_OSD_OP_TMAPPUT:
7580 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7581 if (pool.info.is_erasure()) {
7c673cae
FG
7582 result = -EOPNOTSUPP;
7583 break;
7584 }
7585 {
7586 //_dout_lock.Lock();
7587 //osd_op.data.hexdump(*_dout);
7588 //_dout_lock.Unlock();
7589
7590 // verify sort order
7591 bool unsorted = false;
7592 if (true) {
7593 bufferlist header;
11fdf7f2 7594 decode(header, bp);
7c673cae 7595 uint32_t n;
11fdf7f2 7596 decode(n, bp);
7c673cae
FG
7597 string last_key;
7598 while (n--) {
7599 string key;
11fdf7f2 7600 decode(key, bp);
7c673cae
FG
7601 dout(10) << "tmapput key " << key << dendl;
7602 bufferlist val;
11fdf7f2 7603 decode(val, bp);
7c673cae
FG
7604 if (key < last_key) {
7605 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7606 unsorted = true;
7607 break;
7608 }
7609 last_key = key;
7610 }
7611 }
7612
7613 // write it
7614 vector<OSDOp> nops(1);
7615 OSDOp& newop = nops[0];
7616 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7617 newop.op.extent.offset = 0;
7618 newop.op.extent.length = osd_op.indata.length();
7619 newop.indata = osd_op.indata;
7620
7621 if (unsorted) {
7622 bp = osd_op.indata.begin();
7623 bufferlist header;
7624 map<string, bufferlist> m;
11fdf7f2
TL
7625 decode(header, bp);
7626 decode(m, bp);
7627 ceph_assert(bp.end());
7c673cae 7628 bufferlist newbl;
11fdf7f2
TL
7629 encode(header, newbl);
7630 encode(m, newbl);
7c673cae
FG
7631 newop.indata = newbl;
7632 }
7633 result = do_osd_ops(ctx, nops);
11fdf7f2 7634 ceph_assert(result == 0);
7c673cae
FG
7635 }
7636 break;
7637
7638 case CEPH_OSD_OP_TMAPUP:
7639 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7640 if (pool.info.is_erasure()) {
7c673cae
FG
7641 result = -EOPNOTSUPP;
7642 break;
7643 }
7644 ++ctx->num_write;
7645 result = do_tmapup(ctx, bp, osd_op);
7646 break;
7647
7648 case CEPH_OSD_OP_TMAP2OMAP:
7649 ++ctx->num_write;
7650 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7651 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7652 break;
7653
7654 // OMAP Read ops
7655 case CEPH_OSD_OP_OMAPGETKEYS:
7656 ++ctx->num_read;
7657 {
7658 string start_after;
7659 uint64_t max_return;
7660 try {
11fdf7f2
TL
7661 decode(start_after, bp);
7662 decode(max_return, bp);
7c673cae 7663 }
f67539c2 7664 catch (ceph::buffer::error& e) {
7c673cae
FG
7665 result = -EINVAL;
7666 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7667 goto fail;
7668 }
7669 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7670 max_return = cct->_conf->osd_max_omap_entries_per_request;
7671 }
7672 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7673
7674 bufferlist bl;
7675 uint32_t num = 0;
7676 bool truncated = false;
7677 if (oi.is_omap()) {
7678 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
11fdf7f2 7679 ch, ghobject_t(soid)
7c673cae 7680 );
11fdf7f2 7681 ceph_assert(iter);
7c673cae 7682 iter->upper_bound(start_after);
11fdf7f2 7683 for (num = 0; iter->valid(); ++num, iter->next()) {
7c673cae
FG
7684 if (num >= max_return ||
7685 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7686 truncated = true;
7687 break;
7688 }
11fdf7f2 7689 encode(iter->key(), bl);
7c673cae
FG
7690 }
7691 } // else return empty out_set
11fdf7f2 7692 encode(num, osd_op.outdata);
7c673cae 7693 osd_op.outdata.claim_append(bl);
11fdf7f2
TL
7694 encode(truncated, osd_op.outdata);
7695 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7696 ctx->delta_stats.num_rd++;
7697 }
7698 break;
7699
7700 case CEPH_OSD_OP_OMAPGETVALS:
7701 ++ctx->num_read;
7702 {
7703 string start_after;
7704 uint64_t max_return;
7705 string filter_prefix;
7706 try {
11fdf7f2
TL
7707 decode(start_after, bp);
7708 decode(max_return, bp);
7709 decode(filter_prefix, bp);
7c673cae 7710 }
f67539c2 7711 catch (ceph::buffer::error& e) {
7c673cae
FG
7712 result = -EINVAL;
7713 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7714 goto fail;
7715 }
7716 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7717 max_return = cct->_conf->osd_max_omap_entries_per_request;
7718 }
7719 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7720
7721 uint32_t num = 0;
7722 bool truncated = false;
7723 bufferlist bl;
7724 if (oi.is_omap()) {
7725 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
11fdf7f2 7726 ch, ghobject_t(soid)
7c673cae
FG
7727 );
7728 if (!iter) {
7729 result = -ENOENT;
7730 goto fail;
7731 }
7732 iter->upper_bound(start_after);
7733 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7734 for (num = 0;
7735 iter->valid() &&
7736 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
11fdf7f2 7737 ++num, iter->next()) {
7c673cae
FG
7738 dout(20) << "Found key " << iter->key() << dendl;
7739 if (num >= max_return ||
7740 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7741 truncated = true;
7742 break;
7743 }
11fdf7f2
TL
7744 encode(iter->key(), bl);
7745 encode(iter->value(), bl);
7c673cae
FG
7746 }
7747 } // else return empty out_set
11fdf7f2 7748 encode(num, osd_op.outdata);
7c673cae 7749 osd_op.outdata.claim_append(bl);
11fdf7f2
TL
7750 encode(truncated, osd_op.outdata);
7751 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7752 ctx->delta_stats.num_rd++;
7753 }
7754 break;
7755
7756 case CEPH_OSD_OP_OMAPGETHEADER:
7757 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7758 if (!oi.is_omap()) {
7759 // return empty header
7760 break;
7761 }
7762 ++ctx->num_read;
7763 {
7764 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
11fdf7f2 7765 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7766 ctx->delta_stats.num_rd++;
7767 }
7768 break;
7769
7770 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7771 ++ctx->num_read;
7772 {
7773 set<string> keys_to_get;
7774 try {
11fdf7f2 7775 decode(keys_to_get, bp);
7c673cae 7776 }
f67539c2 7777 catch (ceph::buffer::error& e) {
7c673cae
FG
7778 result = -EINVAL;
7779 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7780 goto fail;
7781 }
7782 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7783 map<string, bufferlist> out;
7784 if (oi.is_omap()) {
7785 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7786 } // else return empty omap entries
11fdf7f2
TL
7787 encode(out, osd_op.outdata);
7788 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7c673cae
FG
7789 ctx->delta_stats.num_rd++;
7790 }
7791 break;
7792
7793 case CEPH_OSD_OP_OMAP_CMP:
7794 ++ctx->num_read;
7795 {
7796 if (!obs.exists || oi.is_whiteout()) {
7797 result = -ENOENT;
7798 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7799 break;
7800 }
7801 map<string, pair<bufferlist, int> > assertions;
7802 try {
11fdf7f2 7803 decode(assertions, bp);
7c673cae 7804 }
f67539c2 7805 catch (ceph::buffer::error& e) {
7c673cae
FG
7806 result = -EINVAL;
7807 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7808 goto fail;
7809 }
7810 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
f67539c2 7811
7c673cae
FG
7812 map<string, bufferlist> out;
7813
7814 if (oi.is_omap()) {
7815 set<string> to_get;
7816 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7817 i != assertions.end();
7818 ++i)
7819 to_get.insert(i->first);
7820 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7821 to_get, &out);
7822 if (r < 0) {
7823 result = r;
7824 break;
7825 }
7826 } // else leave out empty
7827
7828 //Should set num_rd_kb based on encode length of map
7829 ctx->delta_stats.num_rd++;
7830
7831 int r = 0;
7832 bufferlist empty;
7833 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7834 i != assertions.end();
7835 ++i) {
7836 auto out_entry = out.find(i->first);
7837 bufferlist &bl = (out_entry != out.end()) ?
7838 out_entry->second : empty;
7839 switch (i->second.second) {
7840 case CEPH_OSD_CMPXATTR_OP_EQ:
7841 if (!(bl == i->second.first)) {
7842 r = -ECANCELED;
7843 }
7844 break;
7845 case CEPH_OSD_CMPXATTR_OP_LT:
7846 if (!(bl < i->second.first)) {
7847 r = -ECANCELED;
7848 }
7849 break;
7850 case CEPH_OSD_CMPXATTR_OP_GT:
7851 if (!(bl > i->second.first)) {
7852 r = -ECANCELED;
7853 }
7854 break;
7855 default:
7856 r = -EINVAL;
7857 break;
7858 }
7859 if (r < 0)
7860 break;
7861 }
7862 if (r < 0) {
7863 result = r;
7864 }
7865 }
7866 break;
7867
7868 // OMAP Write ops
7869 case CEPH_OSD_OP_OMAPSETVALS:
7870 if (!pool.info.supports_omap()) {
7871 result = -EOPNOTSUPP;
7872 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7873 break;
7874 }
7875 ++ctx->num_write;
9f95a23c 7876 result = 0;
7c673cae
FG
7877 {
7878 maybe_create_new_object(ctx);
7879 bufferlist to_set_bl;
7880 try {
7881 decode_str_str_map_to_bl(bp, &to_set_bl);
7882 }
f67539c2 7883 catch (ceph::buffer::error& e) {
7c673cae
FG
7884 result = -EINVAL;
7885 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7886 goto fail;
7887 }
7888 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7889 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7c673cae
FG
7890 dout(20) << "setting vals: " << dendl;
7891 map<string,bufferlist> to_set;
11fdf7f2
TL
7892 bufferlist::const_iterator pt = to_set_bl.begin();
7893 decode(to_set, pt);
7c673cae
FG
7894 for (map<string, bufferlist>::iterator i = to_set.begin();
7895 i != to_set.end();
7896 ++i) {
7897 dout(20) << "\t" << i->first << dendl;
7898 }
7899 }
7900 t->omap_setkeys(soid, to_set_bl);
9f95a23c 7901 ctx->clean_regions.mark_omap_dirty();
7c673cae 7902 ctx->delta_stats.num_wr++;
11fdf7f2 7903 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7c673cae
FG
7904 }
7905 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7906 obs.oi.clear_omap_digest();
7907 break;
7908
7909 case CEPH_OSD_OP_OMAPSETHEADER:
7910 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7911 if (!pool.info.supports_omap()) {
7912 result = -EOPNOTSUPP;
7913 break;
7914 }
7915 ++ctx->num_write;
9f95a23c 7916 result = 0;
7c673cae
FG
7917 {
7918 maybe_create_new_object(ctx);
7919 t->omap_setheader(soid, osd_op.indata);
9f95a23c 7920 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
7921 ctx->delta_stats.num_wr++;
7922 }
7923 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7924 obs.oi.clear_omap_digest();
7925 break;
7926
7927 case CEPH_OSD_OP_OMAPCLEAR:
7928 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7929 if (!pool.info.supports_omap()) {
7930 result = -EOPNOTSUPP;
7931 break;
7932 }
7933 ++ctx->num_write;
9f95a23c 7934 result = 0;
7c673cae
FG
7935 {
7936 if (!obs.exists || oi.is_whiteout()) {
7937 result = -ENOENT;
7938 break;
7939 }
7940 if (oi.is_omap()) {
7941 t->omap_clear(soid);
9f95a23c 7942 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
7943 ctx->delta_stats.num_wr++;
7944 obs.oi.clear_omap_digest();
7945 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7946 }
7947 }
7948 break;
7949
7950 case CEPH_OSD_OP_OMAPRMKEYS:
7951 if (!pool.info.supports_omap()) {
7952 result = -EOPNOTSUPP;
7953 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7954 break;
7955 }
7956 ++ctx->num_write;
9f95a23c 7957 result = 0;
7c673cae
FG
7958 {
7959 if (!obs.exists || oi.is_whiteout()) {
7960 result = -ENOENT;
7961 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7962 break;
7963 }
7964 bufferlist to_rm_bl;
7965 try {
7966 decode_str_set_to_bl(bp, &to_rm_bl);
7967 }
f67539c2 7968 catch (ceph::buffer::error& e) {
7c673cae
FG
7969 result = -EINVAL;
7970 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7971 goto fail;
7972 }
7973 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7974 t->omap_rmkeys(soid, to_rm_bl);
9f95a23c
TL
7975 ctx->clean_regions.mark_omap_dirty();
7976 ctx->delta_stats.num_wr++;
7977 }
7978 obs.oi.clear_omap_digest();
7979 break;
7980
7981 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7982 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7983 if (!pool.info.supports_omap()) {
7984 result = -EOPNOTSUPP;
7985 break;
7986 }
7987 ++ctx->num_write;
7988 result = 0;
7989 {
7990 if (!obs.exists || oi.is_whiteout()) {
7991 result = -ENOENT;
7992 break;
7993 }
7994 std::string key_begin, key_end;
7995 try {
7996 decode(key_begin, bp);
7997 decode(key_end, bp);
f67539c2 7998 } catch (ceph::buffer::error& e) {
9f95a23c
TL
7999 result = -EINVAL;
8000 goto fail;
8001 }
8002 t->omap_rmkeyrange(soid, key_begin, key_end);
1d09f67e 8003 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
8004 ctx->delta_stats.num_wr++;
8005 }
8006 obs.oi.clear_omap_digest();
8007 break;
8008
8009 case CEPH_OSD_OP_COPY_GET:
8010 ++ctx->num_read;
c07f9fc5
FG
8011 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
8012 soid.snap.val);
8013 if (op_finisher == nullptr) {
8014 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
8015 } else {
8016 result = op_finisher->execute();
8017 }
7c673cae
FG
8018 break;
8019
8020 case CEPH_OSD_OP_COPY_FROM:
9f95a23c 8021 case CEPH_OSD_OP_COPY_FROM2:
7c673cae 8022 ++ctx->num_write;
9f95a23c 8023 result = 0;
7c673cae
FG
8024 {
8025 object_t src_name;
8026 object_locator_t src_oloc;
9f95a23c
TL
8027 uint32_t truncate_seq = 0;
8028 uint64_t truncate_size = 0;
8029 bool have_truncate = false;
7c673cae
FG
8030 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
8031 version_t src_version = op.copy_from.src_version;
9f95a23c
TL
8032
8033 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
8034 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
8035 dout(20) << "invalid copy-from2 flags 0x"
8036 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
8037 result = -EINVAL;
8038 break;
8039 }
7c673cae 8040 try {
11fdf7f2
TL
8041 decode(src_name, bp);
8042 decode(src_oloc, bp);
9f95a23c
TL
8043 // check if client sent us truncate_seq and truncate_size
8044 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
8045 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
8046 decode(truncate_seq, bp);
8047 decode(truncate_size, bp);
8048 have_truncate = true;
8049 }
7c673cae 8050 }
f67539c2 8051 catch (ceph::buffer::error& e) {
7c673cae
FG
8052 result = -EINVAL;
8053 tracepoint(osd,
8054 do_osd_op_pre_copy_from,
8055 soid.oid.name.c_str(),
8056 soid.snap.val,
8057 "???",
8058 0,
8059 "???",
8060 "???",
8061 0,
8062 src_snapid,
8063 src_version);
8064 goto fail;
8065 }
8066 tracepoint(osd,
8067 do_osd_op_pre_copy_from,
8068 soid.oid.name.c_str(),
8069 soid.snap.val,
8070 src_name.name.c_str(),
8071 src_oloc.pool,
8072 src_oloc.key.c_str(),
8073 src_oloc.nspace.c_str(),
8074 src_oloc.hash,
8075 src_snapid,
8076 src_version);
c07f9fc5 8077 if (op_finisher == nullptr) {
7c673cae
FG
8078 // start
8079 pg_t raw_pg;
8080 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
8081 hobject_t src(src_name, src_oloc.key, src_snapid,
8082 raw_pg.ps(), raw_pg.pool(),
8083 src_oloc.nspace);
8084 if (src == soid) {
8085 dout(20) << " copy from self is invalid" << dendl;
8086 result = -EINVAL;
8087 break;
8088 }
c07f9fc5 8089 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
9f95a23c
TL
8090 if (have_truncate)
8091 cb->set_truncate(truncate_seq, truncate_size);
c07f9fc5
FG
8092 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8093 new CopyFromFinisher(cb));
7c673cae
FG
8094 start_copy(cb, ctx->obc, src, src_oloc, src_version,
8095 op.copy_from.flags,
8096 false,
8097 op.copy_from.src_fadvise_flags,
8098 op.flags);
8099 result = -EINPROGRESS;
8100 } else {
8101 // finish
c07f9fc5 8102 result = op_finisher->execute();
11fdf7f2 8103 ceph_assert(result == 0);
c07f9fc5
FG
8104
8105 // COPY_FROM cannot be executed multiple times -- it must restart
8106 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7c673cae
FG
8107 }
8108 }
8109 break;
8110
8111 default:
8112 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
8113 dout(1) << "unrecognized osd op " << op.op
8114 << " " << ceph_osd_op_name(op.op)
8115 << dendl;
8116 result = -EOPNOTSUPP;
8117 }
8118
8119 fail:
8120 osd_op.rval = result;
8121 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
11fdf7f2
TL
8122 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
8123 result != -EAGAIN && result != -EINPROGRESS)
7c673cae
FG
8124 result = 0;
8125
8126 if (result < 0)
8127 break;
8128 }
eafe8130
TL
8129 if (result < 0) {
8130 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
8131 }
7c673cae
FG
8132 return result;
8133}
8134
8135int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
8136{
8137 if (ctx->new_obs.oi.size == 0) {
8138 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
8139 return -ENODATA;
8140 }
8141 vector<OSDOp> nops(1);
8142 OSDOp &newop = nops[0];
8143 newop.op.op = CEPH_OSD_OP_TMAPGET;
8144 do_osd_ops(ctx, nops);
8145 try {
11fdf7f2
TL
8146 bufferlist::const_iterator i = newop.outdata.begin();
8147 decode(*header, i);
7c673cae
FG
8148 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
8149 } catch (...) {
8150 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
8151 << dendl;
8152 return -EINVAL;
8153 }
8154 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
8155 << dendl;
8156 return 0;
8157}
8158
8159int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
8160 const SnapSet& ss)
8161{
8162 // verify that all clones have been evicted
8163 dout(20) << __func__ << " verifying clones are absent "
8164 << ss << dendl;
8165 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
8166 p != ss.clones.end();
8167 ++p) {
8168 hobject_t clone_oid = soid;
8169 clone_oid.snap = *p;
8170 if (is_missing_object(clone_oid))
8171 return -EBUSY;
8172 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
8173 if (clone_obc && clone_obc->obs.exists) {
8174 dout(10) << __func__ << " cannot evict head before clone "
8175 << clone_oid << dendl;
8176 return -EBUSY;
8177 }
8178 if (copy_ops.count(clone_oid)) {
8179 dout(10) << __func__ << " cannot evict head, pending promote on clone "
8180 << clone_oid << dendl;
8181 return -EBUSY;
8182 }
8183 }
8184 return 0;
8185}
8186
8187inline int PrimaryLogPG::_delete_oid(
8188 OpContext *ctx,
8189 bool no_whiteout, // no whiteouts, no matter what.
8190 bool try_no_whiteout) // try not to whiteout
8191{
8192 SnapSet& snapset = ctx->new_snapset;
8193 ObjectState& obs = ctx->new_obs;
8194 object_info_t& oi = obs.oi;
8195 const hobject_t& soid = oi.soid;
8196 PGTransaction* t = ctx->op_t.get();
8197
8198 // cache: cache: set whiteout on delete?
8199 bool whiteout = false;
8200 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
8201 && !no_whiteout
8202 && !try_no_whiteout) {
8203 whiteout = true;
8204 }
11fdf7f2
TL
8205
8206 // in luminous or later, we can't delete the head if there are
8207 // clones. we trust the caller passing no_whiteout has already
8208 // verified they don't exist.
8209 if (!snapset.clones.empty() ||
8210 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
8211 if (no_whiteout) {
8212 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
8213 << dendl;
8214 } else {
8215 dout(20) << __func__ << " has or will have clones; will whiteout"
8216 << dendl;
8217 whiteout = true;
7c673cae 8218 }
7c673cae
FG
8219 }
8220 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
8221 << " no_whiteout=" << (int)no_whiteout
8222 << " try_no_whiteout=" << (int)try_no_whiteout
8223 << dendl;
8224 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
8225 return -ENOENT;
8226
8227 t->remove(soid);
8228
8229 if (oi.size > 0) {
8230 interval_set<uint64_t> ch;
8231 ch.insert(0, oi.size);
8232 ctx->modified_ranges.union_of(ch);
9f95a23c 8233 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7c673cae
FG
8234 }
8235
9f95a23c 8236 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
8237 ctx->delta_stats.num_wr++;
8238 if (soid.is_snap()) {
11fdf7f2 8239 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
7c673cae
FG
8240 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
8241 } else {
8242 ctx->delta_stats.num_bytes -= oi.size;
8243 }
8244 oi.size = 0;
8245 oi.new_object();
8246
8247 // disconnect all watchers
8248 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
8249 oi.watchers.begin();
8250 p != oi.watchers.end();
8251 ++p) {
8252 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
8253 ctx->watch_disconnects.push_back(
8254 watch_disconnect_t(p->first.first, p->first.second, true));
8255 }
8256 oi.watchers.clear();
8257
8258 if (whiteout) {
8259 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
8260 oi.set_flag(object_info_t::FLAG_WHITEOUT);
8261 ctx->delta_stats.num_whiteouts++;
8262 t->create(soid);
8263 osd->logger->inc(l_osd_tier_whiteout);
8264 return 0;
8265 }
8266
20effc67
TL
8267 if (oi.has_manifest()) {
8268 ctx->delta_stats.num_objects_manifest--;
8269 dec_all_refcount_manifest(oi, ctx);
8270 }
8271
7c673cae
FG
8272 // delete the head
8273 ctx->delta_stats.num_objects--;
8274 if (soid.is_snap())
8275 ctx->delta_stats.num_object_clones--;
8276 if (oi.is_whiteout()) {
8277 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
8278 ctx->delta_stats.num_whiteouts--;
8279 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8280 }
8281 if (oi.is_cache_pinned()) {
8282 ctx->delta_stats.num_objects_pinned--;
8283 }
7c673cae
FG
8284 obs.exists = false;
8285 return 0;
8286}
8287
20effc67 8288int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op)
7c673cae 8289{
7c673cae
FG
8290 ObjectState& obs = ctx->new_obs;
8291 object_info_t& oi = obs.oi;
8292 const hobject_t& soid = oi.soid;
20effc67 8293 snapid_t snapid = (uint64_t)op.op.snap.snapid;
7c673cae
FG
8294 hobject_t missing_oid;
8295
8296 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
8297
8298 ObjectContextRef rollback_to;
11fdf7f2 8299
7c673cae
FG
8300 int ret = find_object_context(
8301 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
8302 soid.get_namespace()),
8303 &rollback_to, false, false, &missing_oid);
8304 if (ret == -EAGAIN) {
8305 /* clone must be missing */
11fdf7f2 8306 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
c07f9fc5 8307 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7c673cae
FG
8308 << missing_oid << " (requested snapid: ) " << snapid << dendl;
8309 block_write_on_degraded_snap(missing_oid, ctx->op);
8310 return ret;
8311 }
8312 {
8313 ObjectContextRef promote_obc;
31f18b77 8314 cache_result_t tier_mode_result;
f67539c2 8315 if (obs.exists && obs.oi.has_manifest()) {
20effc67
TL
8316 /*
8317 * In the case of manifest object, the object_info exists on the base tier at all time,
8318 * so promote_obc should be equal to rollback_to
8319 * */
8320 promote_obc = rollback_to;
f67539c2 8321 tier_mode_result =
31f18b77
FG
8322 maybe_handle_manifest_detail(
8323 ctx->op,
8324 true,
8325 rollback_to);
8326 } else {
f67539c2 8327 tier_mode_result =
31f18b77
FG
8328 maybe_handle_cache_detail(
8329 ctx->op,
8330 true,
8331 rollback_to,
8332 ret,
8333 missing_oid,
8334 true,
8335 false,
8336 &promote_obc);
8337 }
8338 switch (tier_mode_result) {
7c673cae
FG
8339 case cache_result_t::NOOP:
8340 break;
8341 case cache_result_t::BLOCKED_PROMOTE:
11fdf7f2 8342 ceph_assert(promote_obc);
7c673cae
FG
8343 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8344 return -EAGAIN;
8345 case cache_result_t::BLOCKED_FULL:
8346 block_write_on_full_cache(soid, ctx->op);
8347 return -EAGAIN;
b32b8144 8348 case cache_result_t::REPLIED_WITH_EAGAIN:
11fdf7f2 8349 ceph_abort_msg("this can't happen, no rollback on replica");
7c673cae 8350 default:
11fdf7f2 8351 ceph_abort_msg("must promote was set, other values are not valid");
7c673cae
FG
8352 return -EAGAIN;
8353 }
8354 }
8355
8356 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8357 // there's no snapshot here, or there's no object.
8358 // if there's no snapshot, we delete the object; otherwise, do nothing.
8359 dout(20) << "_rollback_to deleting head on " << soid.oid
8360 << " because got ENOENT|whiteout on find_object_context" << dendl;
8361 if (ctx->obc->obs.oi.watchers.size()) {
8362 // Cannot delete an object with watchers
8363 ret = -EBUSY;
8364 } else {
8365 _delete_oid(ctx, false, false);
8366 ret = 0;
8367 }
8368 } else if (ret) {
8369 // ummm....huh? It *can't* return anything else at time of writing.
11fdf7f2 8370 ceph_abort_msg("unexpected error code in _rollback_to");
7c673cae
FG
8371 } else { //we got our context, let's use it to do the rollback!
8372 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
11fdf7f2
TL
8373 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8374 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
7c673cae
FG
8375 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8376 << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
8377 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8378 ret = -EAGAIN;
8379 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8380 // rolling back to the head; we just need to clone it.
8381 ctx->modify = true;
8382 } else {
20effc67
TL
8383 if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
8384 /*
8385 * looking at the following case, the foo head needs the reference of chunk4 and chunk5
8386 * in case snap[1] is removed.
8387 *
8388 * Before rollback to snap[1]:
8389 *
8390 * foo snap[1]: [chunk4] [chunk5]
8391 * foo snap[0]: [ chunk2 ]
8392 * foo head : [chunk1] [chunk3]
8393 *
8394 * After:
8395 *
8396 * foo snap[1]: [chunk4] [chunk5]
8397 * foo snap[0]: [ chunk2 ]
8398 * foo head : [chunk4] [chunk5]
8399 *
8400 */
8401 OpFinisher* op_finisher = nullptr;
8402 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
8403 if (op_finisher_it != ctx->op_finishers.end()) {
8404 op_finisher = op_finisher_it->second.get();
8405 }
8406 if (!op_finisher) {
8407 bool need_inc_ref = inc_refcount_by_set(ctx, rollback_to->obs.oi.manifest, op);
8408 if (need_inc_ref) {
8409 ceph_assert(op_finisher_it == ctx->op_finishers.end());
8410 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8411 new SetManifestFinisher(op));
8412 return -EINPROGRESS;
8413 }
8414 } else {
8415 op_finisher->execute();
8416 ctx->op_finishers.erase(ctx->current_osd_subop_num);
8417 }
7c673cae 8418 }
20effc67 8419 _do_rollback_to(ctx, rollback_to, op);
7c673cae
FG
8420 }
8421 }
8422 return ret;
8423}
8424
20effc67
TL
// _do_rollback_to: perform the actual rollback once _rollback_to() has
// resolved and validated the rollback source.  Replaces the head's
// contents with those of the rollback_to clone and updates the cached
// object state (size, digests, omap/manifest flags) and delta stats.
8425 void PrimaryLogPG::_do_rollback_to(OpContext *ctx, ObjectContextRef rollback_to,
8426 OSDOp& op)
8427 {
8428 SnapSet& snapset = ctx->new_snapset;
8429 ObjectState& obs = ctx->new_obs;
8430 object_info_t& oi = obs.oi;
8431 const hobject_t& soid = oi.soid;
8432 PGTransaction* t = ctx->op_t.get();
8433 snapid_t snapid = (uint64_t)op.op.snap.snapid;
8434 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8435
8436 /* 1) Delete current head
8437 * 2) Clone correct snapshot into head
8438 * 3) Calculate clone_overlaps by following overlaps
8439 * forward from rollback snapshot */
8440 dout(10) << "_do_rollback_to deleting " << soid.oid
8441 << " and rolling back to old snap" << dendl;
8442
8443 if (obs.exists) {
8444 t->remove(soid);
// The old head's manifest is being discarded: release its refcounts
// and strip the manifest state from the in-memory object_info.
8445 if (obs.oi.has_manifest()) {
8446 dec_all_refcount_manifest(obs.oi, ctx);
8447 oi.manifest.clear();
8448 oi.manifest.type = object_manifest_t::TYPE_NONE;
8449 oi.clear_flag(object_info_t::FLAG_MANIFEST);
8450 ctx->delta_stats.num_objects_manifest--;
8451 ctx->cache_operation = true; // do not trigger to call ref function to calculate refcount
8452 }
8453 }
8454 t->clone(soid, rollback_to_sobject);
8455 t->add_obc(rollback_to);
8456
// Intersect clone_overlap from the rollback snap forward to find the
// byte ranges that stayed identical between the rollback source and the
// current head; everything outside that intersection was modified.
8457 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8458 snapset.clone_overlap.lower_bound(snapid);
8459 ceph_assert(iter != snapset.clone_overlap.end());
8460 interval_set<uint64_t> overlaps = iter->second;
8461 for ( ;
8462 iter != snapset.clone_overlap.end();
8463 ++iter)
8464 overlaps.intersection_of(iter->second);
8465
8466 if (obs.oi.size > 0) {
8467 interval_set<uint64_t> modified;
8468 modified.insert(0, obs.oi.size);
8469 overlaps.intersection_of(modified);
8470 modified.subtract(overlaps);
8471 ctx->modified_ranges.union_of(modified);
8472 }
8473
8474 // Adjust the cached objectcontext
8475 maybe_create_new_object(ctx, true);
8476 ctx->delta_stats.num_bytes -= obs.oi.size;
8477 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
// The larger of the two sizes bounds the data region that may differ.
8478 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8479 ctx->clean_regions.mark_omap_dirty();
8480 obs.oi.size = rollback_to->obs.oi.size;
// Take over the source's digests, or invalidate ours if it has none.
8481 if (rollback_to->obs.oi.is_data_digest())
8482 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8483 else
8484 obs.oi.clear_data_digest();
8485 if (rollback_to->obs.oi.is_omap_digest())
8486 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8487 else
8488 obs.oi.clear_omap_digest();
8489
// Chunked-manifest source: head adopts the source's chunk map (the
// refcounts were already taken in _rollback_to()).
8490 if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
8491 obs.oi.set_flag(object_info_t::FLAG_MANIFEST);
8492 obs.oi.manifest.type = rollback_to->obs.oi.manifest.type;
8493 obs.oi.manifest.chunk_map = rollback_to->obs.oi.manifest.chunk_map;
8494 ctx->cache_operation = true;
8495 ctx->delta_stats.num_objects_manifest++;
8496 }
8497
8498 if (rollback_to->obs.oi.is_omap()) {
8499 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8500 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8501 } else {
8502 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8503 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8504 }
8505}
8506
7c673cae
FG
8507void PrimaryLogPG::_make_clone(
8508 OpContext *ctx,
8509 PGTransaction* t,
1e59de90 8510 ObjectContextRef clone_obc,
7c673cae
FG
8511 const hobject_t& head, const hobject_t& coid,
8512 object_info_t *poi)
8513{
8514 bufferlist bv;
11fdf7f2 8515 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7c673cae
FG
8516
8517 t->clone(coid, head);
1e59de90
TL
8518 setattr_maybe_cache(clone_obc, t, OI_ATTR, bv);
8519 rmattr_maybe_cache(clone_obc, t, SS_ATTR);
7c673cae
FG
8520}
8521
8522void PrimaryLogPG::make_writeable(OpContext *ctx)
8523{
8524 const hobject_t& soid = ctx->obs->oi.soid;
8525 SnapContext& snapc = ctx->snapc;
8526
8527 // clone?
11fdf7f2 8528 ceph_assert(soid.snap == CEPH_NOSNAP);
7c673cae
FG
8529 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8530 << " snapc=" << snapc << dendl;
f67539c2 8531
7c673cae
FG
8532 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8533 if (ctx->new_obs.exists) {
8534 // we will mark the object dirty
8535 if (ctx->undirty && was_dirty) {
8536 dout(20) << " clearing DIRTY flag" << dendl;
11fdf7f2 8537 ceph_assert(ctx->new_obs.oi.is_dirty());
7c673cae
FG
8538 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8539 --ctx->delta_stats.num_objects_dirty;
8540 osd->logger->inc(l_osd_tier_clean);
8541 } else if (!was_dirty && !ctx->undirty) {
8542 dout(20) << " setting DIRTY flag" << dendl;
8543 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8544 ++ctx->delta_stats.num_objects_dirty;
8545 osd->logger->inc(l_osd_tier_dirty);
8546 }
8547 } else {
8548 if (was_dirty) {
8549 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8550 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8551 --ctx->delta_stats.num_objects_dirty;
8552 }
8553 }
8554
8555 if ((ctx->new_obs.exists &&
8556 ctx->new_obs.oi.is_omap()) &&
8557 (!ctx->obc->obs.exists ||
8558 !ctx->obc->obs.oi.is_omap())) {
8559 ++ctx->delta_stats.num_objects_omap;
8560 }
8561 if ((!ctx->new_obs.exists ||
8562 !ctx->new_obs.oi.is_omap()) &&
8563 (ctx->obc->obs.exists &&
8564 ctx->obc->obs.oi.is_omap())) {
8565 --ctx->delta_stats.num_objects_omap;
8566 }
8567
7c673cae 8568 if (ctx->new_snapset.seq > snapc.seq) {
11fdf7f2 8569 dout(10) << " op snapset is old" << dendl;
7c673cae
FG
8570 }
8571
8572 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8573 snapc.snaps.size() && // there are snaps
f67539c2 8574 !ctx->cache_operation &&
7c673cae
FG
8575 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8576 // clone
8577 hobject_t coid = soid;
8578 coid.snap = snapc.seq;
f67539c2 8579
1e59de90
TL
8580 const auto snaps = [&] {
8581 auto last = find_if_not(
8582 begin(snapc.snaps), end(snapc.snaps),
8583 [&](snapid_t snap_id) { return snap_id > ctx->new_snapset.seq; });
8584 return vector<snapid_t>{begin(snapc.snaps), last};
8585 }();
f67539c2 8586
7c673cae
FG
8587 // prepare clone
8588 object_info_t static_snap_oi(coid);
8589 object_info_t *snap_oi;
8590 if (is_primary()) {
8591 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
11fdf7f2
TL
8592 ctx->clone_obc->destructor_callback =
8593 new C_PG_ObjectContext(this, ctx->clone_obc.get());
7c673cae
FG
8594 ctx->clone_obc->obs.oi = static_snap_oi;
8595 ctx->clone_obc->obs.exists = true;
8596 ctx->clone_obc->ssc = ctx->obc->ssc;
8597 ctx->clone_obc->ssc->ref++;
11fdf7f2 8598 if (pool.info.is_erasure())
7c673cae
FG
8599 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8600 snap_oi = &ctx->clone_obc->obs.oi;
f67539c2
TL
8601 if (ctx->obc->obs.oi.has_manifest()) {
8602 if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
8603 ctx->obc->obs.oi.manifest.is_redirect()) {
8604 snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8605 snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
8606 snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
8607 } else if (ctx->obc->obs.oi.manifest.is_chunked()) {
8608 snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8609 snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
8610 snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
8611 } else {
8612 ceph_abort_msg("unrecognized manifest type");
8613 }
8614 }
7c673cae
FG
8615 bool got = ctx->lock_manager.get_write_greedy(
8616 coid,
8617 ctx->clone_obc,
8618 ctx->op);
11fdf7f2 8619 ceph_assert(got);
7c673cae
FG
8620 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8621 } else {
8622 snap_oi = &static_snap_oi;
8623 }
8624 snap_oi->version = ctx->at_version;
8625 snap_oi->prior_version = ctx->obs->oi.version;
8626 snap_oi->copy_user_bits(ctx->obs->oi);
8627
7c673cae 8628 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
f67539c2 8629
7c673cae
FG
8630 ctx->delta_stats.num_objects++;
8631 if (snap_oi->is_dirty()) {
8632 ctx->delta_stats.num_objects_dirty++;
8633 osd->logger->inc(l_osd_tier_dirty);
8634 }
8635 if (snap_oi->is_omap())
8636 ctx->delta_stats.num_objects_omap++;
8637 if (snap_oi->is_cache_pinned())
8638 ctx->delta_stats.num_objects_pinned++;
11fdf7f2
TL
8639 if (snap_oi->has_manifest())
8640 ctx->delta_stats.num_objects_manifest++;
7c673cae
FG
8641 ctx->delta_stats.num_object_clones++;
8642 ctx->new_snapset.clones.push_back(coid.snap);
8643 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
11fdf7f2 8644 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7c673cae 8645
f67539c2 8646 // clone_overlap should contain an entry for each clone
7c673cae
FG
8647 // (an empty interval_set if there is no overlap)
8648 ctx->new_snapset.clone_overlap[coid.snap];
1e59de90 8649 if (ctx->obs->oi.size) {
7c673cae 8650 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
1e59de90 8651 }
f67539c2 8652
7c673cae
FG
8653 // log clone
8654 dout(10) << " cloning v " << ctx->obs->oi.version
8655 << " to " << coid << " v " << ctx->at_version
8656 << " snaps=" << snaps
8657 << " snapset=" << ctx->new_snapset << dendl;
11fdf7f2
TL
8658 ctx->log.push_back(pg_log_entry_t(
8659 pg_log_entry_t::CLONE, coid, ctx->at_version,
8660 ctx->obs->oi.version,
8661 ctx->obs->oi.user_version,
8662 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8663 encode(snaps, ctx->log.back().snaps);
7c673cae
FG
8664
8665 ctx->at_version.version++;
8666 }
8667
8668 // update most recent clone_overlap and usage stats
8669 if (ctx->new_snapset.clones.size() > 0) {
11fdf7f2
TL
8670 // the clone_overlap is difference of range between head and clones.
8671 // we need to check whether the most recent clone exists, if it's
8672 // been evicted, it's not included in the stats, but the clone_overlap
8673 // is still exist in the snapset, so we should update the
8674 // clone_overlap to make it sense.
7c673cae
FG
8675 hobject_t last_clone_oid = soid;
8676 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
11fdf7f2
TL
8677 interval_set<uint64_t> &newest_overlap =
8678 ctx->new_snapset.clone_overlap.rbegin()->second;
8679 ctx->modified_ranges.intersection_of(newest_overlap);
7c673cae 8680 if (is_present_clone(last_clone_oid)) {
7c673cae 8681 // modified_ranges is still in use by the clone
11fdf7f2 8682 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
7c673cae 8683 }
11fdf7f2 8684 newest_overlap.subtract(ctx->modified_ranges);
7c673cae 8685 }
f67539c2 8686
11fdf7f2
TL
8687 if (snapc.seq > ctx->new_snapset.seq) {
8688 // update snapset with latest snap context
8689 ctx->new_snapset.seq = snapc.seq;
9f95a23c
TL
8690 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8691 ctx->new_snapset.snaps = snapc.snaps;
8692 } else {
8693 ctx->new_snapset.snaps.clear();
8694 }
7c673cae
FG
8695 }
8696 dout(20) << "make_writeable " << soid
8697 << " done, snapset=" << ctx->new_snapset << dendl;
8698}
8699
8700
8701void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8702 interval_set<uint64_t>& modified, uint64_t offset,
8703 uint64_t length, bool write_full)
8704{
8705 interval_set<uint64_t> ch;
8706 if (write_full) {
8707 if (oi.size)
8708 ch.insert(0, oi.size);
8709 } else if (length)
8710 ch.insert(offset, length);
8711 modified.union_of(ch);
11fdf7f2
TL
8712 if (write_full ||
8713 (offset + length > oi.size && length)) {
7c673cae
FG
8714 uint64_t new_size = offset + length;
8715 delta_stats.num_bytes -= oi.size;
8716 delta_stats.num_bytes += new_size;
8717 oi.size = new_size;
8718 }
f67539c2 8719
7c673cae 8720 delta_stats.num_wr++;
11fdf7f2 8721 delta_stats.num_wr_kb += shift_round_up(length, 10);
7c673cae
FG
8722}
8723
11fdf7f2
TL
8724void PrimaryLogPG::truncate_update_size_and_usage(
8725 object_stat_sum_t& delta_stats,
8726 object_info_t& oi,
8727 uint64_t truncate_size)
7c673cae 8728{
11fdf7f2
TL
8729 if (oi.size != truncate_size) {
8730 delta_stats.num_bytes -= oi.size;
8731 delta_stats.num_bytes += truncate_size;
8732 oi.size = truncate_size;
7c673cae
FG
8733 }
8734}
8735
8736void PrimaryLogPG::complete_disconnect_watches(
8737 ObjectContextRef obc,
8738 const list<watch_disconnect_t> &to_disconnect)
8739{
8740 for (list<watch_disconnect_t>::const_iterator i =
8741 to_disconnect.begin();
8742 i != to_disconnect.end();
8743 ++i) {
8744 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8745 auto watchers_entry = obc->watchers.find(watcher);
8746 if (watchers_entry != obc->watchers.end()) {
8747 WatchRef watch = watchers_entry->second;
8748 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8749 obc->watchers.erase(watcher);
8750 watch->remove(i->send_disconnect);
8751 } else {
8752 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8753 << watcher << dendl;
8754 }
8755 }
8756}
8757
8758void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8759{
8760 entity_name_t entity = ctx->reqid.name;
8761 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8762
8763 // disconnects first
8764 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8765
11fdf7f2 8766 ceph_assert(conn);
7c673cae 8767
11fdf7f2
TL
8768 auto session = conn->get_priv();
8769 if (!session)
7c673cae 8770 return;
7c673cae
FG
8771
8772 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8773 i != ctx->watch_connects.end();
8774 ++i) {
8775 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8776 dout(15) << "do_osd_op_effects applying watch connect on session "
8777 << session.get() << " watcher " << watcher << dendl;
8778 WatchRef watch;
8779 if (ctx->obc->watchers.count(watcher)) {
8780 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8781 << dendl;
8782 watch = ctx->obc->watchers[watcher];
8783 } else {
8784 dout(15) << "do_osd_op_effects new watcher " << watcher
8785 << dendl;
8786 watch = Watch::makeWatchRef(
8787 this, osd, ctx->obc, i->first.timeout_seconds,
8788 i->first.cookie, entity, conn->get_peer_addr());
8789 ctx->obc->watchers.insert(
8790 make_pair(
8791 watcher,
8792 watch));
8793 }
8794 watch->connect(conn, i->second);
8795 }
8796
8797 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8798 p != ctx->notifies.end();
8799 ++p) {
8800 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8801 ConnectionRef conn(ctx->op->get_req()->get_connection());
8802 NotifyRef notif(
8803 Notify::makeNotifyRef(
8804 conn,
8805 ctx->reqid.name.num(),
8806 p->bl,
8807 p->timeout,
8808 p->cookie,
8809 p->notify_id,
8810 ctx->obc->obs.oi.user_version,
8811 osd));
8812 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8813 ctx->obc->watchers.begin();
8814 i != ctx->obc->watchers.end();
8815 ++i) {
8816 dout(10) << "starting notify on watch " << i->first << dendl;
8817 i->second->start_notify(notif);
8818 }
8819 notif->init();
8820 }
8821
8822 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8823 p != ctx->notify_acks.end();
8824 ++p) {
8825 if (p->watch_cookie)
9f95a23c 8826 dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
7c673cae
FG
8827 else
8828 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8829 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8830 ctx->obc->watchers.begin();
8831 i != ctx->obc->watchers.end();
8832 ++i) {
8833 if (i->first.second != entity) continue;
8834 if (p->watch_cookie &&
9f95a23c 8835 *(p->watch_cookie) != i->first.first) continue;
7c673cae
FG
8836 dout(10) << "acking notify on watch " << i->first << dendl;
8837 i->second->notify_ack(p->notify_id, p->reply_bl);
8838 }
8839 }
8840}
8841
8842hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8843{
8844 ostringstream ss;
8845 ss << "temp_" << info.pgid << "_" << get_role()
8846 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8847 hobject_t hoid = target.make_temp_hobject(ss.str());
8848 dout(20) << __func__ << " " << hoid << dendl;
8849 return hoid;
8850}
8851
8852hobject_t PrimaryLogPG::get_temp_recovery_object(
8853 const hobject_t& target,
8854 eversion_t version)
8855{
8856 ostringstream ss;
8857 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8858 << "_" << version
8859 << "_" << info.history.same_interval_since
8860 << "_" << target.snap;
8861 // pgid + version + interval + snapid is unique, and short
8862 hobject_t hoid = target.make_temp_hobject(ss.str());
8863 dout(20) << __func__ << " " << hoid << dendl;
8864 return hoid;
8865}
8866
8867int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8868{
11fdf7f2 8869 ceph_assert(!ctx->ops->empty());
7c673cae
FG
8870
8871 // valid snap context?
8872 if (!ctx->snapc.is_valid()) {
8873 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8874 return -EINVAL;
8875 }
8876
8877 // prepare the actual mutation
c07f9fc5 8878 int result = do_osd_ops(ctx, *ctx->ops);
7c673cae
FG
8879 if (result < 0) {
8880 if (ctx->op->may_write() &&
9f95a23c 8881 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
8882 // need to save the error code in the pg log, to detect dup ops,
8883 // but do nothing else
8884 ctx->update_log_only = true;
8885 }
8886 return result;
8887 }
8888
8889 // read-op? write-op noop? done?
8890 if (ctx->op_t->empty() && !ctx->modify) {
11fdf7f2
TL
8891 if (ctx->pending_async_reads.empty())
8892 unstable_stats.add(ctx->delta_stats);
7c673cae 8893 if (ctx->op->may_write() &&
9f95a23c 8894 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
8895 ctx->update_log_only = true;
8896 }
8897 return result;
8898 }
8899
8900 // check for full
8901 if ((ctx->delta_stats.num_bytes > 0 ||
8902 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
9f95a23c
TL
8903 pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8904 auto m = ctx->op->get_req<MOSDOp>();
7c673cae
FG
8905 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8906 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8907 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8908 << dendl;
8909 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8910 // they tried, they failed.
8911 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
11fdf7f2 8912 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
7c673cae
FG
8913 } else {
8914 // drop request
8915 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8916 return -EAGAIN;
8917 }
8918 }
8919
11fdf7f2 8920 const hobject_t& soid = ctx->obs->oi.soid;
7c673cae
FG
8921 // clone, if necessary
8922 if (soid.snap == CEPH_NOSNAP)
8923 make_writeable(ctx);
8924
8925 finish_ctx(ctx,
8926 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
9f95a23c
TL
8927 pg_log_entry_t::DELETE,
8928 result);
7c673cae
FG
8929
8930 return result;
8931}
8932
9f95a23c 8933void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
7c673cae
FG
8934{
8935 const hobject_t& soid = ctx->obs->oi.soid;
8936 dout(20) << __func__ << " " << soid << " " << ctx
8937 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8938 << dendl;
8939 utime_t now = ceph_clock_now();
8940
20effc67 8941
f67539c2
TL
8942 // Drop the reference if deduped chunk is modified
8943 if (ctx->new_obs.oi.is_dirty() &&
8944 (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
f67539c2
TL
8945 !ctx->cache_operation &&
8946 log_op_type != pg_log_entry_t::PROMOTE) {
20effc67
TL
8947 update_chunk_map_by_dirty(ctx);
8948 // If a clone is creating, ignore dropping the reference for manifest object
8949 if (!ctx->delta_stats.num_object_clones) {
8950 dec_refcount_by_dirty(ctx);
8951 }
f67539c2
TL
8952 }
8953
7c673cae
FG
8954 // finish and log the op.
8955 if (ctx->user_modify) {
8956 // update the user_version for any modify ops, except for the watch op
11fdf7f2 8957 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7c673cae
FG
8958 /* In order for new clients and old clients to interoperate properly
8959 * when exchanging versions, we need to lower bound the user_version
8960 * (which our new clients pay proper attention to)
8961 * by the at_version (which is all the old clients can ever see). */
8962 if (ctx->at_version.version > ctx->user_at_version)
8963 ctx->user_at_version = ctx->at_version.version;
8964 ctx->new_obs.oi.user_version = ctx->user_at_version;
8965 }
8966 ctx->bytes_written = ctx->op_t->get_bytes_written();
f67539c2 8967
7c673cae 8968 if (ctx->new_obs.exists) {
7c673cae
FG
8969 ctx->new_obs.oi.version = ctx->at_version;
8970 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8971 ctx->new_obs.oi.last_reqid = ctx->reqid;
8972 if (ctx->mtime != utime_t()) {
8973 ctx->new_obs.oi.mtime = ctx->mtime;
8974 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8975 ctx->new_obs.oi.local_mtime = now;
8976 } else {
8977 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8978 }
8979
11fdf7f2 8980 // object_info_t
20effc67 8981 map <string, bufferlist, less<>> attrs;
7c673cae 8982 bufferlist bv(sizeof(ctx->new_obs.oi));
11fdf7f2 8983 encode(ctx->new_obs.oi, bv,
7c673cae 8984 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
f67539c2 8985 attrs[OI_ATTR] = std::move(bv);
7c673cae 8986
11fdf7f2 8987 // snapset
7c673cae
FG
8988 if (soid.snap == CEPH_NOSNAP) {
8989 dout(10) << " final snapset " << ctx->new_snapset
8990 << " in " << soid << dendl;
11fdf7f2
TL
8991 bufferlist bss;
8992 encode(ctx->new_snapset, bss);
f67539c2 8993 attrs[SS_ATTR] = std::move(bss);
7c673cae
FG
8994 } else {
8995 dout(10) << " no snapset (this is a clone)" << dendl;
8996 }
8997 ctx->op_t->setattrs(soid, attrs);
8998 } else {
11fdf7f2 8999 // reset cached oi
7c673cae
FG
9000 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
9001 }
9002
7c673cae 9003 // append to log
9f95a23c
TL
9004 ctx->log.push_back(
9005 pg_log_entry_t(log_op_type, soid, ctx->at_version,
9006 ctx->obs->oi.version,
9007 ctx->user_at_version, ctx->reqid,
9008 ctx->mtime,
9009 (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
9010 if (ctx->op && ctx->op->allows_returnvec()) {
9011 // also the per-op values
9012 ctx->log.back().set_op_returns(*ctx->ops);
9013 dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
9014 << dendl;
9015 }
9016
9017 ctx->log.back().clean_regions = ctx->clean_regions;
9018 dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
9019
7c673cae
FG
9020 if (soid.snap < CEPH_NOSNAP) {
9021 switch (log_op_type) {
9022 case pg_log_entry_t::MODIFY:
9023 case pg_log_entry_t::PROMOTE:
9024 case pg_log_entry_t::CLEAN:
11fdf7f2
TL
9025 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
9026 << dendl;
9027 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7c673cae
FG
9028 break;
9029 default:
9030 break;
9031 }
9032 }
9033
9034 if (!ctx->extra_reqids.empty()) {
11fdf7f2
TL
9035 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
9036 << ctx->extra_reqid_return_codes << dendl;
7c673cae 9037 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
11fdf7f2 9038 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
7c673cae
FG
9039 }
9040
9041 // apply new object state.
9042 ctx->obc->obs = ctx->new_obs;
9043
11fdf7f2 9044 if (soid.is_head() && !ctx->obc->obs.exists) {
7c673cae
FG
9045 ctx->obc->ssc->exists = false;
9046 ctx->obc->ssc->snapset = SnapSet();
9047 } else {
9048 ctx->obc->ssc->exists = true;
9049 ctx->obc->ssc->snapset = ctx->new_snapset;
9050 }
9051}
9052
9053void PrimaryLogPG::apply_stats(
9054 const hobject_t &soid,
9055 const object_stat_sum_t &delta_stats) {
9056
9f95a23c
TL
9057 recovery_state.apply_op_stats(soid, delta_stats);
9058 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
9059 i != get_backfill_targets().end();
7c673cae
FG
9060 ++i) {
9061 pg_shard_t bt = *i;
9f95a23c
TL
9062 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
9063 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
7c673cae 9064 pending_backfill_updates[soid].stats.add(delta_stats);
9f95a23c 9065 }
7c673cae
FG
9066 }
9067
f67539c2 9068 m_scrubber->stats_of_handled_objects(delta_stats, soid);
7c673cae
FG
9069}
9070
9071void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
9072{
9f95a23c 9073 auto m = ctx->op->get_req<MOSDOp>();
11fdf7f2 9074 ceph_assert(ctx->async_reads_complete());
7c673cae 9075
f67539c2 9076 for (auto p = ctx->ops->begin();
c07f9fc5 9077 p != ctx->ops->end() && result >= 0; ++p) {
7c673cae
FG
9078 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
9079 result = p->rval;
9080 break;
9081 }
9082 ctx->bytes_read += p->outdata.length();
9083 }
c07f9fc5 9084 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7c673cae
FG
9085
9086 MOSDOpReply *reply = ctx->reply;
9087 ctx->reply = nullptr;
9088
9089 if (result >= 0) {
9090 if (!ctx->ignore_log_op_stats) {
11fdf7f2
TL
9091 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
9092
7c673cae
FG
9093 publish_stats_to_osd();
9094 }
9095
9096 // on read, return the current object version
9097 if (ctx->obs) {
9098 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
9099 } else {
9100 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
9101 }
9102 } else if (result == -ENOENT) {
9103 // on ENOENT, set a floor for what the next user version will be.
9104 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
9105 }
9106
9107 reply->set_result(result);
9108 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9109 osd->send_message_osd_client(reply, m->get_connection());
9110 close_op_ctx(ctx);
9111}
9112
9113// ========================================================================
9114// copyfrom
9115
9116struct C_Copyfrom : public Context {
9117 PrimaryLogPGRef pg;
9118 hobject_t oid;
9119 epoch_t last_peering_reset;
9120 ceph_tid_t tid;
11fdf7f2 9121 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
7c673cae
FG
9122 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
9123 const PrimaryLogPG::CopyOpRef& c)
9124 : pg(p), oid(o), last_peering_reset(lpr),
9125 tid(0), cop(c)
9126 {}
9127 void finish(int r) override {
9128 if (r == -ECANCELED)
9129 return;
9f95a23c 9130 std::scoped_lock l{*pg};
7c673cae
FG
9131 if (last_peering_reset == pg->get_last_peering_reset()) {
9132 pg->process_copy_chunk(oid, tid, r);
11fdf7f2 9133 cop.reset();
7c673cae 9134 }
7c673cae
FG
9135 }
9136};
9137
9138struct C_CopyFrom_AsyncReadCb : public Context {
9139 OSDOp *osd_op;
9140 object_copy_data_t reply_obj;
9141 uint64_t features;
9142 size_t len;
9143 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
9144 osd_op(osd_op), features(features), len(0) {}
9145 void finish(int r) override {
c07f9fc5
FG
9146 osd_op->rval = r;
9147 if (r < 0) {
9148 return;
9149 }
9150
11fdf7f2
TL
9151 ceph_assert(len > 0);
9152 ceph_assert(len <= reply_obj.data.length());
7c673cae
FG
9153 bufferlist bl;
9154 bl.substr_of(reply_obj.data, 0, len);
9155 reply_obj.data.swap(bl);
11fdf7f2
TL
9156 encode(reply_obj, osd_op->outdata, features);
9157 }
9158};
9159
9160struct C_CopyChunk : public Context {
9161 PrimaryLogPGRef pg;
9162 hobject_t oid;
9163 epoch_t last_peering_reset;
9164 ceph_tid_t tid;
9165 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
9166 uint64_t offset = 0;
9167 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
9168 const PrimaryLogPG::CopyOpRef& c)
9169 : pg(p), oid(o), last_peering_reset(lpr),
f67539c2 9170 tid(0), cop(c)
11fdf7f2
TL
9171 {}
9172 void finish(int r) override {
9173 if (r == -ECANCELED)
9174 return;
9f95a23c 9175 std::scoped_lock l{*pg};
11fdf7f2
TL
9176 if (last_peering_reset == pg->get_last_peering_reset()) {
9177 pg->process_copy_chunk_manifest(oid, tid, r, offset);
9178 cop.reset();
9179 }
7c673cae
FG
9180 }
9181};
9182
11fdf7f2 9183int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
c07f9fc5 9184 OSDOp& osd_op, ObjectContextRef &obc)
7c673cae
FG
9185{
9186 object_info_t& oi = obc->obs.oi;
9187 hobject_t& soid = oi.soid;
9188 int result = 0;
9189 object_copy_cursor_t cursor;
9190 uint64_t out_max;
9191 try {
11fdf7f2
TL
9192 decode(cursor, bp);
9193 decode(out_max, bp);
7c673cae 9194 }
f67539c2 9195 catch (ceph::buffer::error& e) {
7c673cae
FG
9196 result = -EINVAL;
9197 return result;
9198 }
9199
9200 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
9201 uint64_t features = op->get_features();
9202
9203 bool async_read_started = false;
9204 object_copy_data_t _reply_obj;
11fdf7f2
TL
9205 C_CopyFrom_AsyncReadCb *cb = nullptr;
9206 if (pool.info.is_erasure()) {
7c673cae
FG
9207 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
9208 }
9209 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
9210 // size, mtime
9211 reply_obj.size = oi.size;
9212 reply_obj.mtime = oi.mtime;
11fdf7f2 9213 ceph_assert(obc->ssc);
7c673cae 9214 if (soid.snap < CEPH_NOSNAP) {
11fdf7f2
TL
9215 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
9216 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
9217 reply_obj.snaps = p->second;
7c673cae
FG
9218 } else {
9219 reply_obj.snap_seq = obc->ssc->snapset.seq;
9220 }
11fdf7f2 9221 if (oi.is_data_digest()) {
7c673cae
FG
9222 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
9223 reply_obj.data_digest = oi.data_digest;
9224 }
9225 if (oi.is_omap_digest()) {
9226 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
9227 reply_obj.omap_digest = oi.omap_digest;
9228 }
9229 reply_obj.truncate_seq = oi.truncate_seq;
9230 reply_obj.truncate_size = oi.truncate_size;
9231
9232 // attrs
20effc67 9233 map<string,bufferlist,less<>>& out_attrs = reply_obj.attrs;
7c673cae
FG
9234 if (!cursor.attr_complete) {
9235 result = getattrs_maybe_cache(
9236 ctx->obc,
b32b8144 9237 &out_attrs);
7c673cae
FG
9238 if (result < 0) {
9239 if (cb) {
9240 delete cb;
9241 }
9242 return result;
9243 }
9244 cursor.attr_complete = true;
9245 dout(20) << " got attrs" << dendl;
9246 }
9247
9248 int64_t left = out_max - osd_op.outdata.length();
9249
9250 // data
9251 bufferlist& bl = reply_obj.data;
9252 if (left > 0 && !cursor.data_complete) {
9253 if (cursor.data_offset < oi.size) {
11fdf7f2 9254 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
7c673cae
FG
9255 if (cb) {
9256 async_read_started = true;
9257 ctx->pending_async_reads.push_back(
9258 make_pair(
9259 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
9260 make_pair(&bl, cb)));
c07f9fc5
FG
9261 cb->len = max_read;
9262
9263 ctx->op_finishers[ctx->current_osd_subop_num].reset(
9264 new ReadFinisher(osd_op));
9265 result = -EINPROGRESS;
9266
9267 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7c673cae
FG
9268 } else {
9269 result = pgbackend->objects_read_sync(
c07f9fc5 9270 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7c673cae
FG
9271 if (result < 0)
9272 return result;
9273 }
c07f9fc5
FG
9274 left -= max_read;
9275 cursor.data_offset += max_read;
7c673cae
FG
9276 }
9277 if (cursor.data_offset == oi.size) {
9278 cursor.data_complete = true;
9279 dout(20) << " got data" << dendl;
9280 }
11fdf7f2 9281 ceph_assert(cursor.data_offset <= oi.size);
7c673cae
FG
9282 }
9283
9284 // omap
9285 uint32_t omap_keys = 0;
9286 if (!pool.info.supports_omap() || !oi.is_omap()) {
9287 cursor.omap_complete = true;
9288 } else {
9289 if (left > 0 && !cursor.omap_complete) {
11fdf7f2 9290 ceph_assert(cursor.data_complete);
7c673cae
FG
9291 if (cursor.omap_offset.empty()) {
9292 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
9293 &reply_obj.omap_header);
9294 }
9295 bufferlist omap_data;
9296 ObjectMap::ObjectMapIterator iter =
11fdf7f2
TL
9297 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
9298 ceph_assert(iter);
7c673cae 9299 iter->upper_bound(cursor.omap_offset);
11fdf7f2 9300 for (; iter->valid(); iter->next()) {
7c673cae 9301 ++omap_keys;
11fdf7f2
TL
9302 encode(iter->key(), omap_data);
9303 encode(iter->value(), omap_data);
7c673cae
FG
9304 left -= iter->key().length() + 4 + iter->value().length() + 4;
9305 if (left <= 0)
9306 break;
9307 }
9308 if (omap_keys) {
11fdf7f2 9309 encode(omap_keys, reply_obj.omap_data);
7c673cae
FG
9310 reply_obj.omap_data.claim_append(omap_data);
9311 }
9312 if (iter->valid()) {
9313 cursor.omap_offset = iter->key();
9314 } else {
9315 cursor.omap_complete = true;
9316 dout(20) << " got omap" << dendl;
9317 }
9318 }
9319 }
9320
9321 if (cursor.is_complete()) {
9322 // include reqids only in the final step. this is a bit fragile
9323 // but it works...
9f95a23c 9324 recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
11fdf7f2
TL
9325 &reply_obj.reqids,
9326 &reply_obj.reqid_return_codes);
7c673cae
FG
9327 dout(20) << " got reqids" << dendl;
9328 }
9329
9330 dout(20) << " cursor.is_complete=" << cursor.is_complete()
9331 << " " << out_attrs.size() << " attrs"
9332 << " " << bl.length() << " bytes"
9333 << " " << reply_obj.omap_header.length() << " omap header bytes"
9334 << " " << reply_obj.omap_data.length() << " omap data bytes in "
9335 << omap_keys << " keys"
9336 << " " << reply_obj.reqids.size() << " reqids"
9337 << dendl;
9338 reply_obj.cursor = cursor;
9339 if (!async_read_started) {
11fdf7f2 9340 encode(reply_obj, osd_op.outdata, features);
7c673cae
FG
9341 }
9342 if (cb && !async_read_started) {
9343 delete cb;
9344 }
c07f9fc5
FG
9345
9346 if (result > 0) {
9347 result = 0;
9348 }
7c673cae
FG
9349 return result;
9350}
9351
9352void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
9353 OSDOp& osd_op)
9354{
9f95a23c 9355 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
7c673cae
FG
9356 uint64_t features = m->get_features();
9357 object_copy_data_t reply_obj;
9358
9f95a23c 9359 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
11fdf7f2 9360 &reply_obj.reqid_return_codes);
7c673cae 9361 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
11fdf7f2 9362 encode(reply_obj, osd_op.outdata, features);
7c673cae 9363 osd_op.rval = -ENOENT;
11fdf7f2 9364 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
7c673cae
FG
9365 reply->set_result(-ENOENT);
9366 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9367 osd->send_message_osd_client(reply, m->get_connection());
9368}
9369
9370void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
9371 hobject_t src, object_locator_t oloc,
9372 version_t version, unsigned flags,
9373 bool mirror_snapset,
9374 unsigned src_obj_fadvise_flags,
9375 unsigned dest_obj_fadvise_flags)
9376{
9377 const hobject_t& dest = obc->obs.oi.soid;
9378 dout(10) << __func__ << " " << dest
9379 << " from " << src << " " << oloc << " v" << version
9380 << " flags " << flags
9381 << (mirror_snapset ? " mirror_snapset" : "")
9382 << dendl;
9383
11fdf7f2 9384 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
7c673cae
FG
9385
9386 // cancel a previous in-progress copy?
9387 if (copy_ops.count(dest)) {
9388 // FIXME: if the src etc match, we could avoid restarting from the
9389 // beginning.
9390 CopyOpRef cop = copy_ops[dest];
94b18763
FG
9391 vector<ceph_tid_t> tids;
9392 cancel_copy(cop, false, &tids);
9393 osd->objecter->op_cancel(tids, -ECANCELED);
7c673cae
FG
9394 }
9395
9396 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
9397 mirror_snapset, src_obj_fadvise_flags,
9398 dest_obj_fadvise_flags));
9399 copy_ops[dest] = cop;
1e59de90 9400 dout(20) << fmt::format("{}: blocking {}", __func__, dest) << dendl;
7c673cae
FG
9401 obc->start_block();
9402
11fdf7f2
TL
9403 if (!obc->obs.oi.has_manifest()) {
9404 _copy_some(obc, cop);
9405 } else {
9406 if (obc->obs.oi.manifest.is_redirect()) {
9407 _copy_some(obc, cop);
9408 } else if (obc->obs.oi.manifest.is_chunked()) {
9409 auto p = obc->obs.oi.manifest.chunk_map.begin();
9410 _copy_some_manifest(obc, cop, p->first);
9411 } else {
9412 ceph_abort_msg("unrecognized manifest type");
9413 }
9414 }
7c673cae
FG
9415}
9416
9417void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9418{
91327a77 9419 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
7c673cae
FG
9420
9421 unsigned flags = 0;
9422 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9423 flags |= CEPH_OSD_FLAG_FLUSH;
9424 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9425 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9426 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9427 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9428 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9429 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9430 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9431 flags |= CEPH_OSD_FLAG_RWORDERED;
9432
9433 C_GatherBuilder gather(cct);
9434
9435 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9436 // list snaps too.
11fdf7f2 9437 ceph_assert(cop->src.snap == CEPH_NOSNAP);
7c673cae
FG
9438 ObjectOperation op;
9439 op.list_snaps(&cop->results.snapset, NULL);
9440 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9441 CEPH_SNAPDIR, NULL,
9442 flags, gather.new_sub(), NULL);
9443 cop->objecter_tid2 = tid;
9444 }
9445
9446 ObjectOperation op;
9447 if (cop->results.user_version) {
9448 op.assert_version(cop->results.user_version);
9449 } else {
9450 // we should learn the version after the first chunk, if we didn't know
9451 // it already!
11fdf7f2 9452 ceph_assert(cop->cursor.is_initial());
7c673cae
FG
9453 }
9454 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9455 &cop->results.object_size, &cop->results.mtime,
9456 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9457 &cop->results.snaps, &cop->results.snap_seq,
9458 &cop->results.flags,
9459 &cop->results.source_data_digest,
9460 &cop->results.source_omap_digest,
9461 &cop->results.reqids,
11fdf7f2 9462 &cop->results.reqid_return_codes,
7c673cae
FG
9463 &cop->results.truncate_seq,
9464 &cop->results.truncate_size,
9465 &cop->rval);
9466 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9467
9468 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9469 get_last_peering_reset(), cop);
9470 gather.set_finisher(new C_OnFinisher(fin,
9f95a23c 9471 osd->get_objecter_finisher(get_pg_shard())));
7c673cae
FG
9472
9473 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9474 cop->src.snap, NULL,
9475 flags,
9476 gather.new_sub(),
9477 // discover the object version if we don't know it yet
9478 cop->results.user_version ? NULL : &cop->results.user_version);
9479 fin->tid = tid;
9480 cop->objecter_tid = tid;
9481 gather.activate();
9482}
9483
11fdf7f2
TL
9484void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9485{
9486 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9487
9488 unsigned flags = 0;
9489 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9490 flags |= CEPH_OSD_FLAG_FLUSH;
9491 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9492 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9493 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9494 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9495 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9496 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9497 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9498 flags |= CEPH_OSD_FLAG_RWORDERED;
9499
9500 int num_chunks = 0;
9501 uint64_t last_offset = 0, chunks_size = 0;
9502 object_manifest_t *manifest = &obc->obs.oi.manifest;
f67539c2 9503 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
11fdf7f2
TL
9504 for (;iter != manifest->chunk_map.end(); ++iter) {
9505 num_chunks++;
9506 chunks_size += iter->second.length;
9507 last_offset = iter->first;
9508 if (get_copy_chunk_size() < chunks_size) {
9509 break;
9510 }
9511 }
9512
9513 cop->num_chunk = num_chunks;
9514 cop->start_offset = start_offset;
9515 cop->last_offset = last_offset;
9516 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
f67539c2 9517 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
11fdf7f2
TL
9518 << " last_offset: " << last_offset << dendl;
9519
9520 iter = manifest->chunk_map.find(start_offset);
9521 for (;iter != manifest->chunk_map.end(); ++iter) {
9522 uint64_t obj_offset = iter->first;
9523 uint64_t length = manifest->chunk_map[iter->first].length;
9524 hobject_t soid = manifest->chunk_map[iter->first].oid;
9525 object_locator_t oloc(soid);
9526 CopyCallback * cb = NULL;
9527 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9528 cop->results.user_version, cop->flags, cop->mirror_snapset,
9529 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9530 sub_cop->cursor.data_offset = obj_offset;
9531 cop->chunk_cops[obj_offset] = sub_cop;
9532
9533 int s = sub_cop->chunk_ops.size();
9534 sub_cop->chunk_ops.resize(s+1);
9535 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9536 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9537 sub_cop->chunk_ops[s].op.extent.length = length;
9538
9539 ObjectOperation op;
9540 op.dup(sub_cop->chunk_ops);
9541
11fdf7f2
TL
9542 if (cop->results.user_version) {
9543 op.assert_version(cop->results.user_version);
9544 } else {
9545 // we should learn the version after the first chunk, if we didn't know
9546 // it already!
9547 ceph_assert(cop->cursor.is_initial());
9548 }
9549 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9550
9551 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9552 get_last_peering_reset(), cop);
9553 fin->offset = obj_offset;
9f95a23c
TL
9554
9555 ceph_tid_t tid = osd->objecter->read(
9556 soid.oid, oloc, op,
9557 sub_cop->src.snap, NULL,
9558 flags,
9559 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9560 // discover the object version if we don't know it yet
9561 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
11fdf7f2
TL
9562 fin->tid = tid;
9563 sub_cop->objecter_tid = tid;
f67539c2
TL
9564
9565 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9566 << manifest->chunk_map[iter->first].offset
9567 << " length: " << length << " pool id: " << oloc.pool
9568 << " tid: " << tid << dendl;
9569
20effc67 9570 if (last_offset <= iter->first) {
11fdf7f2
TL
9571 break;
9572 }
9573 }
9574}
9575
7c673cae
FG
9576void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9577{
9578 dout(10) << __func__ << " " << oid << " tid " << tid
9579 << " " << cpp_strerror(r) << dendl;
9580 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9581 if (p == copy_ops.end()) {
9582 dout(10) << __func__ << " no copy_op found" << dendl;
9583 return;
9584 }
9585 CopyOpRef cop = p->second;
9586 if (tid != cop->objecter_tid) {
9587 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9588 << " tid " << cop->objecter_tid << dendl;
9589 return;
9590 }
9591
9592 if (cop->omap_data.length() || cop->omap_header.length())
9593 cop->results.has_omap = true;
9594
9595 if (r >= 0 && !pool.info.supports_omap() &&
9596 (cop->omap_data.length() || cop->omap_header.length())) {
9597 r = -EOPNOTSUPP;
9598 }
9599 cop->objecter_tid = 0;
9600 cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
9601 ObjectContextRef& cobc = cop->obc;
9602
9603 if (r < 0)
9604 goto out;
9605
11fdf7f2 9606 ceph_assert(cop->rval >= 0);
7c673cae
FG
9607
9608 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9609 // verify snap hasn't been deleted
9610 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9611 while (p != cop->results.snaps.end()) {
9f95a23c
TL
9612 // make best effort to sanitize snaps/clones.
9613 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
7c673cae
FG
9614 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9615 << dendl;
9616 for (vector<snapid_t>::iterator q = p + 1;
9617 q != cop->results.snaps.end();
9618 ++q)
9619 *(q - 1) = *q;
9620 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9621 } else {
9622 ++p;
9623 }
9624 }
9625 if (cop->results.snaps.empty()) {
9626 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9627 r = -ENOENT;
9628 goto out;
9629 }
9630 }
9631
11fdf7f2 9632 ceph_assert(cop->rval >= 0);
7c673cae
FG
9633
9634 if (!cop->temp_cursor.data_complete) {
9635 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9636 }
9637 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9638 if (cop->omap_header.length()) {
9639 cop->results.omap_digest =
9640 cop->omap_header.crc32c(cop->results.omap_digest);
9641 }
9642 if (cop->omap_data.length()) {
9643 bufferlist keys;
9644 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9645 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9646 }
9647 }
9648
9649 if (!cop->temp_cursor.attr_complete) {
9650 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9651 p != cop->attrs.end();
9652 ++p) {
9653 cop->results.attrs[string("_") + p->first] = p->second;
9654 }
9655 cop->attrs.clear();
9656 }
9657
9658 if (!cop->cursor.is_complete()) {
9659 // write out what we have so far
9660 if (cop->temp_cursor.is_initial()) {
11fdf7f2 9661 ceph_assert(!cop->results.started_temp_obj);
7c673cae
FG
9662 cop->results.started_temp_obj = true;
9663 cop->results.temp_oid = generate_temp_object(oid);
9664 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9665 }
9666 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9667 OpContextUPtr ctx = simple_opc_create(tempobc);
9668 if (cop->temp_cursor.is_initial()) {
9669 ctx->new_temp_oid = cop->results.temp_oid;
9670 }
9671 _write_copy_chunk(cop, ctx->op_t.get());
9672 simple_opc_submit(std::move(ctx));
9673 dout(10) << __func__ << " fetching more" << dendl;
9674 _copy_some(cobc, cop);
9675 return;
9676 }
9677
9678 // verify digests?
9679 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9680 dout(20) << __func__ << std::hex
9681 << " got digest: rx data 0x" << cop->results.data_digest
9682 << " omap 0x" << cop->results.omap_digest
9683 << ", source: data 0x" << cop->results.source_data_digest
9684 << " omap 0x" << cop->results.source_omap_digest
9685 << std::dec
9686 << " flags " << cop->results.flags
9687 << dendl;
9688 }
9689 if (cop->results.is_data_digest() &&
9690 cop->results.data_digest != cop->results.source_data_digest) {
9691 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9692 << " != source 0x" << cop->results.source_data_digest << std::dec
9693 << dendl;
9694 osd->clog->error() << info.pgid << " copy from " << cop->src
9695 << " to " << cop->obc->obs.oi.soid << std::hex
9696 << " data digest 0x" << cop->results.data_digest
9697 << " != source 0x" << cop->results.source_data_digest
9698 << std::dec;
9699 r = -EIO;
9700 goto out;
9701 }
9702 if (cop->results.is_omap_digest() &&
9703 cop->results.omap_digest != cop->results.source_omap_digest) {
9704 derr << __func__ << std::hex
9705 << " omap digest 0x" << cop->results.omap_digest
9706 << " != source 0x" << cop->results.source_omap_digest
9707 << std::dec << dendl;
9708 osd->clog->error() << info.pgid << " copy from " << cop->src
9709 << " to " << cop->obc->obs.oi.soid << std::hex
9710 << " omap digest 0x" << cop->results.omap_digest
9711 << " != source 0x" << cop->results.source_omap_digest
9712 << std::dec;
9713 r = -EIO;
9714 goto out;
9715 }
9716 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9717 derr << __func__ << " injecting copyfrom failure" << dendl;
9718 r = -EIO;
9719 goto out;
9720 }
9721
9722 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9723 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9724 ObjectState& obs = cop->obc->obs;
9725 if (cop->temp_cursor.is_initial()) {
9726 dout(20) << "fill_in_final_tx: writing "
9727 << "directly to final object" << dendl;
9728 // write directly to final object
9729 cop->results.temp_oid = obs.oi.soid;
9730 _write_copy_chunk(cop, t);
9731 } else {
9732 // finish writing to temp object, then move into place
9733 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
f67539c2
TL
9734 if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
9735 /* In redirect manifest case, the object exists in the upper tier.
9736 * So, to avoid a conflict when rename() is called, remove existing
9737 * object first
9738 */
9739 t->remove(obs.oi.soid);
9740 }
7c673cae
FG
9741 _write_copy_chunk(cop, t);
9742 t->rename(obs.oi.soid, cop->results.temp_oid);
9743 }
9744 t->setattrs(obs.oi.soid, cop->results.attrs);
9745 });
9746
9747 dout(20) << __func__ << " success; committing" << dendl;
9748
9749 out:
9750 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9751 CopyCallbackResults results(r, &cop->results);
9752 cop->cb->complete(results);
9753
9754 copy_ops.erase(cobc->obs.oi.soid);
9755 cobc->stop_block();
9756
9757 if (r < 0 && cop->results.started_temp_obj) {
9758 dout(10) << __func__ << " deleting partial temp object "
9759 << cop->results.temp_oid << dendl;
9760 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9761 OpContextUPtr ctx = simple_opc_create(tempobc);
9762 ctx->op_t->remove(cop->results.temp_oid);
9763 ctx->discard_temp_oid = cop->results.temp_oid;
9764 simple_opc_submit(std::move(ctx));
9765 }
9766
9767 // cancel and requeue proxy ops on this object
9768 if (!r) {
11fdf7f2
TL
9769 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9770 }
9771
9772 kick_object_context_blocked(cobc);
9773}
9774
9775void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9776{
9777 dout(10) << __func__ << " " << oid << " tid " << tid
9778 << " " << cpp_strerror(r) << dendl;
9779 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9780 if (p == copy_ops.end()) {
9781 dout(10) << __func__ << " no copy_op found" << dendl;
9782 return;
9783 }
9784 CopyOpRef obj_cop = p->second;
9785 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9786
9787 if (tid != chunk_cop->objecter_tid) {
9788 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9789 << " tid " << chunk_cop->objecter_tid << dendl;
9790 return;
9791 }
9792
9793 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9794 r = -EOPNOTSUPP;
9795 }
9796
9797 chunk_cop->objecter_tid = 0;
9798 chunk_cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
9799 ObjectContextRef& cobc = obj_cop->obc;
9800 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9801
9802 if (r < 0) {
9803 obj_cop->failed = true;
9804 goto out;
f67539c2 9805 }
11fdf7f2
TL
9806
9807 if (obj_cop->failed) {
9808 return;
f67539c2 9809 }
11fdf7f2
TL
9810 if (!chunk_data.outdata.length()) {
9811 r = -EIO;
9812 obj_cop->failed = true;
9813 goto out;
9814 }
9815
9816 obj_cop->num_chunk--;
9817
9818 /* check all of the copyop are completed */
9819 if (obj_cop->num_chunk) {
9820 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9821 return;
9822 }
9823
9824 {
9825 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9826 if (!ctx->lock_manager.take_write_lock(
9827 obj_cop->obc->obs.oi.soid,
9828 obj_cop->obc)) {
f67539c2
TL
9829 // recovery op can take read lock.
9830 // so need to wait for recovery completion
11fdf7f2
TL
9831 r = -EAGAIN;
9832 obj_cop->failed = true;
9833 close_op_ctx(ctx.release());
9834 goto out;
7c673cae 9835 }
11fdf7f2
TL
9836 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9837
9838 PGTransaction *t = ctx->op_t.get();
9839 ObjectState& obs = ctx->new_obs;
9840 for (auto p : obj_cop->chunk_cops) {
9841 OSDOp &sub_chunk = p.second->chunk_ops[0];
9842 t->write(cobc->obs.oi.soid,
9843 p.second->cursor.data_offset,
9844 sub_chunk.outdata.length(),
9845 sub_chunk.outdata,
9846 p.second->dest_obj_fadvise_flags);
f67539c2 9847 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
11fdf7f2
TL
9848 << " length: " << sub_chunk.outdata.length() << dendl;
9849 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9850 p.second->cursor.data_offset, sub_chunk.outdata.length());
f67539c2 9851 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9f95a23c 9852 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
11fdf7f2
TL
9853 sub_chunk.outdata.clear();
9854 }
9855 obs.oi.clear_data_digest();
f67539c2 9856 ctx->at_version = get_next_version();
11fdf7f2
TL
9857 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9858 simple_opc_submit(std::move(ctx));
20effc67 9859 obj_cop->chunk_cops.clear();
11fdf7f2
TL
9860
9861 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9862 /* check remaining work */
9863 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
20effc67 9864 if (obj_cop->last_offset < p->first) {
11fdf7f2
TL
9865 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9866 if (obj_cop->last_offset < en.first) {
9867 _copy_some_manifest(cobc, obj_cop, en.first);
9868 return;
9869 }
9870 }
7c673cae
FG
9871 }
9872 }
11fdf7f2
TL
9873 }
9874
9875 out:
9876 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9877 CopyCallbackResults results(r, &obj_cop->results);
9878 obj_cop->cb->complete(results);
9879
9880 copy_ops.erase(cobc->obs.oi.soid);
9881 cobc->stop_block();
9882
9883 // cancel and requeue proxy ops on this object
9884 if (!r) {
9885 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
7c673cae
FG
9886 }
9887
9888 kick_object_context_blocked(cobc);
9889}
9890
94b18763
FG
9891void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9892 vector<ceph_tid_t> tids;
9893 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9894 it != proxyread_ops.end();) {
9895 if (it->second->soid == oid) {
9896 cancel_proxy_read((it++)->second, &tids);
9897 } else {
9898 ++it;
9899 }
9900 }
9901 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9902 it != proxywrite_ops.end();) {
9903 if (it->second->soid == oid) {
9904 cancel_proxy_write((it++)->second, &tids);
9905 } else {
9906 ++it;
9907 }
9908 }
9909 osd->objecter->op_cancel(tids, -ECANCELED);
9910 kick_proxy_ops_blocked(oid);
9911}
9912
7c673cae
FG
9913void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9914{
9915 dout(20) << __func__ << " " << cop
9916 << " " << cop->attrs.size() << " attrs"
9917 << " " << cop->data.length() << " bytes"
9918 << " " << cop->omap_header.length() << " omap header bytes"
9919 << " " << cop->omap_data.length() << " omap data bytes"
9920 << dendl;
9921 if (!cop->temp_cursor.attr_complete) {
9922 t->create(cop->results.temp_oid);
9923 }
9924 if (!cop->temp_cursor.data_complete) {
11fdf7f2 9925 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
7c673cae 9926 cop->cursor.data_offset);
11fdf7f2 9927 if (pool.info.required_alignment() &&
7c673cae
FG
9928 !cop->cursor.data_complete) {
9929 /**
9930 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9931 * to pick it up on the next pass.
9932 */
11fdf7f2 9933 ceph_assert(cop->temp_cursor.data_offset %
7c673cae
FG
9934 pool.info.required_alignment() == 0);
9935 if (cop->data.length() % pool.info.required_alignment() != 0) {
9936 uint64_t to_trim =
9937 cop->data.length() % pool.info.required_alignment();
9938 bufferlist bl;
9939 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9940 cop->data.swap(bl);
9941 cop->cursor.data_offset -= to_trim;
11fdf7f2 9942 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
7c673cae
FG
9943 cop->cursor.data_offset);
9944 }
9945 }
9946 if (cop->data.length()) {
9947 t->write(
9948 cop->results.temp_oid,
9949 cop->temp_cursor.data_offset,
9950 cop->data.length(),
9951 cop->data,
9952 cop->dest_obj_fadvise_flags);
9953 }
9954 cop->data.clear();
9955 }
9956 if (pool.info.supports_omap()) {
9957 if (!cop->temp_cursor.omap_complete) {
9958 if (cop->omap_header.length()) {
9959 t->omap_setheader(
9960 cop->results.temp_oid,
9961 cop->omap_header);
9962 cop->omap_header.clear();
9963 }
9964 if (cop->omap_data.length()) {
9965 map<string,bufferlist> omap;
11fdf7f2
TL
9966 bufferlist::const_iterator p = cop->omap_data.begin();
9967 decode(omap, p);
7c673cae
FG
9968 t->omap_setkeys(cop->results.temp_oid, omap);
9969 cop->omap_data.clear();
9970 }
9971 }
9972 } else {
11fdf7f2
TL
9973 ceph_assert(cop->omap_header.length() == 0);
9974 ceph_assert(cop->omap_data.length() == 0);
7c673cae
FG
9975 }
9976 cop->temp_cursor = cop->cursor;
9977}
9978
c07f9fc5 9979void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
7c673cae 9980{
c07f9fc5 9981 OpContext *ctx = cb->ctx;
7c673cae 9982 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
7c673cae 9983
c07f9fc5 9984 ObjectState& obs = ctx->new_obs;
7c673cae
FG
9985 if (obs.exists) {
9986 dout(20) << __func__ << ": exists, removing" << dendl;
9987 ctx->op_t->remove(obs.oi.soid);
9988 } else {
9989 ctx->delta_stats.num_objects++;
9990 obs.exists = true;
9991 }
9992 if (cb->is_temp_obj_used()) {
9993 ctx->discard_temp_oid = cb->results->temp_oid;
9994 }
9995 cb->results->fill_in_final_tx(ctx->op_t.get());
9996
9997 // CopyFromCallback fills this in for us
9998 obs.oi.user_version = ctx->user_at_version;
9999
28e407b8
AA
10000 if (cb->results->is_data_digest()) {
10001 obs.oi.set_data_digest(cb->results->data_digest);
10002 } else {
10003 obs.oi.clear_data_digest();
10004 }
10005 if (cb->results->is_omap_digest()) {
10006 obs.oi.set_omap_digest(cb->results->omap_digest);
10007 } else {
10008 obs.oi.clear_omap_digest();
10009 }
7c673cae 10010
9f95a23c
TL
10011 obs.oi.truncate_seq = cb->truncate_seq;
10012 obs.oi.truncate_size = cb->truncate_size;
10013
10014 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
10015 ctx->mtime = utime_t();
7c673cae
FG
10016
10017 ctx->extra_reqids = cb->results->reqids;
11fdf7f2 10018 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
7c673cae
FG
10019
10020 // cache: clear whiteout?
10021 if (obs.oi.is_whiteout()) {
10022 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
10023 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
10024 --ctx->delta_stats.num_whiteouts;
10025 }
10026
10027 if (cb->results->has_omap) {
10028 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
10029 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9f95a23c 10030 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
10031 } else {
10032 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
10033 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10034 }
10035
10036 interval_set<uint64_t> ch;
10037 if (obs.oi.size > 0)
10038 ch.insert(0, obs.oi.size);
10039 ctx->modified_ranges.union_of(ch);
9f95a23c 10040 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
7c673cae
FG
10041
10042 if (cb->get_data_size() != obs.oi.size) {
10043 ctx->delta_stats.num_bytes -= obs.oi.size;
10044 obs.oi.size = cb->get_data_size();
10045 ctx->delta_stats.num_bytes += obs.oi.size;
10046 }
10047 ctx->delta_stats.num_wr++;
11fdf7f2 10048 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
7c673cae
FG
10049
10050 osd->logger->inc(l_osd_copyfrom);
10051}
10052
10053void PrimaryLogPG::finish_promote(int r, CopyResults *results,
10054 ObjectContextRef obc)
10055{
10056 const hobject_t& soid = obc->obs.oi.soid;
10057 dout(10) << __func__ << " " << soid << " r=" << r
10058 << " uv" << results->user_version << dendl;
10059
10060 if (r == -ECANCELED) {
10061 return;
10062 }
10063
10064 if (r != -ENOENT && soid.is_snap()) {
10065 if (results->snaps.empty()) {
9f95a23c
TL
10066 // we must have read "snap" content from the head object in the
10067 // base pool. use snap_seq to construct what snaps should be
10068 // for this clone (what is was before we evicted the clean clone
10069 // from this pool, and what it will be when we flush and the
10070 // clone eventually happens in the base pool). we want to use
10071 // snaps in (results->snap_seq,soid.snap]
7c673cae 10072 SnapSet& snapset = obc->ssc->snapset;
9f95a23c
TL
10073 for (auto p = snapset.clone_snaps.rbegin();
10074 p != snapset.clone_snaps.rend();
10075 ++p) {
10076 for (auto snap : p->second) {
10077 if (snap > soid.snap) {
10078 continue;
10079 }
10080 if (snap <= results->snap_seq) {
10081 break;
10082 }
10083 results->snaps.push_back(snap);
10084 }
7c673cae
FG
10085 }
10086 }
10087
10088 dout(20) << __func__ << " snaps " << results->snaps << dendl;
10089 filter_snapc(results->snaps);
10090
10091 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
10092 if (results->snaps.empty()) {
10093 dout(20) << __func__
10094 << " snaps are empty, clone is invalid,"
10095 << " setting r to ENOENT" << dendl;
10096 r = -ENOENT;
10097 }
10098 }
10099
10100 if (r < 0 && results->started_temp_obj) {
10101 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
10102 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
11fdf7f2 10103 ceph_assert(tempobc);
7c673cae
FG
10104 OpContextUPtr ctx = simple_opc_create(tempobc);
10105 ctx->op_t->remove(results->temp_oid);
10106 simple_opc_submit(std::move(ctx));
10107 results->started_temp_obj = false;
10108 }
10109
10110 if (r == -ENOENT && soid.is_snap()) {
10111 dout(10) << __func__
10112 << ": enoent while trying to promote clone, " << soid
10113 << " must have been trimmed, removing from snapset"
10114 << dendl;
10115 hobject_t head(soid.get_head());
10116 ObjectContextRef obc = get_object_context(head, false);
11fdf7f2 10117 ceph_assert(obc);
7c673cae
FG
10118
10119 OpContextUPtr tctx = simple_opc_create(obc);
10120 tctx->at_version = get_next_version();
9f95a23c
TL
10121 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
10122 filter_snapc(tctx->new_snapset.snaps);
10123 } else {
10124 tctx->new_snapset.snaps.clear();
10125 }
7c673cae
FG
10126 vector<snapid_t> new_clones;
10127 map<snapid_t, vector<snapid_t>> new_clone_snaps;
10128 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
10129 i != tctx->new_snapset.clones.end();
10130 ++i) {
10131 if (*i != soid.snap) {
10132 new_clones.push_back(*i);
10133 auto p = tctx->new_snapset.clone_snaps.find(*i);
10134 if (p != tctx->new_snapset.clone_snaps.end()) {
10135 new_clone_snaps[*i] = p->second;
10136 }
10137 }
10138 }
10139 tctx->new_snapset.clones.swap(new_clones);
10140 tctx->new_snapset.clone_overlap.erase(soid.snap);
10141 tctx->new_snapset.clone_size.erase(soid.snap);
10142 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
10143
10144 // take RWWRITE lock for duration of our local write. ignore starvation.
10145 if (!tctx->lock_manager.take_write_lock(
10146 head,
10147 obc)) {
11fdf7f2 10148 ceph_abort_msg("problem!");
7c673cae
FG
10149 }
10150 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10151
10152 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10153
10154 simple_opc_submit(std::move(tctx));
10155 return;
10156 }
10157
10158 bool whiteout = false;
10159 if (r == -ENOENT) {
11fdf7f2 10160 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
7c673cae
FG
10161 dout(10) << __func__ << " whiteout " << soid << dendl;
10162 whiteout = true;
10163 }
10164
10165 if (r < 0 && !whiteout) {
10166 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10167 // pass error to everyone blocked on this object
10168 // FIXME: this is pretty sloppy, but at this point we got
10169 // something unexpected and don't have many other options.
10170 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10171 waiting_for_blocked_object.find(soid);
10172 if (blocked_iter != waiting_for_blocked_object.end()) {
10173 while (!blocked_iter->second.empty()) {
10174 osd->reply_op_error(blocked_iter->second.front(), r);
10175 blocked_iter->second.pop_front();
10176 }
10177 waiting_for_blocked_object.erase(blocked_iter);
10178 }
10179 return;
10180 }
10181
10182 osd->promote_finish(results->object_size);
10183
10184 OpContextUPtr tctx = simple_opc_create(obc);
10185 tctx->at_version = get_next_version();
10186
11fdf7f2
TL
10187 if (!obc->obs.oi.has_manifest()) {
10188 ++tctx->delta_stats.num_objects;
10189 }
7c673cae
FG
10190 if (soid.snap < CEPH_NOSNAP)
10191 ++tctx->delta_stats.num_object_clones;
10192 tctx->new_obs.exists = true;
10193
10194 tctx->extra_reqids = results->reqids;
11fdf7f2 10195 tctx->extra_reqid_return_codes = results->reqid_return_codes;
7c673cae 10196
f67539c2
TL
10197 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
10198 tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
10199 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
10200 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
10201 tctx->new_obs.oi.manifest.redirect_target = hobject_t();
10202 tctx->delta_stats.num_objects_manifest--;
10203 if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
10204 dec_all_refcount_manifest(obc->obs.oi, tctx.get());
10205 }
10206 }
10207
7c673cae
FG
10208 if (whiteout) {
10209 // create a whiteout
10210 tctx->op_t->create(soid);
10211 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
10212 ++tctx->delta_stats.num_whiteouts;
10213 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
10214 osd->logger->inc(l_osd_tier_whiteout);
10215 } else {
10216 if (results->has_omap) {
10217 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
10218 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
10219 ++tctx->delta_stats.num_objects_omap;
10220 }
10221
10222 results->fill_in_final_tx(tctx->op_t.get());
10223 if (results->started_temp_obj) {
10224 tctx->discard_temp_oid = results->temp_oid;
10225 }
10226 tctx->new_obs.oi.size = results->object_size;
10227 tctx->new_obs.oi.user_version = results->user_version;
9f95a23c
TL
10228 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
10229 tctx->mtime = utime_t();
28e407b8 10230 if (results->is_data_digest()) {
7c673cae 10231 tctx->new_obs.oi.set_data_digest(results->data_digest);
28e407b8
AA
10232 } else {
10233 tctx->new_obs.oi.clear_data_digest();
10234 }
9f95a23c
TL
10235 if (results->object_size)
10236 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
28e407b8 10237 if (results->is_omap_digest()) {
7c673cae 10238 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
28e407b8
AA
10239 } else {
10240 tctx->new_obs.oi.clear_omap_digest();
10241 }
9f95a23c
TL
10242 if (results->has_omap)
10243 tctx->clean_regions.mark_omap_dirty();
7c673cae
FG
10244 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
10245 tctx->new_obs.oi.truncate_size = results->truncate_size;
10246
10247 if (soid.snap != CEPH_NOSNAP) {
11fdf7f2
TL
10248 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
10249 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
10250 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
7c673cae 10251 results->object_size);
11fdf7f2 10252 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
7c673cae
FG
10253
10254 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
10255 } else {
10256 tctx->delta_stats.num_bytes += results->object_size;
10257 }
10258 }
10259
10260 if (results->mirror_snapset) {
11fdf7f2 10261 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
7c673cae
FG
10262 tctx->new_snapset.from_snap_set(
10263 results->snapset,
9f95a23c 10264 get_osdmap()->require_osd_release < ceph_release_t::luminous);
7c673cae 10265 }
7c673cae
FG
10266 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
10267
10268 // take RWWRITE lock for duration of our local write. ignore starvation.
10269 if (!tctx->lock_manager.take_write_lock(
10270 obc->obs.oi.soid,
10271 obc)) {
11fdf7f2 10272 ceph_abort_msg("problem!");
7c673cae
FG
10273 }
10274 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10275
10276 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10277
10278 simple_opc_submit(std::move(tctx));
10279
10280 osd->logger->inc(l_osd_tier_promote);
10281
10282 if (agent_state &&
10283 agent_state->is_idle())
10284 agent_choose_mode();
10285}
10286
11fdf7f2
TL
10287void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
10288 ObjectContextRef obc)
10289{
10290 const hobject_t& soid = obc->obs.oi.soid;
10291 dout(10) << __func__ << " " << soid << " r=" << r
10292 << " uv" << results->user_version << dendl;
10293
10294 if (r == -ECANCELED || r == -EAGAIN) {
10295 return;
10296 }
10297
10298 if (r < 0) {
10299 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10300 // pass error to everyone blocked on this object
10301 // FIXME: this is pretty sloppy, but at this point we got
10302 // something unexpected and don't have many other options.
10303 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10304 waiting_for_blocked_object.find(soid);
10305 if (blocked_iter != waiting_for_blocked_object.end()) {
10306 while (!blocked_iter->second.empty()) {
10307 osd->reply_op_error(blocked_iter->second.front(), r);
10308 blocked_iter->second.pop_front();
10309 }
10310 waiting_for_blocked_object.erase(blocked_iter);
10311 }
10312 return;
10313 }
f67539c2 10314
11fdf7f2
TL
10315 osd->promote_finish(results->object_size);
10316 osd->logger->inc(l_osd_tier_promote);
10317
10318 if (agent_state &&
10319 agent_state->is_idle())
10320 agent_choose_mode();
10321}
10322
94b18763
FG
// Tear down an in-flight copy operation.
// Collects any outstanding objecter tids into *tids (the caller cancels
// them), unblocks and kicks waiters on the object, and completes the copy
// callback with -ECANCELED.  `requeue` is propagated to the callback via
// results.should_requeue so blocked client ops can be retried.
void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
			       vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << " " << cop->obc->obs.oi.soid
	   << " from " << cop->src << " " << cop->oloc
	   << " v" << cop->results.user_version << dendl;

  // cancel objecter op, if we can
  if (cop->objecter_tid) {
    tids->push_back(cop->objecter_tid);
    cop->objecter_tid = 0;
    if (cop->objecter_tid2) {
      tids->push_back(cop->objecter_tid2);
      cop->objecter_tid2 = 0;
    }
  }

  // drop the tracking entry before waking anyone, so a re-entrant lookup
  // does not find the dead op
  copy_ops.erase(cop->obc->obs.oi.soid);
  cop->obc->stop_block();

  kick_object_context_blocked(cop->obc);
  cop->results.should_requeue = requeue;
  CopyCallbackResults result(-ECANCELED, &cop->results);
  cop->cb->complete(result);

  // There may still be an objecter callback referencing this copy op.
  // That callback will not need the obc since it's been canceled, and
  // we need the obc reference to go away prior to flush.
  cop->obc = ObjectContextRef();
}
10353
94b18763 10354void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
10355{
10356 dout(10) << __func__ << dendl;
10357 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
10358 while (p != copy_ops.end()) {
10359 // requeue this op? can I queue up all of them?
94b18763 10360 cancel_copy((p++)->second, requeue, tids);
7c673cae
FG
10361 }
10362}
10363
20effc67
TL
// Completion for a cls "gather" started by start_cls_gather(): fires once
// all sub-reads have finished, then re-executes the original op context so
// the cls call can run with the gathered buffers in place.
struct C_gather : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;              // target object the gather was started for
  epoch_t last_peering_reset; // staleness guard captured at issue time
  OSDOp *osd_op;              // the sub-op whose rval receives the gather result
  C_gather(PrimaryLogPG *pg_, hobject_t oid_, epoch_t lpr_, OSDOp *osd_op_) :
    pg(pg_), oid(oid_), last_peering_reset(lpr_), osd_op(osd_op_) {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    // take the PG lock before touching PG state
    std::scoped_lock locker{*pg};
    auto p = pg->cls_gather_ops.find(oid);
    if (p == pg->cls_gather_ops.end()) {
      // op was cancelled
      return;
    }
    if (last_peering_reset != pg->get_last_peering_reset()) {
      // PG re-peered since the gather was issued; the result is stale
      return;
    }
    osd_op->rval = r;
    PrimaryLogPG::OpContext *ctx = p->second.ctx;
    pg->cls_gather_ops.erase(p);
    pg->execute_ctx(ctx);
  }
};
10389
// Kick off a cls "gather": for each source object named in *src_obj_buffs,
// issue an objecter read that invokes cls `cls`::`method` with `inbl` on
// that object in `pool`, collecting each reply into the corresponding
// bufferlist.  A C_gather completion re-runs the op context once every
// sub-read finishes.  Returns -EINPROGRESS; the op completes asynchronously.
int PrimaryLogPG::start_cls_gather(OpContext *ctx, std::map<std::string, bufferlist> *src_obj_buffs, const std::string& pool,
				   const char *cls, const char *method, bufferlist& inbl)
{
  OpRequestRef op = ctx->op;
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());

  // resolve the source pool name to an id for the object locator
  auto pool_id = osd->objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name), pool);
  object_locator_t oloc(pool_id);

  ObjectState& obs = ctx->new_obs;
  object_info_t& oi = obs.oi;
  const hobject_t& soid = oi.soid;

  ObjectContextRef obc = get_object_context(soid, false);
  C_GatherBuilder gather(cct);

  // track the gather so it can be found (or cancelled) later; a duplicate
  // entry for the same target would be a logic error
  auto [iter, inserted] = cls_gather_ops.emplace(soid, CLSGatherOp(ctx, obc, op));
  ceph_assert(inserted);
  auto &cgop = iter->second;
  for (std::map<std::string, bufferlist>::iterator it = src_obj_buffs->begin(); it != src_obj_buffs->end(); it++) {
    std::string oid = it->first;
    ObjectOperation obj_op;
    obj_op.call(cls, method, inbl);
    uint32_t flags = 0;
    // each read fills the caller-provided bufferlist for this source object
    ceph_tid_t tid = osd->objecter->read(
      object_t(oid), oloc, obj_op,
      m->get_snapid(), &it->second,
      flags, gather.new_sub());
    cgop.objecter_tids.push_back(tid);
    dout(10) << __func__ << " src=" << oid << ", tgt=" << soid << dendl;
  }

  // fire C_gather (via the objecter finisher) when all sub-reads complete
  C_gather *fin = new C_gather(this, soid, get_last_peering_reset(), &(*ctx->ops)[ctx->current_osd_subop_num]);
  gather.set_finisher(new C_OnFinisher(fin,
				       osd->get_objecter_finisher(get_pg_shard())));
  gather.activate();

  return -EINPROGRESS;
}
7c673cae
FG
10429
10430// ========================================================================
10431// flush
10432//
10433// Flush a dirty object in the cache tier by writing it back to the
10434// base tier. The sequence looks like:
10435//
10436// * send a copy-from operation to the base tier to copy the current
10437// version of the object
10438// * base tier will pull the object via (perhaps multiple) copy-get(s)
10439// * on completion, we check if the object has been modified. if so,
10440// just reply with -EAGAIN.
10441// * try to take a write lock so we can clear the dirty flag. if this
10442// fails, wait and retry
10443// * start a repop that clears the bit.
10444//
10445// If we have to wait, we will retry by coming back through the
10446// start_flush method. We check if a flush is already in progress
10447// and, if so, try to finish it by rechecking the version and trying
10448// to clear the dirty bit.
10449//
10450// In order for the cache-flush (a write op) to not block the copy-get
10451// from reading the object, the client *must* set the SKIPRWLOCKS
10452// flag.
10453//
10454// NOTE: normally writes are strictly ordered for the client, but
10455// flushes are special in that they can be reordered with respect to
10456// other writes. In particular, we can't have a flush request block
10457// an update to the cache pool object!
10458
// Completion for a cache-tier flush issued to the base tier.
// Re-takes the PG lock, verifies the PG has not re-peered since the flush
// was started, and then finishes the flush and records its latency.
struct C_Flush : public Context {
  PrimaryLogPGRef pg;
  hobject_t oid;              // object being flushed
  epoch_t last_peering_reset; // staleness guard captured at issue time
  ceph_tid_t tid;             // objecter tid, filled in after mutate() returns
  utime_t start;              // for flush latency accounting
  C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
    : pg(p), oid(o), last_peering_reset(lpr),
      tid(0), start(ceph_clock_now())
  {}
  void finish(int r) override {
    if (r == -ECANCELED)
      return;
    std::scoped_lock locker{*pg};
    if (last_peering_reset == pg->get_last_peering_reset()) {
      pg->finish_flush(oid, tid, r);
      pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
    }
  }
};
10479
f67539c2
TL
10480int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
10481{
10482 const object_info_t& oi = obc->obs.oi;
10483 const hobject_t& soid = oi.soid;
10484
10485 ceph_assert(obc->is_blocked());
10486 if (oi.size == 0) {
10487 // evicted
10488 return 0;
10489 }
10490 if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10491 dout(0) << " fingerprint algorithm is not set " << dendl;
10492 return -EINVAL;
33c7a0ef
TL
10493 }
10494 if (pool.info.get_dedup_tier() <= 0) {
10495 dout(10) << " dedup tier is not set " << dendl;
10496 return -EINVAL;
10497 }
f67539c2
TL
10498
10499 /*
10500 * The operations to make dedup chunks are tracked by a ManifestOp.
10501 * This op will be finished if all the operations are completed.
10502 */
1e59de90 10503 ManifestOpRef mop(std::make_shared<ManifestOp>(obc, nullptr));
f67539c2
TL
10504
10505 // cdc
10506 std::map<uint64_t, bufferlist> chunks;
10507 int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
10508 if (r < 0) {
10509 return r;
10510 }
10511 if (!chunks.size()) {
10512 return 0;
10513 }
10514
10515 // chunks issued here are different with chunk_map newly generated
10516 // because the same chunks in previous snap will not be issued
10517 // So, we need two data structures; the first is the issued chunk list to track
10518 // issued operations, and the second is the new chunk_map to update chunk_map after
10519 // all operations are finished
10520 object_ref_delta_t refs;
10521 ObjectContextRef obc_l, obc_g;
10522 get_adjacent_clones(obc, obc_l, obc_g);
10523 // skip if the same content exits in prev snap at same offset
10524 mop->new_manifest.calc_refs_to_inc_on_set(
10525 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10526 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10527 refs);
10528
10529 for (auto p : chunks) {
10530 hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
10531 if (refs.find(target) == refs.end()) {
10532 continue;
10533 }
10534 C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
10535 ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
20effc67 10536 fin, std::move(chunks[p.first]));
f67539c2
TL
10537 mop->chunks[target] = make_pair(p.first, p.second.length());
10538 mop->num_chunks++;
10539 mop->tids[p.first] = tid;
10540 fin->tid = tid;
10541 dout(10) << __func__ << " oid: " << soid << " tid: " << tid
10542 << " target: " << target << " offset: " << p.first
10543 << " length: " << p.second.length() << dendl;
10544 }
10545
10546 if (mop->tids.size()) {
10547 manifest_ops[soid] = mop;
10548 manifest_ops[soid]->op = op;
10549 } else {
10550 // size == 0
10551 return 0;
10552 }
10553
10554 return -EINPROGRESS;
10555}
10556
// Content-defined chunking of an object: read the whole object, split it
// with the pool's configured CDC algorithm/chunk size, fingerprint each
// chunk, and fill both the new chunk_map (offset -> chunk_info_t with the
// fingerprint-named target oid) and chunks (offset -> raw data).
// Returns the total chunked length on success, or a negative errno
// (-EINVAL bad algorithm, -EIO short read, read errors passed through).
int PrimaryLogPG::do_cdc(const object_info_t& oi,
			 std::map<uint64_t, chunk_info_t>& chunk_map,
			 std::map<uint64_t, bufferlist>& chunks)
{
  string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
  int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
  uint64_t total_length = 0;

  // CDC::create takes the chunk size as a power-of-two exponent
  std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
  if (!cdc) {
    dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
    return -EINVAL;
  }

  bufferlist bl;
  /**
   * We disable EC pool as a base tier of distributed dedup.
   * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
   * Therefore, we should change the current implementation totally to make EC pool compatible.
   * As s result, we leave this as a future work.
   */
  int r = pgbackend->objects_read_sync(
    oi.soid, 0, oi.size, 0, &bl);
  if (r < 0) {
    dout(0) << __func__ << " read fail " << oi.soid
	    << " len: " << oi.size << " r: " << r << dendl;
    return r;
  }
  if (bl.length() != oi.size) {
    // a partial read would produce a bogus chunking; treat it as I/O error
    dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
	    << oi.size << " during chunking " << dendl;
    return -EIO;
  }

  dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
	   << " oi.size: " << oi.size
	   << " chunk_size: " << chunk_size << dendl;

  vector<pair<uint64_t, uint64_t>> cdc_chunks;
  cdc->calc_chunks(bl, &cdc_chunks);

  // get fingerprint
  for (auto p : cdc_chunks) {
    bufferlist chunk;
    chunk.substr_of(bl, p.first, p.second);
    // name the dedup-tier object after the chunk's fingerprint
    auto [ret, target] = get_fpoid_from_chunk(oi.soid, chunk);
    if (ret < 0) {
      return ret;
    }
    chunks[p.first] = std::move(chunk);
    chunk_map[p.first] = chunk_info_t(0, p.second, target);
    total_length += p.second;
  }
  return total_length;
}
10612
2a845540
TL
// Derive the dedup-tier object (fingerprint oid) for a chunk of data:
// hash the chunk with the pool's fingerprint algorithm and map the
// resulting name into the dedup tier pool.
// Returns {0, target} on success, {-EINVAL, {}} if no fingerprint
// algorithm is configured, or the error from object_locator_to_pg().
std::pair<int, hobject_t> PrimaryLogPG::get_fpoid_from_chunk(
  const hobject_t soid, bufferlist& chunk)
{
  pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
  if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
    return make_pair(-EINVAL, hobject_t());
  }
  // the chunk's object name is simply the hex digest of its contents
  object_t fp_oid = [&fp_algo, &chunk]() -> string {
    switch (fp_algo) {
    case pg_pool_t::TYPE_FINGERPRINT_SHA1:
      return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
    case pg_pool_t::TYPE_FINGERPRINT_SHA256:
      return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
    case pg_pool_t::TYPE_FINGERPRINT_SHA512:
      return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
    default:
      assert(0 == "unrecognized fingerprint type");
      return {};
    }
  }();

  pg_t raw_pg;
  object_locator_t oloc(soid);
  oloc.pool = pool.info.get_dedup_tier();
  // check if dedup_tier isn't set
  ceph_assert(oloc.pool > 0);
  int ret = get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
  if (ret < 0) {
    return make_pair(ret, hobject_t());
  }
  // snapid_t() (== head) — dedup chunks are not snapshotted objects
  hobject_t target(fp_oid, oloc.key, snapid_t(),
		   raw_pg.ps(), raw_pg.pool(),
		   oloc.nspace);
  return make_pair(0, target);
}
10648
// Per-chunk completion of the dedup started by start_dedup().  Records the
// result for this chunk's offset; once all chunks have reported, unblocks
// the object and either fails the client op (any chunk error is flagged by
// marking results[0]) or commits a new CHUNKED manifest: drop refs to the
// old chunks on commit, install the new chunk_map, and clear DIRTY.
// Returns -EINVAL (unknown op), -EINPROGRESS (chunks outstanding),
// -EIO (a chunk failed), -EAGAIN (waiting for write lock), or 0.
int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
  if (p == manifest_ops.end()) {
    dout(10) << __func__ << " no manifest_op found" << dendl;
    return -EINVAL;
  }
  ManifestOpRef mop = p->second;
  mop->results[offset] = r;
  if (r < 0) {
    // if any failure occurs, put a mark on the results to recognize the failure
    mop->results[0] = r;
  }
  if (mop->num_chunks != mop->results.size()) {
    // there are on-going works
    return -EINPROGRESS;
  }
  ObjectContextRef obc = mop->obc;
  ceph_assert(obc);
  ceph_assert(obc->is_blocked());
  // all chunk ops are done: unblock the object before finishing up
  obc->stop_block();
  kick_object_context_blocked(obc);
  if (mop->results[0] < 0) {
    // check if the previous op returns fail
    ceph_assert(mop->num_chunks == mop->results.size());
    manifest_ops.erase(oid);
    osd->reply_op_error(mop->op, mop->results[0]);
    return -EIO;
  }

  if (mop->chunks.size()) {
    OpContextUPtr ctx = simple_opc_create(obc);
    ceph_assert(ctx);
    if (ctx->lock_manager.get_lock_type(
	  RWState::RWWRITE,
	  oid,
	  obc,
	  mop->op)) {
      dout(20) << __func__ << " took write lock" << dendl;
    } else if (mop->op) {
      // the op is now queued on the lock; it will come back through later
      dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
      close_op_ctx(ctx.release());
      return -EAGAIN;
    }

    ctx->at_version = get_next_version();
    ctx->new_obs = obc->obs;
    ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
    --ctx->delta_stats.num_objects_dirty;
    if (!ctx->obs->oi.has_manifest()) {
      // first time this object becomes a manifest object
      ctx->delta_stats.num_objects_manifest++;
      ctx->new_obs.oi.set_flag(object_info_t::FLAG_MANIFEST);
      ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
    }

    /*
     * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
     * head: [0, 2) aaa <-- tier_flush()
     * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
     *
     * In this case, if the new chunk_map is as follows,
     * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
     * we should drop aaa from head by using calc_refs_to_drop_on_removal().
     * So, the procedure is
     * 1. calc_refs_to_drop_on_removal()
     * 2. register old references to drop after tier_flush() is committed
     * 3. update new chunk_map
     */

    ObjectCleanRegions c_regions = ctx->clean_regions;
    ObjectContextRef cobc = get_prev_clone_obc(obc);
    c_regions.mark_fully_dirty();
    // CDC was done on entire range of manifest object,
    // so the first thing we should do here is to drop the reference to old chunks
    ObjectContextRef obc_l, obc_g;
    get_adjacent_clones(obc, obc_l, obc_g);
    // clear all old references
    object_ref_delta_t refs;
    ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
      obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
      obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
      refs);
    if (!refs.is_empty()) {
      // only decrement once the new manifest is durably committed
      ctx->register_on_commit(
	[oid, this, refs](){
	  dec_refcount(oid, refs);
	});
    }

    // set new references
    ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;

    finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
    simple_opc_submit(std::move(ctx));
  }
  if (mop->op)
    osd->reply_op_error(mop->op, r);

  manifest_ops.erase(oid);
  return 0;
}
10752
20effc67
TL
10753int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10754{
10755 dout(10) << __func__ << " " << oid << " tid " << tid
10756 << " " << cpp_strerror(r) << dendl;
10757 map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10758 if (p == manifest_ops.end()) {
10759 dout(10) << __func__ << " no manifest_op found" << dendl;
10760 return -EINVAL;
10761 }
10762 ManifestOpRef mop = p->second;
10763 mop->results[offset] = r;
10764 if (r < 0) {
10765 // if any failure occurs, put a mark on the results to recognize the failure
10766 mop->results[0] = r;
10767 }
10768 if (mop->num_chunks != mop->results.size()) {
10769 // there are on-going works
10770 return -EINPROGRESS;
10771 }
10772
10773 if (mop->cb) {
10774 mop->cb->complete(r);
10775 }
10776
10777 manifest_ops.erase(p);
10778 mop.reset();
10779
10780 return 0;
10781}
10782
7c673cae
FG
// Begin flushing a dirty cache-tier object to the base tier (or, for
// chunked-manifest objects / force_dedup, hand off to start_dedup()).
// See the long comment above this function for the overall protocol.
// Key steps: refuse if an older clone is still dirty; join or cancel any
// existing flush of this object; send a snap-trimming delete (if needed)
// followed by a copy-from to the base tier; track the op in flush_ops.
// Returns -EINPROGRESS on success; -ENOENT/-EBUSY/-EAGAIN/-EOPNOTSUPP or
// a try_flush_mark_clean() result otherwise.
int PrimaryLogPG::start_flush(
  OpRequestRef op, ObjectContextRef obc,
  bool blocking, hobject_t *pmissing,
  std::optional<std::function<void()>> &&on_flush,
  bool force_dedup)
{
  const object_info_t& oi = obc->obs.oi;
  const hobject_t& soid = oi.soid;
  dout(10) << __func__ << " " << soid
	   << " v" << oi.version
	   << " uv" << oi.user_version
	   << " " << (blocking ? "blocking" : "non-blocking/best-effort")
	   << dendl;

  bool preoctopus_compat =
    get_osdmap()->require_osd_release < ceph_release_t::octopus;
  SnapSet snapset;
  if (preoctopus_compat) {
    // for pre-octopus compatibility, filter SnapSet::snaps. not
    // certain we need this, but let's be conservative.
    snapset = obc->ssc->snapset.get_filtered(pool.info);
  } else {
    // NOTE: change this to a const ref when we remove this compat code
    snapset = obc->ssc->snapset;
  }

  if ((obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked())
      || force_dedup) {
    // current dedup tier only supports blocking operation
    if (!blocking) {
      return -EOPNOTSUPP;
    }
  }

  // verify there are no (older) check for dirty clones
  {
    dout(20) << " snapset " << snapset << dendl;
    // find the newest clone older than soid.snap
    vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
    while (p != snapset.clones.rend() && *p >= soid.snap)
      ++p;
    if (p != snapset.clones.rend()) {
      hobject_t next = soid;
      next.snap = *p;
      ceph_assert(next.snap < soid.snap);
      if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
	dout(10) << __func__ << " missing clone is " << next << dendl;
	if (pmissing)
	  *pmissing = next;
	return -ENOENT;
      }
      ObjectContextRef older_obc = get_object_context(next, false);
      if (older_obc) {
	dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
		 << dendl;
	if (older_obc->obs.oi.is_dirty()) {
	  // clones must be flushed oldest-first
	  dout(10) << __func__ << " next oldest clone is dirty: "
		   << older_obc->obs.oi << dendl;
	  return -EBUSY;
	}
      } else {
	dout(20) << __func__ << " next oldest clone " << next
		 << " is not present; implicitly clean" << dendl;
      }
    } else {
      dout(20) << __func__ << " no older clones" << dendl;
    }
  }

  if (blocking) {
    dout(20) << fmt::format("{}: blocking {}", __func__, soid) << dendl;
    obc->start_block();
  }

  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
  if (p != flush_ops.end()) {
    FlushOpRef fop = p->second;
    if (fop->op == op) {
      // we couldn't take the write lock on a cache-try-flush before;
      // now we are trying again for the lock.
      return try_flush_mark_clean(fop);
    }
    if (fop->flushed_version == obc->obs.oi.user_version &&
	(fop->blocking || !blocking)) {
      // nonblocking can join anything
      // blocking can only join a blocking flush
      dout(20) << __func__ << " piggybacking on existing flush " << dendl;
      if (op)
	fop->dup_ops.push_back(op);
      return -EAGAIN;   // clean up this ctx; op will retry later
    }

    // cancel current flush since it will fail anyway, or because we
    // are blocking and the existing flush is nonblocking.
    dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    while (!fop->dup_ops.empty()) {
      osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
      fop->dup_ops.pop_front();
    }
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
  }

  if ((obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked())
      || force_dedup) {
    // chunked-manifest objects are flushed via the dedup machinery
    int r = start_dedup(op, obc);
    if (r != -EINPROGRESS) {
      if (blocking)
	obc->stop_block();
    }
    return r;
  }

  /**
   * In general, we need to send a delete and a copyfrom.
   * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
   * where 4 is marked as clean.  To flush 10, we have to:
   * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
   * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
   *
   * There is a complicating case.  Supposed there had been a clone 7
   * for snaps [7, 6] which has been trimmed since they no longer exist.
   * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head.  When we submit
   * the delete, the snap will be promoted to 5, and the head will become
   * a whiteout.  When the copy-from goes through, we'll end up with
   * 8:[8,4,3,2]:[4(4,3,2)]+head.
   *
   * Another complication is the case where there is an interval change
   * after doing the delete and the flush but before marking the object
   * clean.  We'll happily delete head and then recreate it at the same
   * sequence number, which works out ok.
   */

  SnapContext snapc, dsnapc;
  if (snapset.seq != 0) {
    if (soid.snap == CEPH_NOSNAP) {
      snapc = snapset.get_ssc_as_of(snapset.seq);
    } else {
      snapid_t min_included_snap;
      auto p = snapset.clone_snaps.find(soid.snap);
      ceph_assert(p != snapset.clone_snaps.end());
      min_included_snap = p->second.back();
      snapc = snapset.get_ssc_as_of(min_included_snap - 1);
    }

    snapid_t prev_snapc = 0;
    for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
	 citer != snapset.clones.rend();
	 ++citer) {
      if (*citer < soid.snap) {
	prev_snapc = *citer;
	break;
      }
    }

    dsnapc = snapset.get_ssc_as_of(prev_snapc);
  }

  object_locator_t base_oloc(soid);
  base_oloc.pool = pool.info.tier_of;

  if (dsnapc.seq < snapc.seq) {
    // delete the base-tier object in the older snap context first (step 1
    // of the protocol above)
    ObjectOperation o;
    o.remove();
    osd->objecter->mutate(
      soid.oid,
      base_oloc,
      o,
      dsnapc,
      ceph::real_clock::from_ceph_timespec(oi.mtime),
      (CEPH_OSD_FLAG_IGNORE_OVERLAY |
       CEPH_OSD_FLAG_ENFORCE_SNAPC),
      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
  }

  FlushOpRef fop(std::make_shared<FlushOp>());
  fop->obc = obc;
  fop->flushed_version = oi.user_version;
  fop->blocking = blocking;
  fop->on_flush = std::move(on_flush);
  fop->op = op;

  ObjectOperation o;
  if (oi.is_whiteout()) {
    fop->removal = true;
    o.remove();
  } else {
    object_locator_t oloc(soid);
    o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
		CEPH_OSD_COPY_FROM_FLAG_FLUSH |
		CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
		CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
		CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
		LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);

    //mean the base tier don't cache data after this
    if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
      o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
  }
  C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());

  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, base_oloc, o, snapc,
    ceph::real_clock::from_ceph_timespec(oi.mtime),
    CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
    new C_OnFinisher(fin,
		     osd->get_objecter_finisher(get_pg_shard())));
  /* we're under the pg lock and fin->finish() is grabbing that */
  fin->tid = tid;
  fop->objecter_tid = tid;

  flush_ops[soid] = fop;

  // account the flush in the PG stats
  recovery_state.update_stats(
    [&oi](auto &history, auto &stats) {
      stats.stats.sum.num_flush++;
      stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
      return false;
    });
  return -EINPROGRESS;
}
11006
// Completion handler for a flush to the base tier (called from C_Flush
// under the PG lock).  Matches the completion to the tracked FlushOp,
// and on error fails/unblocks waiters; on success (or benign -ENOENT of a
// whiteout removal) tries to mark the object clean.
void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
	   << " " << cpp_strerror(r) << dendl;
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
  if (p == flush_ops.end()) {
    dout(10) << __func__ << " no flush_op found" << dendl;
    return;
  }
  FlushOpRef fop = p->second;
  // stale completion from a superseded flush; manifest flushes use
  // multiple tids, so the tid check only applies to non-manifest objects
  if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
    dout(10) << __func__ << " tid " << tid << " != fop " << fop
	     << " tid " << fop->objecter_tid << dendl;
    return;
  }
  ObjectContextRef obc = fop->obc;
  fop->objecter_tid = 0;

  // -ENOENT is expected (and fine) when the flush was a whiteout removal
  if (r < 0 && !(r == -ENOENT && fop->removal)) {
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    if (fop->blocking) {
      obc->stop_block();
      kick_object_context_blocked(obc);
    }

    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    }
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    return;
  }

  r = try_flush_mark_clean(fop);
  if (r == -EBUSY && fop->op) {
    osd->reply_op_error(fop->op, r);
  }
}
11050
// After a successful flush, try to clear the object's DIRTY flag (and, for
// manifest objects, truncate/mark chunks accordingly).  Handles the racy
// cases: the object changed or vanished mid-flush (-EBUSY), scrub blocks
// the write (-EAGAIN/-ECANCELED), the write lock isn't immediately
// available (-EAGAIN waits, -ECANCELED gives up), or the tier agent evicts
// the now-clean object outright (0).  Returns -EINPROGRESS once the
// clear-dirty repop has been submitted.
int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
{
  ObjectContextRef obc = fop->obc;
  const hobject_t& oid = obc->obs.oi.soid;

  if (fop->blocking) {
    obc->stop_block();
    kick_object_context_blocked(obc);
  }

  // the object was updated (or deleted) while the flush was in flight:
  // the flushed data is stale, so the flush fails
  if (fop->flushed_version != obc->obs.oi.user_version ||
      !obc->obs.exists) {
    if (obc->obs.exists)
      dout(10) << __func__ << " flushed_version " << fop->flushed_version
	       << " != current " << obc->obs.oi.user_version
	       << dendl;
    else
      dout(10) << __func__ << " object no longer exists" << dendl;

    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    }
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    if (fop->blocking)
      osd->logger->inc(l_osd_tier_flush_fail);
    else
      osd->logger->inc(l_osd_tier_try_flush_fail);
    return -EBUSY;
  }

  if (!fop->blocking &&
      m_scrubber->write_blocked_by_scrub(oid)) {
    if (fop->op) {
      dout(10) << __func__ << " blocked by scrub" << dendl;
      requeue_op(fop->op);
      requeue_ops(fop->dup_ops);
      return -EAGAIN;    // will retry
    } else {
      // agent-initiated try-flush with no client op: just give up
      osd->logger->inc(l_osd_tier_try_flush_fail);
      vector<ceph_tid_t> tids;
      cancel_flush(fop, false, &tids);
      osd->objecter->op_cancel(tids, -ECANCELED);
      return -ECANCELED;
    }
  }

  // successfully flushed, can we evict this object?
  if (!obc->obs.oi.has_manifest() && !fop->op &&
      agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
      agent_maybe_evict(obc, true)) {
    osd->logger->inc(l_osd_tier_clean);
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    return 0;
  }

  dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
  OpContextUPtr ctx = simple_opc_create(fop->obc);

  // successfully flushed; can we clear the dirty bit?
  // try to take the lock manually, since we don't
  // have a ctx yet.
  if (ctx->lock_manager.get_lock_type(
	RWState::RWWRITE,
	oid,
	obc,
	fop->op)) {
    dout(20) << __func__ << " took write lock" << dendl;
  } else if (fop->op) {
    dout(10) << __func__ << " waiting on write lock " << fop->op << " "
	     << fop->dup_ops << dendl;
    // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
    for (auto op : fop->dup_ops) {
      bool locked = ctx->lock_manager.get_lock_type(
	RWState::RWWRITE,
	oid,
	obc,
	op);
      ceph_assert(!locked);
    }
    close_op_ctx(ctx.release());
    return -EAGAIN;    // will retry
  } else {
    dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
    close_op_ctx(ctx.release());
    osd->logger->inc(l_osd_tier_try_flush_fail);
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
    return -ECANCELED;
  }

  if (fop->on_flush) {
    ctx->register_on_finish(*(fop->on_flush));
    fop->on_flush = std::nullopt;
  }

  ctx->at_version = get_next_version();

  ctx->new_obs = obc->obs;
  ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
  --ctx->delta_stats.num_objects_dirty;
  if (fop->obc->obs.oi.has_manifest()) {
    ceph_assert(obc->obs.oi.manifest.is_chunked());
    PGTransaction* t = ctx->op_t.get();
    uint64_t chunks_size = 0;
    for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
      chunks_size += p.second.length;
    }
    if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
      t->omap_clear(oid);
      ctx->new_obs.oi.clear_omap_digest();
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
      ctx->clean_regions.mark_omap_dirty();
    }
    if (obc->obs.oi.size == chunks_size) {
      // every byte is covered by chunks in the base tier: drop the local
      // data entirely and mark all chunks MISSING (fetch on demand)
      t->truncate(oid, 0);
      interval_set<uint64_t> trim;
      trim.insert(0, ctx->new_obs.oi.size);
      ctx->modified_ranges.union_of(trim);
      truncate_update_size_and_usage(ctx->delta_stats,
				     ctx->new_obs.oi,
				     0);
      ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
      ctx->new_obs.oi.new_object();
      for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
	p.second.set_flag(chunk_info_t::FLAG_MISSING);
      }
    } else {
      // partial coverage: keep local data, just mark the chunks CLEAN
      for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
	dout(20) << __func__ << " offset: " << p.second.offset
		 << " length: " << p.second.length << dendl;
	p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
      }
    }
  }

  finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);

  osd->logger->inc(l_osd_tier_clean);

  if (!fop->dup_ops.empty() || fop->op) {
    dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
    list<OpRequestRef> ls;
    if (fop->op)
      ls.push_back(fop->op);
    ls.splice(ls.end(), fop->dup_ops);
    requeue_ops(ls);
  }

  simple_opc_submit(std::move(ctx));

  flush_ops.erase(oid);

  if (fop->blocking)
    osd->logger->inc(l_osd_tier_flush);
  else
    osd->logger->inc(l_osd_tier_try_flush);

  return -EINPROGRESS;
}
11220
94b18763
FG
11221void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
11222 vector<ceph_tid_t> *tids)
7c673cae
FG
11223{
11224 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
11225 << fop->objecter_tid << dendl;
11226 if (fop->objecter_tid) {
94b18763 11227 tids->push_back(fop->objecter_tid);
7c673cae
FG
11228 fop->objecter_tid = 0;
11229 }
94b18763
FG
11230 if (fop->io_tids.size()) {
11231 for (auto &p : fop->io_tids) {
11232 tids->push_back(p.second);
11233 p.second = 0;
f67539c2 11234 }
94b18763
FG
11235 }
11236 if (fop->blocking && fop->obc->is_blocked()) {
7c673cae
FG
11237 fop->obc->stop_block();
11238 kick_object_context_blocked(fop->obc);
11239 }
11240 if (requeue) {
11241 if (fop->op)
11242 requeue_op(fop->op);
11243 requeue_ops(fop->dup_ops);
11244 }
11245 if (fop->on_flush) {
11246 (*(fop->on_flush))();
9f95a23c 11247 fop->on_flush = std::nullopt;
7c673cae
FG
11248 }
11249 flush_ops.erase(fop->obc->obs.oi.soid);
11250}
11251
94b18763 11252void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
11253{
11254 dout(10) << __func__ << dendl;
11255 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
11256 while (p != flush_ops.end()) {
94b18763 11257 cancel_flush((p++)->second, requeue, tids);
7c673cae
FG
11258 }
11259}
11260
11261bool PrimaryLogPG::is_present_clone(hobject_t coid)
11262{
11263 if (!pool.info.allow_incomplete_clones())
11264 return true;
11265 if (is_missing_object(coid))
11266 return true;
11267 ObjectContextRef obc = get_object_context(coid, false);
11268 return obc && obc->obs.exists;
11269}
11270
20effc67
TL
11271// ========================================================================
11272// cls gather
11273//
11274
11275void PrimaryLogPG::cancel_cls_gather(map<hobject_t,CLSGatherOp>::iterator iter, bool requeue,
11276 vector<ceph_tid_t> *tids)
11277{
11278 auto &cgop = iter->second;
11279 for (std::vector<ceph_tid_t>::iterator p = cgop.objecter_tids.begin(); p != cgop.objecter_tids.end(); p++) {
11280 tids->push_back(*p);
11281 dout(10) << __func__ << " " << cgop.obc->obs.oi.soid << " tid " << *p << dendl;
11282 }
11283 cgop.objecter_tids.clear();
11284 close_op_ctx(cgop.ctx);
11285 cgop.ctx = NULL;
11286 if (requeue) {
11287 if (cgop.op)
11288 requeue_op(cgop.op);
11289 }
11290 cls_gather_ops.erase(iter);
11291}
11292
11293void PrimaryLogPG::cancel_cls_gather_ops(bool requeue, vector<ceph_tid_t> *tids)
11294{
11295 dout(10) << __func__ << dendl;
11296 map<hobject_t,CLSGatherOp>::iterator p = cls_gather_ops.begin();
11297 while (p != cls_gather_ops.end()) {
11298 cancel_cls_gather(p++, requeue, tids);
11299 }
11300}
11301
7c673cae 11302// ========================================================================
11fdf7f2 11303// rep op gather
7c673cae
FG
11304
11305class C_OSD_RepopCommit : public Context {
11306 PrimaryLogPGRef pg;
11307 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
11308public:
11309 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
11310 : pg(pg), repop(repop) {}
11311 void finish(int) override {
11312 pg->repop_all_committed(repop.get());
11313 }
11314};
11315
11316void PrimaryLogPG::repop_all_committed(RepGather *repop)
11317{
11318 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
11319 << dendl;
11320 repop->all_committed = true;
7c673cae
FG
11321 if (!repop->rep_aborted) {
11322 if (repop->v != eversion_t()) {
9f95a23c 11323 recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
7c673cae
FG
11324 }
11325 eval_repop(repop);
11326 }
11327}
11328
11329void PrimaryLogPG::op_applied(const eversion_t &applied_version)
11330{
11331 dout(10) << "op_applied version " << applied_version << dendl;
11fdf7f2
TL
11332 ceph_assert(applied_version != eversion_t());
11333 ceph_assert(applied_version <= info.last_update);
9f95a23c 11334 recovery_state.local_write_applied(applied_version);
f67539c2 11335
20effc67
TL
11336 if (is_primary() && m_scrubber) {
11337 // if there's a scrub operation waiting for the selected chunk to be fully updated -
11338 // allow it to continue
11339 m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied());
7c673cae
FG
11340 }
11341}
11342
11343void PrimaryLogPG::eval_repop(RepGather *repop)
11344{
9f95a23c
TL
11345 dout(10) << "eval_repop " << *repop
11346 << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
7c673cae
FG
11347
11348 // ondisk?
11349 if (repop->all_committed) {
11350 dout(10) << " commit: " << *repop << dendl;
11351 for (auto p = repop->on_committed.begin();
11352 p != repop->on_committed.end();
11353 repop->on_committed.erase(p++)) {
11354 (*p)();
11355 }
11356 // send dup commits, in order
11fdf7f2
TL
11357 auto it = waiting_for_ondisk.find(repop->v);
11358 if (it != waiting_for_ondisk.end()) {
11359 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
11360 for (auto& i : it->second) {
11361 int return_code = repop->r;
11362 if (return_code >= 0) {
11363 return_code = std::get<2>(i);
11364 }
11365 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
9f95a23c 11366 std::get<1>(i), std::get<3>(i));
7c673cae 11367 }
11fdf7f2 11368 waiting_for_ondisk.erase(it);
7c673cae 11369 }
7c673cae
FG
11370
11371 publish_stats_to_osd();
7c673cae
FG
11372
11373 dout(10) << " removing " << *repop << dendl;
11fdf7f2 11374 ceph_assert(!repop_queue.empty());
f67539c2 11375 dout(20) << " q front is " << *repop_queue.front() << dendl;
11fdf7f2 11376 if (repop_queue.front() == repop) {
7c673cae
FG
11377 RepGather *to_remove = nullptr;
11378 while (!repop_queue.empty() &&
11fdf7f2 11379 (to_remove = repop_queue.front())->all_committed) {
7c673cae
FG
11380 repop_queue.pop_front();
11381 for (auto p = to_remove->on_success.begin();
11382 p != to_remove->on_success.end();
11383 to_remove->on_success.erase(p++)) {
11384 (*p)();
11385 }
11386 remove_repop(to_remove);
11387 }
11388 }
11389 }
11390}
11391
11392void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
11393{
11fdf7f2 11394 FUNCTRACE(cct);
7c673cae
FG
11395 const hobject_t& soid = ctx->obs->oi.soid;
11396 dout(7) << "issue_repop rep_tid " << repop->rep_tid
11397 << " o " << soid
11398 << dendl;
20effc67 11399
7c673cae
FG
11400
11401 repop->v = ctx->at_version;
7c673cae 11402
7c673cae
FG
11403 ctx->op_t->add_obc(ctx->obc);
11404 if (ctx->clone_obc) {
7c673cae
FG
11405 ctx->op_t->add_obc(ctx->clone_obc);
11406 }
11fdf7f2
TL
11407 if (ctx->head_obc) {
11408 ctx->op_t->add_obc(ctx->head_obc);
7c673cae
FG
11409 }
11410
11411 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
7c673cae 11412 if (!(ctx->log.empty())) {
11fdf7f2 11413 ceph_assert(ctx->at_version >= projected_last_update);
7c673cae
FG
11414 projected_last_update = ctx->at_version;
11415 }
11416 for (auto &&entry: ctx->log) {
11417 projected_log.add(entry);
11418 }
11fdf7f2 11419
9f95a23c
TL
11420 recovery_state.pre_submit_op(
11421 soid,
11422 ctx->log,
11423 ctx->at_version);
7c673cae
FG
11424 pgbackend->submit_transaction(
11425 soid,
11426 ctx->delta_stats,
11427 ctx->at_version,
11428 std::move(ctx->op_t),
9f95a23c
TL
11429 recovery_state.get_pg_trim_to(),
11430 recovery_state.get_min_last_complete_ondisk(),
f67539c2 11431 std::move(ctx->log),
7c673cae 11432 ctx->updated_hset_history,
7c673cae
FG
11433 on_all_commit,
11434 repop->rep_tid,
11435 ctx->reqid,
11436 ctx->op);
11437}
11438
11439PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
20effc67 11440 OpContext *ctx,
7c673cae
FG
11441 ceph_tid_t rep_tid)
11442{
11443 if (ctx->op)
11444 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
11445 else
11446 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
11447
11448 RepGather *repop = new RepGather(
11fdf7f2 11449 ctx, rep_tid, info.last_complete);
7c673cae
FG
11450
11451 repop->start = ceph_clock_now();
11452
11453 repop_queue.push_back(&repop->queue_item);
11454 repop->get();
11455
11456 osd->logger->inc(l_osd_op_wip);
11457
11458 dout(10) << __func__ << ": " << *repop << dendl;
11459 return repop;
11460}
11461
11462boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
11463 eversion_t version,
11464 int r,
11465 ObcLockManager &&manager,
11466 OpRequestRef &&op,
9f95a23c 11467 std::optional<std::function<void(void)> > &&on_complete)
7c673cae
FG
11468{
11469 RepGather *repop = new RepGather(
11470 std::move(manager),
11471 std::move(op),
11472 std::move(on_complete),
11473 osd->get_tid(),
11474 info.last_complete,
7c673cae
FG
11475 r);
11476 repop->v = version;
11477
11478 repop->start = ceph_clock_now();
11479
11480 repop_queue.push_back(&repop->queue_item);
11481
11482 osd->logger->inc(l_osd_op_wip);
11483
11484 dout(10) << __func__ << ": " << *repop << dendl;
11485 return boost::intrusive_ptr<RepGather>(repop);
11486}
f67539c2 11487
7c673cae
FG
11488void PrimaryLogPG::remove_repop(RepGather *repop)
11489{
11490 dout(20) << __func__ << " " << *repop << dendl;
11491
11492 for (auto p = repop->on_finish.begin();
11493 p != repop->on_finish.end();
11494 repop->on_finish.erase(p++)) {
11495 (*p)();
11496 }
11497
11498 release_object_locks(
11499 repop->lock_manager);
11500 repop->put();
11501
11502 osd->logger->dec(l_osd_op_wip);
11503}
11504
11505PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
11506{
11507 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
7c673cae
FG
11508 ceph_tid_t rep_tid = osd->get_tid();
11509 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
c07f9fc5 11510 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
7c673cae
FG
11511 ctx->op_t.reset(new PGTransaction());
11512 ctx->mtime = ceph_clock_now();
11513 return ctx;
11514}
11515
11516void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
11517{
20effc67 11518 RepGather *repop = new_repop(ctx.get(), ctx->reqid.tid);
7c673cae
FG
11519 dout(20) << __func__ << " " << repop << dendl;
11520 issue_repop(repop, ctx.get());
11521 eval_repop(repop);
9f95a23c 11522 recovery_state.update_trim_to();
7c673cae
FG
11523 repop->put();
11524}
11525
11526
11527void PrimaryLogPG::submit_log_entries(
31f18b77 11528 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
7c673cae 11529 ObcLockManager &&manager,
9f95a23c 11530 std::optional<std::function<void(void)> > &&_on_complete,
7c673cae
FG
11531 OpRequestRef op,
11532 int r)
11533{
11534 dout(10) << __func__ << " " << entries << dendl;
11fdf7f2 11535 ceph_assert(is_primary());
7c673cae
FG
11536
11537 eversion_t version;
11538 if (!entries.empty()) {
11fdf7f2 11539 ceph_assert(entries.rbegin()->version >= projected_last_update);
7c673cae
FG
11540 version = projected_last_update = entries.rbegin()->version;
11541 }
11542
11543 boost::intrusive_ptr<RepGather> repop;
9f95a23c
TL
11544 std::optional<std::function<void(void)> > on_complete;
11545 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
11546 repop = new_repop(
11547 version,
11548 r,
11549 std::move(manager),
11550 std::move(op),
11551 std::move(_on_complete));
11552 } else {
11553 on_complete = std::move(_on_complete);
11554 }
11555
11556 pgbackend->call_write_ordered(
11557 [this, entries, repop, on_complete]() {
11558 ObjectStore::Transaction t;
11559 eversion_t old_last_update = info.last_update;
9f95a23c
TL
11560 recovery_state.merge_new_log_entries(
11561 entries, t, recovery_state.get_pg_trim_to(),
11562 recovery_state.get_min_last_complete_ondisk());
7c673cae
FG
11563
11564 set<pg_shard_t> waiting_on;
9f95a23c
TL
11565 for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
11566 i != get_acting_recovery_backfill().end();
7c673cae
FG
11567 ++i) {
11568 pg_shard_t peer(*i);
11569 if (peer == pg_whoami) continue;
9f95a23c
TL
11570 ceph_assert(recovery_state.get_peer_missing().count(peer));
11571 ceph_assert(recovery_state.has_peer_info(peer));
11572 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11fdf7f2 11573 ceph_assert(repop);
7c673cae
FG
11574 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
11575 entries,
11576 spg_t(info.pgid.pgid, i->shard),
11577 pg_whoami.shard,
11fdf7f2 11578 get_osdmap_epoch(),
9f95a23c 11579 get_last_peering_reset(),
94b18763 11580 repop->rep_tid,
9f95a23c
TL
11581 recovery_state.get_pg_trim_to(),
11582 recovery_state.get_min_last_complete_ondisk());
7c673cae 11583 osd->send_message_osd_cluster(
11fdf7f2 11584 peer.osd, m, get_osdmap_epoch());
7c673cae
FG
11585 waiting_on.insert(peer);
11586 } else {
11587 MOSDPGLog *m = new MOSDPGLog(
11588 peer.shard, pg_whoami.shard,
11589 info.last_update.epoch,
9f95a23c 11590 info, get_last_peering_reset());
7c673cae
FG
11591 m->log.log = entries;
11592 m->log.tail = old_last_update;
11593 m->log.head = info.last_update;
11594 osd->send_message_osd_cluster(
11fdf7f2 11595 peer.osd, m, get_osdmap_epoch());
7c673cae
FG
11596 }
11597 }
11fdf7f2
TL
11598 ceph_tid_t rep_tid = repop->rep_tid;
11599 waiting_on.insert(pg_whoami);
11600 log_entry_update_waiting_on.insert(
11601 make_pair(
11602 rep_tid,
11603 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
11604 ));
11605 struct OnComplete : public Context {
11606 PrimaryLogPGRef pg;
11607 ceph_tid_t rep_tid;
11608 epoch_t epoch;
11609 OnComplete(
11610 PrimaryLogPGRef pg,
11611 ceph_tid_t rep_tid,
11612 epoch_t epoch)
11613 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
11614 void finish(int) override {
9f95a23c 11615 std::scoped_lock l{*pg};
11fdf7f2
TL
11616 if (!pg->pg_has_reset_since(epoch)) {
11617 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
11618 ceph_assert(it != pg->log_entry_update_waiting_on.end());
11619 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
11620 ceph_assert(it2 != it->second.waiting_on.end());
11621 it->second.waiting_on.erase(it2);
11622 if (it->second.waiting_on.empty()) {
11623 pg->repop_all_committed(it->second.repop.get());
11624 pg->log_entry_update_waiting_on.erase(it);
7c673cae 11625 }
7c673cae 11626 }
11fdf7f2
TL
11627 }
11628 };
11629 t.register_on_commit(
11630 new OnComplete{this, rep_tid, get_osdmap_epoch()});
11631 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
11632 ceph_assert(r == 0);
11633 op_applied(info.last_update);
7c673cae 11634 });
94b18763 11635
9f95a23c 11636 recovery_state.update_trim_to();
7c673cae
FG
11637}
11638
11639void PrimaryLogPG::cancel_log_updates()
11640{
11641 // get rid of all the LogUpdateCtx so their references to repops are
11642 // dropped
11643 log_entry_update_waiting_on.clear();
11644}
11645
11646// -------------------------------------------------------
11647
11fdf7f2 11648void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
7c673cae 11649{
9f95a23c 11650 std::scoped_lock l{*this};
7c673cae
FG
11651 pair<hobject_t, ObjectContextRef> i;
11652 while (object_contexts.get_next(i.first, &i)) {
11653 ObjectContextRef obc(i.second);
11fdf7f2 11654 get_obc_watchers(obc, *ls);
7c673cae
FG
11655 }
11656}
11657
11658void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
11659{
11660 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11661 obc->watchers.begin();
11662 j != obc->watchers.end();
11663 ++j) {
11664 obj_watch_item_t owi;
11665
11666 owi.obj = obc->obs.oi.soid;
11667 owi.wi.addr = j->second->get_peer_addr();
11668 owi.wi.name = j->second->get_entity();
11669 owi.wi.cookie = j->second->get_cookie();
11670 owi.wi.timeout_seconds = j->second->get_timeout();
11671
11672 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
11673 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
11674
11675 pg_watchers.push_back(owi);
11676 }
11677}
11678
f67539c2 11679void PrimaryLogPG::check_blocklisted_watchers()
7c673cae 11680{
f67539c2 11681 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
7c673cae
FG
11682 pair<hobject_t, ObjectContextRef> i;
11683 while (object_contexts.get_next(i.first, &i))
f67539c2 11684 check_blocklisted_obc_watchers(i.second);
7c673cae
FG
11685}
11686
f67539c2 11687void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
7c673cae 11688{
f67539c2 11689 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
7c673cae
FG
11690 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
11691 obc->watchers.begin();
11692 k != obc->watchers.end();
11693 ) {
11694 //Advance iterator now so handle_watch_timeout() can erase element
11695 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
11696 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
11697 entity_addr_t ea = j->second->get_peer_addr();
11698 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
f67539c2
TL
11699 if (get_osdmap()->is_blocklisted(ea)) {
11700 dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
11fdf7f2 11701 ceph_assert(j->second->get_pg() == this);
7c673cae
FG
11702 j->second->unregister_cb();
11703 handle_watch_timeout(j->second);
11704 }
11705 }
11706}
11707
11708void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
11709{
f6b5b4d7 11710 ceph_assert(is_primary() && is_active());
9f95a23c 11711 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
11fdf7f2 11712 ceph_assert((recovering.count(obc->obs.oi.soid) ||
7c673cae 11713 !is_missing_object(obc->obs.oi.soid)) ||
9f95a23c 11714 (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
11fdf7f2 11715 it_objects->second->op ==
7c673cae 11716 pg_log_entry_t::LOST_REVERT &&
11fdf7f2 11717 it_objects->second->reverting_to ==
7c673cae
FG
11718 obc->obs.oi.version));
11719
11720 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
11fdf7f2 11721 ceph_assert(obc->watchers.empty());
7c673cae
FG
11722 // populate unconnected_watchers
11723 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
11724 obc->obs.oi.watchers.begin();
11725 p != obc->obs.oi.watchers.end();
11726 ++p) {
11727 utime_t expire = info.stats.last_became_active;
11728 expire += p->second.timeout_seconds;
11729 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
11730 WatchRef watch(
11731 Watch::makeWatchRef(
11732 this, osd, obc, p->second.timeout_seconds, p->first.first,
11733 p->first.second, p->second.addr));
11734 watch->disconnect();
11735 obc->watchers.insert(
11736 make_pair(
11737 make_pair(p->first.first, p->first.second),
11738 watch));
11739 }
f67539c2
TL
11740 // Look for watchers from blocklisted clients and drop
11741 check_blocklisted_obc_watchers(obc);
7c673cae
FG
11742}
11743
11744void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
11745{
11746 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
1e59de90 11747 dout(10) << "handle_watch_timeout obc " << *obc << dendl;
7c673cae
FG
11748
11749 if (!is_active()) {
11750 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
11751 return;
11752 }
a8e16298
TL
11753 if (!obc->obs.exists) {
11754 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
11755 return;
11756 }
7c673cae
FG
11757 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
11758 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
11759 watch->get_delayed_cb()
11760 );
11761 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11762 << obc->obs.oi.soid
11763 << dendl;
11764 return;
11765 }
11766
f67539c2 11767 if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
7c673cae
FG
11768 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11769 << obc->obs.oi.soid
11770 << dendl;
f67539c2 11771 m_scrubber->add_callback(
7c673cae
FG
11772 watch->get_delayed_cb() // This callback!
11773 );
11774 return;
11775 }
11776
11777 OpContextUPtr ctx = simple_opc_create(obc);
11778 ctx->at_version = get_next_version();
11779
11780 object_info_t& oi = ctx->new_obs.oi;
11781 oi.watchers.erase(make_pair(watch->get_cookie(),
11782 watch->get_entity()));
11783
11784 list<watch_disconnect_t> watch_disconnects = {
11785 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
11786 };
11787 ctx->register_on_success(
11788 [this, obc, watch_disconnects]() {
11789 complete_disconnect_watches(obc, watch_disconnects);
11790 });
11791
11792
11793 PGTransaction *t = ctx->op_t.get();
11794 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
11795 ctx->at_version,
11796 oi.version,
11797 0,
11798 osd_reqid_t(), ctx->mtime, 0));
11799
11800 oi.prior_version = obc->obs.oi.version;
11801 oi.version = ctx->at_version;
11802 bufferlist bl;
11fdf7f2 11803 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7c673cae
FG
11804 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
11805
11806 // apply new object state.
11807 ctx->obc->obs = ctx->new_obs;
11808
11809 // no ctx->delta_stats
11810 simple_opc_submit(std::move(ctx));
11811}
11812
11813ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
11814 SnapSetContext *ssc)
11815{
11816 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
11fdf7f2 11817 ceph_assert(obc->destructor_callback == NULL);
f67539c2 11818 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
7c673cae
FG
11819 obc->obs.oi = oi;
11820 obc->obs.exists = false;
11821 obc->ssc = ssc;
11822 if (ssc)
11823 register_snapset_context(ssc);
11824 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
11825 if (is_active())
11826 populate_obc_watchers(obc);
11827 return obc;
11828}
11829
11830ObjectContextRef PrimaryLogPG::get_object_context(
11831 const hobject_t& soid,
11832 bool can_create,
20effc67 11833 const map<string, bufferlist, less<>> *attrs)
7c673cae 11834{
9f95a23c 11835 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
11fdf7f2 11836 ceph_assert(
9f95a23c 11837 attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
7c673cae 11838 // or this is a revert... see recover_primary()
9f95a23c 11839 (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
11fdf7f2 11840 it_objects->second->op ==
7c673cae
FG
11841 pg_log_entry_t::LOST_REVERT));
11842 ObjectContextRef obc = object_contexts.lookup(soid);
11843 osd->logger->inc(l_osd_object_ctx_cache_total);
11844 if (obc) {
11845 osd->logger->inc(l_osd_object_ctx_cache_hit);
1e59de90 11846 dout(10) << __func__ << ": found obc in cache: " << *obc
7c673cae
FG
11847 << dendl;
11848 } else {
11849 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11850 // check disk
11851 bufferlist bv;
11852 if (attrs) {
11fdf7f2
TL
11853 auto it_oi = attrs->find(OI_ATTR);
11854 ceph_assert(it_oi != attrs->end());
11855 bv = it_oi->second;
7c673cae
FG
11856 } else {
11857 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11858 if (r < 0) {
11859 if (!can_create) {
11860 dout(10) << __func__ << ": no obc for soid "
11861 << soid << " and !can_create"
11862 << dendl;
11863 return ObjectContextRef(); // -ENOENT!
11864 }
11865
11866 dout(10) << __func__ << ": no obc for soid "
11867 << soid << " but can_create"
11868 << dendl;
11869 // new object.
11870 object_info_t oi(soid);
11871 SnapSetContext *ssc = get_snapset_context(
11872 soid, true, 0, false);
11fdf7f2 11873 ceph_assert(ssc);
7c673cae 11874 obc = create_object_context(oi, ssc);
1e59de90 11875 dout(10) << __func__ << ": " << *obc
7c673cae 11876 << " oi: " << obc->obs.oi
1e59de90 11877 << " " << *obc->ssc << dendl;
7c673cae
FG
11878 return obc;
11879 }
11880 }
11881
11882 object_info_t oi;
11883 try {
11fdf7f2
TL
11884 bufferlist::const_iterator bliter = bv.begin();
11885 decode(oi, bliter);
7c673cae
FG
11886 } catch (...) {
11887 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11888 return ObjectContextRef(); // -ENOENT!
11889 }
11890
11fdf7f2 11891 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
7c673cae
FG
11892
11893 obc = object_contexts.lookup_or_create(oi.soid);
11894 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11895 obc->obs.oi = oi;
11896 obc->obs.exists = true;
11897
11898 obc->ssc = get_snapset_context(
11899 soid, true,
11900 soid.has_snapset() ? attrs : 0);
11901
f6b5b4d7 11902 if (is_primary() && is_active())
7c673cae
FG
11903 populate_obc_watchers(obc);
11904
11fdf7f2 11905 if (pool.info.is_erasure()) {
7c673cae
FG
11906 if (attrs) {
11907 obc->attr_cache = *attrs;
11908 } else {
11909 int r = pgbackend->objects_get_attrs(
11910 soid,
11911 &obc->attr_cache);
11fdf7f2 11912 ceph_assert(r == 0);
7c673cae
FG
11913 }
11914 }
11915
1e59de90 11916 dout(10) << __func__ << ": creating obc from disk: " << *obc
7c673cae
FG
11917 << dendl;
11918 }
224ce89b
WB
11919
11920 // XXX: Caller doesn't expect this
11921 if (obc->ssc == NULL) {
11922 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11923 return ObjectContextRef(); // -ENOENT!
11924 }
11925
1e59de90 11926 dout(10) << __func__ << ": " << *obc
7c673cae
FG
11927 << " oi: " << obc->obs.oi
11928 << " exists: " << (int)obc->obs.exists
1e59de90 11929 << " " << *obc->ssc << dendl;
7c673cae
FG
11930 return obc;
11931}
11932
11933void PrimaryLogPG::context_registry_on_change()
11934{
11935 pair<hobject_t, ObjectContextRef> i;
11936 while (object_contexts.get_next(i.first, &i)) {
11937 ObjectContextRef obc(i.second);
11938 if (obc) {
11939 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11940 obc->watchers.begin();
11941 j != obc->watchers.end();
11942 obc->watchers.erase(j++)) {
11943 j->second->discard();
11944 }
11945 }
11946 }
11947}
11948
11949
11950/*
11951 * If we return an error, and set *pmissing, then promoting that
11952 * object may help.
11953 *
11954 * If we return -EAGAIN, we will always set *pmissing to the missing
11955 * object to wait for.
11956 *
11957 * If we return an error but do not set *pmissing, then we know the
11958 * object does not exist.
11959 */
11960int PrimaryLogPG::find_object_context(const hobject_t& oid,
11961 ObjectContextRef *pobc,
11962 bool can_create,
11963 bool map_snapid_to_clone,
11964 hobject_t *pmissing)
11965{
11fdf7f2
TL
11966 FUNCTRACE(cct);
11967 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
7c673cae
FG
11968 // want the head?
11969 if (oid.snap == CEPH_NOSNAP) {
11970 ObjectContextRef obc = get_object_context(oid, can_create);
11971 if (!obc) {
11972 if (pmissing)
11973 *pmissing = oid;
11974 return -ENOENT;
11975 }
11fdf7f2 11976 dout(10) << __func__ << " " << oid
7c673cae
FG
11977 << " @" << oid.snap
11978 << " oi=" << obc->obs.oi
11979 << dendl;
11980 *pobc = obc;
11981
11982 return 0;
11983 }
11984
7c673cae 11985 // we want a snap
7c673cae 11986
9f95a23c 11987 hobject_t head = oid.get_head();
7c673cae
FG
11988 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11989 if (!ssc || !(ssc->exists || can_create)) {
11990 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11991 if (pmissing)
11992 *pmissing = head; // start by getting the head
11993 if (ssc)
11994 put_snapset_context(ssc);
11995 return -ENOENT;
11996 }
11997
11998 if (map_snapid_to_clone) {
11fdf7f2 11999 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12000 << " snapset " << ssc->snapset
12001 << " map_snapid_to_clone=true" << dendl;
12002 if (oid.snap > ssc->snapset.seq) {
12003 // already must be readable
12004 ObjectContextRef obc = get_object_context(head, false);
11fdf7f2 12005 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12006 << " snapset " << ssc->snapset
12007 << " maps to head" << dendl;
12008 *pobc = obc;
12009 put_snapset_context(ssc);
12010 return (obc && obc->obs.exists) ? 0 : -ENOENT;
12011 } else {
12012 vector<snapid_t>::const_iterator citer = std::find(
12013 ssc->snapset.clones.begin(),
12014 ssc->snapset.clones.end(),
12015 oid.snap);
12016 if (citer == ssc->snapset.clones.end()) {
11fdf7f2 12017 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12018 << " snapset " << ssc->snapset
12019 << " maps to nothing" << dendl;
12020 put_snapset_context(ssc);
12021 return -ENOENT;
12022 }
12023
11fdf7f2 12024 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12025 << " snapset " << ssc->snapset
12026 << " maps to " << oid << dendl;
12027
9f95a23c 12028 if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
11fdf7f2 12029 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12030 << " snapset " << ssc->snapset
12031 << " " << oid << " is missing" << dendl;
12032 if (pmissing)
12033 *pmissing = oid;
12034 put_snapset_context(ssc);
12035 return -EAGAIN;
12036 }
12037
12038 ObjectContextRef obc = get_object_context(oid, false);
12039 if (!obc || !obc->obs.exists) {
11fdf7f2 12040 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12041 << " snapset " << ssc->snapset
12042 << " " << oid << " is not present" << dendl;
12043 if (pmissing)
12044 *pmissing = oid;
12045 put_snapset_context(ssc);
12046 return -ENOENT;
12047 }
11fdf7f2 12048 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae
FG
12049 << " snapset " << ssc->snapset
12050 << " " << oid << " HIT" << dendl;
12051 *pobc = obc;
12052 put_snapset_context(ssc);
12053 return 0;
12054 }
12055 ceph_abort(); //unreachable
12056 }
12057
11fdf7f2 12058 dout(10) << __func__ << " " << oid << " @" << oid.snap
7c673cae 12059 << " snapset " << ssc->snapset << dendl;
f67539c2 12060
7c673cae
FG
12061 // head?
12062 if (oid.snap > ssc->snapset.seq) {
11fdf7f2
TL
12063 ObjectContextRef obc = get_object_context(head, false);
12064 dout(10) << __func__ << " " << head
7c673cae 12065 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
11fdf7f2 12066 << " -- HIT " << obc->obs
7c673cae 12067 << dendl;
11fdf7f2
TL
12068 if (!obc->ssc)
12069 obc->ssc = ssc;
12070 else {
12071 ceph_assert(ssc == obc->ssc);
12072 put_snapset_context(ssc);
12073 }
12074 *pobc = obc;
12075 return 0;
7c673cae
FG
12076 }
12077
12078 // which clone would it be?
12079 unsigned k = 0;
12080 while (k < ssc->snapset.clones.size() &&
12081 ssc->snapset.clones[k] < oid.snap)
12082 k++;
12083 if (k == ssc->snapset.clones.size()) {
11fdf7f2 12084 dout(10) << __func__ << " no clones with last >= oid.snap "
7c673cae
FG
12085 << oid.snap << " -- DNE" << dendl;
12086 put_snapset_context(ssc);
12087 return -ENOENT;
12088 }
12089 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
12090 info.pgid.pool(), oid.get_namespace());
12091
9f95a23c 12092 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
11fdf7f2 12093 dout(20) << __func__ << " " << soid << " missing, try again later"
7c673cae
FG
12094 << dendl;
12095 if (pmissing)
12096 *pmissing = soid;
12097 put_snapset_context(ssc);
12098 return -EAGAIN;
12099 }
12100
12101 ObjectContextRef obc = get_object_context(soid, false);
12102 if (!obc || !obc->obs.exists) {
7c673cae
FG
12103 if (pmissing)
12104 *pmissing = soid;
12105 put_snapset_context(ssc);
9f95a23c
TL
12106 if (is_primary()) {
12107 if (is_degraded_or_backfilling_object(soid)) {
12108 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
12109 return -EAGAIN;
12110 } else if (is_degraded_on_async_recovery_target(soid)) {
12111 dout(20) << __func__ << " clone is recovering " << soid << dendl;
12112 return -EAGAIN;
12113 } else {
12114 dout(20) << __func__ << " missing clone " << soid << dendl;
12115 return -ENOENT;
12116 }
c07f9fc5 12117 } else {
9f95a23c 12118 dout(20) << __func__ << " replica missing clone" << soid << dendl;
c07f9fc5
FG
12119 return -ENOENT;
12120 }
7c673cae
FG
12121 }
12122
12123 if (!obc->ssc) {
12124 obc->ssc = ssc;
12125 } else {
11fdf7f2 12126 ceph_assert(obc->ssc == ssc);
7c673cae
FG
12127 put_snapset_context(ssc);
12128 }
12129 ssc = 0;
12130
12131 // clone
11fdf7f2 12132 dout(20) << __func__ << " " << soid
7c673cae 12133 << " snapset " << obc->ssc->snapset
7c673cae
FG
12134 << dendl;
12135 snapid_t first, last;
11fdf7f2
TL
12136 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
12137 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
12138 if (p->second.empty()) {
12139 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
12140 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
12141 return -ENOENT;
7c673cae 12142 }
9f95a23c
TL
12143 if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
12144 p->second.end()) {
12145 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
12146 << " does not contain " << oid.snap << " -- DNE" << dendl;
12147 return -ENOENT;
12148 }
12149 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
12150 dout(20) << __func__ << " " << soid << " snap " << oid.snap
12151 << " in removed_snaps_queue" << " -- DNE" << dendl;
7c673cae
FG
12152 return -ENOENT;
12153 }
9f95a23c
TL
12154 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
12155 << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
12156 *pobc = obc;
12157 return 0;
7c673cae
FG
12158}
12159
12160void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
12161{
12162 if (obc->ssc)
12163 put_snapset_context(obc->ssc);
12164}
12165
12166void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
12167{
12168 object_info_t& oi = obc->obs.oi;
12169
11fdf7f2
TL
12170 dout(10) << __func__ << " " << oi.soid << dendl;
12171 ceph_assert(!oi.soid.is_snapdir());
7c673cae 12172
11fdf7f2
TL
12173 object_stat_sum_t stat;
12174 stat.num_objects++;
7c673cae
FG
12175 if (oi.is_dirty())
12176 stat.num_objects_dirty++;
12177 if (oi.is_whiteout())
12178 stat.num_whiteouts++;
12179 if (oi.is_omap())
12180 stat.num_objects_omap++;
12181 if (oi.is_cache_pinned())
12182 stat.num_objects_pinned++;
11fdf7f2
TL
12183 if (oi.has_manifest())
12184 stat.num_objects_manifest++;
7c673cae 12185
11fdf7f2 12186 if (oi.soid.is_snap()) {
7c673cae
FG
12187 stat.num_object_clones++;
12188
12189 if (!obc->ssc)
12190 obc->ssc = get_snapset_context(oi.soid, false);
11fdf7f2
TL
12191 ceph_assert(obc->ssc);
12192 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
12193 } else {
12194 stat.num_bytes += oi.size;
7c673cae
FG
12195 }
12196
12197 // add it in
12198 pgstat->stats.sum.add(stat);
12199}
12200
20effc67
TL
12201void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t &soid) {
12202 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
12203 if (p != waiting_for_blocked_object.end()) {
12204 list<OpRequestRef>& ls = p->second;
12205 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
12206 requeue_ops(ls);
12207 waiting_for_blocked_object.erase(p);
12208 }
12209}
12210
7c673cae
FG
12211void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
12212{
12213 const hobject_t& soid = obc->obs.oi.soid;
12214 if (obc->is_blocked()) {
12215 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
12216 return;
12217 }
12218
20effc67 12219 requeue_op_blocked_by_object(soid);
7c673cae
FG
12220
12221 map<hobject_t, ObjectContextRef>::iterator i =
12222 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
12223 if (i != objects_blocked_on_snap_promotion.end()) {
11fdf7f2 12224 ceph_assert(i->second == obc);
20effc67
TL
12225 ObjectContextRef head_obc = get_object_context(i->first, false);
12226 head_obc->stop_block();
12227 // kick blocked ops (head)
12228 requeue_op_blocked_by_object(i->first);
7c673cae
FG
12229 objects_blocked_on_snap_promotion.erase(i);
12230 }
12231
12232 if (obc->requeue_scrub_on_unblock) {
f67539c2 12233
7c673cae 12234 obc->requeue_scrub_on_unblock = false;
f67539c2
TL
12235
12236 dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
12237
494da23a
TL
12238 // only requeue if we are still active: we may be unblocking
12239 // because we are resetting for a new peering interval
12240 if (is_active()) {
f67539c2 12241 osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
494da23a 12242 }
7c673cae
FG
12243 }
12244}
12245
12246SnapSetContext *PrimaryLogPG::get_snapset_context(
12247 const hobject_t& oid,
12248 bool can_create,
20effc67 12249 const map<string, bufferlist, less<>> *attrs,
7c673cae
FG
12250 bool oid_existed)
12251{
11fdf7f2 12252 std::lock_guard l(snapset_contexts_lock);
7c673cae
FG
12253 SnapSetContext *ssc;
12254 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
12255 oid.get_snapdir());
12256 if (p != snapset_contexts.end()) {
12257 if (can_create || p->second->exists) {
12258 ssc = p->second;
12259 } else {
12260 return NULL;
12261 }
12262 } else {
12263 bufferlist bv;
12264 if (!attrs) {
12265 int r = -ENOENT;
11fdf7f2 12266 if (!(oid.is_head() && !oid_existed)) {
7c673cae 12267 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
7c673cae 12268 }
11fdf7f2
TL
12269 if (r < 0 && !can_create)
12270 return NULL;
7c673cae 12271 } else {
11fdf7f2
TL
12272 auto it_ss = attrs->find(SS_ATTR);
12273 ceph_assert(it_ss != attrs->end());
12274 bv = it_ss->second;
7c673cae
FG
12275 }
12276 ssc = new SnapSetContext(oid.get_snapdir());
12277 _register_snapset_context(ssc);
12278 if (bv.length()) {
11fdf7f2 12279 bufferlist::const_iterator bvp = bv.begin();
224ce89b
WB
12280 try {
12281 ssc->snapset.decode(bvp);
f67539c2
TL
12282 } catch (const ceph::buffer::error& e) {
12283 dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
224ce89b
WB
12284 return NULL;
12285 }
7c673cae
FG
12286 ssc->exists = true;
12287 } else {
12288 ssc->exists = false;
12289 }
12290 }
11fdf7f2 12291 ceph_assert(ssc);
7c673cae
FG
12292 ssc->ref++;
12293 return ssc;
12294}
12295
12296void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
12297{
11fdf7f2 12298 std::lock_guard l(snapset_contexts_lock);
7c673cae
FG
12299 --ssc->ref;
12300 if (ssc->ref == 0) {
12301 if (ssc->registered)
12302 snapset_contexts.erase(ssc->oid);
12303 delete ssc;
12304 }
12305}
12306
7c673cae
FG
12307/*
12308 * Return values:
12309 * NONE - didn't pull anything
12310 * YES - pulled what the caller wanted
11fdf7f2 12311 * HEAD - needed to pull head first
7c673cae 12312 */
11fdf7f2 12313enum { PULL_NONE, PULL_HEAD, PULL_YES };
7c673cae
FG
12314
12315int PrimaryLogPG::recover_missing(
12316 const hobject_t &soid, eversion_t v,
12317 int priority,
12318 PGBackend::RecoveryHandle *h)
12319{
1e59de90
TL
12320 dout(10) << __func__ << " sar: " << scrub_after_recovery << dendl;
12321
9f95a23c 12322 if (recovery_state.get_missing_loc().is_unfound(soid)) {
11fdf7f2 12323 dout(7) << __func__ << " " << soid
f67539c2 12324 << " v " << v
7c673cae
FG
12325 << " but it is unfound" << dendl;
12326 return PULL_NONE;
12327 }
12328
9f95a23c 12329 if (recovery_state.get_missing_loc().is_deleted(soid)) {
c07f9fc5 12330 start_recovery_op(soid);
11fdf7f2 12331 ceph_assert(!recovering.count(soid));
c07f9fc5 12332 recovering.insert(make_pair(soid, ObjectContextRef()));
11fdf7f2 12333 epoch_t cur_epoch = get_osdmap_epoch();
9f95a23c 12334 remove_missing_object(soid, v, new LambdaContext(
1e59de90 12335 [=, this](int) {
9f95a23c 12336 std::scoped_lock locker{*this};
c07f9fc5
FG
12337 if (!pg_has_reset_since(cur_epoch)) {
12338 bool object_missing = false;
9f95a23c 12339 for (const auto& shard : get_acting_recovery_backfill()) {
c07f9fc5
FG
12340 if (shard == pg_whoami)
12341 continue;
9f95a23c 12342 if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
c07f9fc5
FG
12343 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
12344 object_missing = true;
12345 break;
12346 }
12347 }
12348 if (!object_missing) {
12349 object_stat_sum_t stat_diff;
12350 stat_diff.num_objects_recovered = 1;
11fdf7f2
TL
12351 if (scrub_after_recovery)
12352 stat_diff.num_objects_repaired = 1;
c07f9fc5
FG
12353 on_global_recover(soid, stat_diff, true);
12354 } else {
12355 auto recovery_handle = pgbackend->open_recovery_op();
12356 pgbackend->recover_delete_object(soid, v, recovery_handle);
12357 pgbackend->run_recovery_op(recovery_handle, priority);
12358 }
12359 }
c07f9fc5
FG
12360 }));
12361 return PULL_YES;
12362 }
12363
7c673cae
FG
12364 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
12365 ObjectContextRef obc;
12366 ObjectContextRef head_obc;
12367 if (soid.snap && soid.snap < CEPH_NOSNAP) {
11fdf7f2 12368 // do we have the head?
7c673cae 12369 hobject_t head = soid.get_head();
9f95a23c 12370 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
7c673cae
FG
12371 if (recovering.count(head)) {
12372 dout(10) << " missing but already recovering head " << head << dendl;
12373 return PULL_NONE;
12374 } else {
12375 int r = recover_missing(
9f95a23c 12376 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
7c673cae
FG
12377 h);
12378 if (r != PULL_NONE)
11fdf7f2 12379 return PULL_HEAD;
7c673cae
FG
12380 return PULL_NONE;
12381 }
12382 }
7c673cae 12383 head_obc = get_object_context(
11fdf7f2 12384 head,
7c673cae
FG
12385 false,
12386 0);
11fdf7f2 12387 ceph_assert(head_obc);
7c673cae
FG
12388 }
12389 start_recovery_op(soid);
11fdf7f2 12390 ceph_assert(!recovering.count(soid));
7c673cae 12391 recovering.insert(make_pair(soid, obc));
224ce89b 12392 int r = pgbackend->recover_object(
7c673cae
FG
12393 soid,
12394 v,
12395 head_obc,
12396 obc,
12397 h);
224ce89b 12398 // This is only a pull which shouldn't return an error
11fdf7f2 12399 ceph_assert(r >= 0);
7c673cae
FG
12400 return PULL_YES;
12401}
12402
c07f9fc5
FG
12403void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
12404 eversion_t v, Context *on_complete)
12405{
12406 dout(20) << __func__ << " " << soid << " " << v << dendl;
11fdf7f2 12407 ceph_assert(on_complete != nullptr);
c07f9fc5
FG
12408 // delete locally
12409 ObjectStore::Transaction t;
12410 remove_snap_mapped_object(t, soid);
12411
12412 ObjectRecoveryInfo recovery_info;
12413 recovery_info.soid = soid;
12414 recovery_info.version = v;
12415
11fdf7f2 12416 epoch_t cur_epoch = get_osdmap_epoch();
9f95a23c 12417 t.register_on_complete(new LambdaContext(
1e59de90 12418 [=, this](int) {
9f95a23c 12419 std::unique_lock locker{*this};
c07f9fc5
FG
12420 if (!pg_has_reset_since(cur_epoch)) {
12421 ObjectStore::Transaction t2;
12422 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
12423 t2.register_on_complete(on_complete);
11fdf7f2
TL
12424 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
12425 ceph_assert(r == 0);
9f95a23c 12426 locker.unlock();
c07f9fc5 12427 } else {
9f95a23c 12428 locker.unlock();
c07f9fc5
FG
12429 on_complete->complete(-EAGAIN);
12430 }
12431 }));
11fdf7f2
TL
12432 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
12433 ceph_assert(r == 0);
c07f9fc5 12434}
7c673cae 12435
eafe8130 12436void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
7c673cae 12437{
11fdf7f2 12438 dout(10) << __func__ << " " << oid << dendl;
7c673cae
FG
12439 if (callbacks_for_degraded_object.count(oid)) {
12440 list<Context*> contexts;
12441 contexts.swap(callbacks_for_degraded_object[oid]);
12442 callbacks_for_degraded_object.erase(oid);
12443 for (list<Context*>::iterator i = contexts.begin();
12444 i != contexts.end();
12445 ++i) {
12446 (*i)->complete(0);
12447 }
12448 }
12449 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
12450 oid.get_head());
12451 if (i != objects_blocked_on_degraded_snap.end() &&
12452 i->second == oid.snap)
12453 objects_blocked_on_degraded_snap.erase(i);
12454}
12455
12456void PrimaryLogPG::_committed_pushed_object(
12457 epoch_t epoch, eversion_t last_complete)
12458{
9f95a23c 12459 std::scoped_lock locker{*this};
7c673cae 12460 if (!pg_has_reset_since(epoch)) {
9f95a23c 12461 recovery_state.recovery_committed_to(last_complete);
7c673cae 12462 } else {
9f95a23c
TL
12463 dout(10) << __func__
12464 << " pg has changed, not touching last_complete_ondisk" << dendl;
7c673cae 12465 }
7c673cae
FG
12466}
12467
12468void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
12469{
c07f9fc5
FG
12470 dout(20) << __func__ << dendl;
12471 if (obc) {
12472 dout(20) << "obc = " << *obc << dendl;
12473 }
11fdf7f2 12474 ceph_assert(active_pushes >= 1);
7c673cae
FG
12475 --active_pushes;
12476
12477 // requeue an active chunky scrub waiting on recovery ops
f67539c2 12478 if (!recovery_state.is_deleting() && active_pushes == 0 &&
20effc67 12479 is_scrub_active()) {
f67539c2
TL
12480
12481 osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
7c673cae 12482 }
7c673cae
FG
12483}
12484
12485void PrimaryLogPG::_applied_recovered_object_replica()
12486{
c07f9fc5 12487 dout(20) << __func__ << dendl;
11fdf7f2 12488 ceph_assert(active_pushes >= 1);
7c673cae
FG
12489 --active_pushes;
12490
f67539c2 12491 // requeue an active scrub waiting on recovery ops
9f95a23c 12492 if (!recovery_state.is_deleting() && active_pushes == 0 &&
20effc67 12493 is_scrub_active()) {
f67539c2
TL
12494
12495 osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
7c673cae 12496 }
7c673cae
FG
12497}
12498
9f95a23c
TL
12499void PrimaryLogPG::on_failed_pull(
12500 const set<pg_shard_t> &from,
12501 const hobject_t &soid,
12502 const eversion_t &v)
7c673cae
FG
12503{
12504 dout(20) << __func__ << ": " << soid << dendl;
11fdf7f2 12505 ceph_assert(recovering.count(soid));
7c673cae
FG
12506 auto obc = recovering[soid];
12507 if (obc) {
12508 list<OpRequestRef> blocked_ops;
12509 obc->drop_recovery_read(&blocked_ops);
12510 requeue_ops(blocked_ops);
12511 }
12512 recovering.erase(soid);
81eedcae 12513 for (auto&& i : from) {
9f95a23c
TL
12514 if (i != pg_whoami) { // we'll get it below in primary_error
12515 recovery_state.force_object_missing(i, soid, v);
81eedcae
TL
12516 }
12517 }
9f95a23c 12518
7c673cae 12519 dout(0) << __func__ << " " << soid << " from shard " << from
9f95a23c
TL
12520 << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
12521 << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
12522 << dendl;
7c673cae 12523 finish_recovery_op(soid); // close out this attempt,
9f95a23c
TL
12524 finish_degraded_object(soid);
12525
12526 if (from.count(pg_whoami)) {
12527 dout(0) << " primary missing oid " << soid << " version " << v << dendl;
12528 primary_error(soid, v);
12529 backfills_in_flight.erase(soid);
12530 }
7c673cae
FG
12531}
12532
7c673cae
FG
12533eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
12534{
12535 eversion_t v;
12536 pg_missing_item pmi;
9f95a23c 12537 bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
11fdf7f2 12538 ceph_assert(is_missing);
7c673cae
FG
12539 v = pmi.have;
12540 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
12541
9f95a23c
TL
12542 ceph_assert(!get_acting_recovery_backfill().empty());
12543 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
12544 i != get_acting_recovery_backfill().end();
7c673cae
FG
12545 ++i) {
12546 if (*i == get_primary()) continue;
12547 pg_shard_t peer = *i;
9f95a23c 12548 if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
7c673cae
FG
12549 continue;
12550 }
9f95a23c 12551 eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
7c673cae
FG
12552 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
12553 if (h > v)
12554 v = h;
12555 }
12556
12557 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
12558 return v;
12559}
12560
12561void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
12562{
12563 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
12564 op->get_req());
11fdf7f2 12565 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
7c673cae 12566 ObjectStore::Transaction t;
9f95a23c 12567 std::optional<eversion_t> op_trim_to, op_roll_forward_to;
94b18763
FG
12568 if (m->pg_trim_to != eversion_t())
12569 op_trim_to = m->pg_trim_to;
12570 if (m->pg_roll_forward_to != eversion_t())
12571 op_roll_forward_to = m->pg_roll_forward_to;
12572
9f95a23c
TL
12573 dout(20) << __func__
12574 << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
94b18763 12575
9f95a23c
TL
12576 recovery_state.append_log_entries_update_missing(
12577 m->entries, t, op_trim_to, op_roll_forward_to);
94b18763 12578 eversion_t new_lcod = info.last_complete;
7c673cae 12579
9f95a23c 12580 Context *complete = new LambdaContext(
1e59de90 12581 [=, this](int) {
7c673cae
FG
12582 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
12583 op->get_req());
9f95a23c 12584 std::scoped_lock locker{*this};
7c673cae 12585 if (!pg_has_reset_since(msg->get_epoch())) {
94b18763 12586 update_last_complete_ondisk(new_lcod);
7c673cae
FG
12587 MOSDPGUpdateLogMissingReply *reply =
12588 new MOSDPGUpdateLogMissingReply(
12589 spg_t(info.pgid.pgid, primary_shard().shard),
12590 pg_whoami.shard,
12591 msg->get_epoch(),
12592 msg->min_epoch,
94b18763
FG
12593 msg->get_tid(),
12594 new_lcod);
7c673cae
FG
12595 reply->set_priority(CEPH_MSG_PRIO_HIGH);
12596 msg->get_connection()->send_message(reply);
12597 }
7c673cae
FG
12598 });
12599
9f95a23c 12600 if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
12601 t.register_on_commit(complete);
12602 } else {
12603 /* Hack to work around the fact that ReplicatedBackend sends
12604 * ack+commit if commit happens first
12605 *
12606 * This behavior is no longer necessary, but we preserve it so old
12607 * primaries can keep their repops in order */
11fdf7f2 12608 if (pool.info.is_erasure()) {
7c673cae
FG
12609 t.register_on_complete(complete);
12610 } else {
12611 t.register_on_commit(complete);
12612 }
12613 }
7c673cae 12614 int tr = osd->store->queue_transaction(
11fdf7f2 12615 ch,
7c673cae
FG
12616 std::move(t),
12617 nullptr);
11fdf7f2
TL
12618 ceph_assert(tr == 0);
12619 op_applied(info.last_update);
7c673cae
FG
12620}
12621
12622void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
12623{
12624 const MOSDPGUpdateLogMissingReply *m =
12625 static_cast<const MOSDPGUpdateLogMissingReply*>(
12626 op->get_req());
12627 dout(20) << __func__ << " got reply from "
12628 << m->get_from() << dendl;
12629
12630 auto it = log_entry_update_waiting_on.find(m->get_tid());
12631 if (it != log_entry_update_waiting_on.end()) {
12632 if (it->second.waiting_on.count(m->get_from())) {
12633 it->second.waiting_on.erase(m->get_from());
94b18763
FG
12634 if (m->last_complete_ondisk != eversion_t()) {
12635 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
12636 }
7c673cae
FG
12637 } else {
12638 osd->clog->error()
12639 << info.pgid << " got reply "
12640 << *m << " from shard we are not waiting for "
12641 << m->get_from();
12642 }
12643
12644 if (it->second.waiting_on.empty()) {
12645 repop_all_committed(it->second.repop.get());
12646 log_entry_update_waiting_on.erase(it);
12647 }
12648 } else {
12649 osd->clog->error()
12650 << info.pgid << " got reply "
12651 << *m << " on unknown tid " << m->get_tid();
12652 }
12653}
12654
12655/* Mark all unfound objects as lost.
12656 */
12657void PrimaryLogPG::mark_all_unfound_lost(
12658 int what,
9f95a23c 12659 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae
FG
12660{
12661 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
224ce89b 12662 list<hobject_t> oids;
7c673cae
FG
12663
12664 dout(30) << __func__ << ": log before:\n";
9f95a23c 12665 recovery_state.get_pg_log().get_log().print(*_dout);
7c673cae
FG
12666 *_dout << dendl;
12667
31f18b77 12668 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
7c673cae
FG
12669
12670 utime_t mtime = ceph_clock_now();
12671 map<hobject_t, pg_missing_item>::const_iterator m =
9f95a23c 12672 recovery_state.get_missing_loc().get_needs_recovery().begin();
7c673cae 12673 map<hobject_t, pg_missing_item>::const_iterator mend =
9f95a23c 12674 recovery_state.get_missing_loc().get_needs_recovery().end();
7c673cae
FG
12675
12676 ObcLockManager manager;
12677 eversion_t v = get_next_version();
11fdf7f2 12678 v.epoch = get_osdmap_epoch();
9f95a23c 12679 uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
7c673cae
FG
12680 while (m != mend) {
12681 const hobject_t &oid(m->first);
9f95a23c 12682 if (!recovery_state.get_missing_loc().is_unfound(oid)) {
7c673cae
FG
12683 // We only care about unfound objects
12684 ++m;
12685 continue;
12686 }
12687
12688 ObjectContextRef obc;
12689 eversion_t prev;
12690
12691 switch (what) {
12692 case pg_log_entry_t::LOST_MARK:
11fdf7f2 12693 ceph_abort_msg("actually, not implemented yet!");
7c673cae
FG
12694 break;
12695
12696 case pg_log_entry_t::LOST_REVERT:
12697 prev = pick_newest_available(oid);
12698 if (prev > eversion_t()) {
12699 // log it
12700 pg_log_entry_t e(
12701 pg_log_entry_t::LOST_REVERT, oid, v,
12702 m->second.need, 0, osd_reqid_t(), mtime, 0);
12703 e.reverting_to = prev;
12704 e.mark_unrollbackable();
12705 log_entries.push_back(e);
12706 dout(10) << e << dendl;
12707
12708 // we are now missing the new version; recovery code will sort it out.
12709 ++v.version;
12710 ++m;
12711 break;
12712 }
12713
12714 case pg_log_entry_t::LOST_DELETE:
12715 {
12716 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
12717 0, osd_reqid_t(), mtime, 0);
9f95a23c 12718 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
12719 if (pool.info.require_rollback()) {
12720 e.mod_desc.try_rmobject(v.version);
12721 } else {
12722 e.mark_unrollbackable();
12723 }
12724 } // otherwise, just do what we used to do
12725 dout(10) << e << dendl;
12726 log_entries.push_back(e);
224ce89b 12727 oids.push_back(oid);
7c673cae 12728
b32b8144
FG
12729 // If context found mark object as deleted in case
12730 // of racing with new creation. This can happen if
12731 // object lost and EIO at primary.
12732 obc = object_contexts.lookup(oid);
12733 if (obc)
12734 obc->obs.exists = false;
12735
7c673cae
FG
12736 ++v.version;
12737 ++m;
12738 }
12739 break;
12740
12741 default:
12742 ceph_abort();
12743 }
12744 }
12745
9f95a23c
TL
12746 recovery_state.update_stats(
12747 [](auto &history, auto &stats) {
12748 stats.stats_invalid = true;
12749 return false;
12750 });
7c673cae
FG
12751
12752 submit_log_entries(
12753 log_entries,
12754 std::move(manager),
9f95a23c
TL
12755 std::optional<std::function<void(void)> >(
12756 [this, oids, num_unfound, on_finish]() {
12757 if (recovery_state.perform_deletes_during_peering()) {
c07f9fc5
FG
12758 for (auto oid : oids) {
12759 // clear old locations - merge_new_log_entries will have
12760 // handled rebuilding missing_loc for each of these
12761 // objects if we have the RECOVERY_DELETES flag
9f95a23c 12762 recovery_state.object_recovered(oid, object_stat_sum_t());
c07f9fc5
FG
12763 }
12764 }
12765
b32b8144
FG
12766 if (is_recovery_unfound()) {
12767 queue_peering_event(
11fdf7f2
TL
12768 PGPeeringEventRef(
12769 std::make_shared<PGPeeringEvent>(
12770 get_osdmap_epoch(),
12771 get_osdmap_epoch(),
9f95a23c 12772 PeeringState::DoRecovery())));
b32b8144
FG
12773 } else if (is_backfill_unfound()) {
12774 queue_peering_event(
11fdf7f2
TL
12775 PGPeeringEventRef(
12776 std::make_shared<PGPeeringEvent>(
12777 get_osdmap_epoch(),
12778 get_osdmap_epoch(),
9f95a23c 12779 PeeringState::RequestBackfill())));
b32b8144
FG
12780 } else {
12781 queue_recovery();
7c673cae 12782 }
7c673cae
FG
12783
12784 stringstream ss;
12785 ss << "pg has " << num_unfound
12786 << " objects unfound and apparently lost marking";
12787 string rs = ss.str();
12788 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
12789 osd->clog->info() << rs;
9f95a23c
TL
12790 bufferlist empty;
12791 on_finish(0, rs, empty);
7c673cae
FG
12792 }),
12793 OpRequestRef());
12794}
12795
12796void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
12797{
11fdf7f2 12798 ceph_assert(repop_queue.empty());
7c673cae
FG
12799}
12800
12801/*
12802 * pg status change notification
12803 */
12804
12805void PrimaryLogPG::apply_and_flush_repops(bool requeue)
12806{
12807 list<OpRequestRef> rq;
12808
12809 // apply all repops
12810 while (!repop_queue.empty()) {
12811 RepGather *repop = repop_queue.front();
12812 repop_queue.pop_front();
12813 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
12814 repop->rep_aborted = true;
7c673cae
FG
12815 repop->on_committed.clear();
12816 repop->on_success.clear();
12817
12818 if (requeue) {
12819 if (repop->op) {
12820 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
12821 rq.push_back(repop->op);
12822 repop->op = OpRequestRef();
12823 }
12824
12825 // also requeue any dups, interleaved into position
11fdf7f2 12826 auto p = waiting_for_ondisk.find(repop->v);
7c673cae
FG
12827 if (p != waiting_for_ondisk.end()) {
12828 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11fdf7f2
TL
12829 for (auto& i : p->second) {
12830 rq.push_back(std::get<0>(i));
7c673cae
FG
12831 }
12832 waiting_for_ondisk.erase(p);
12833 }
12834 }
12835
12836 remove_repop(repop);
12837 }
12838
11fdf7f2 12839 ceph_assert(repop_queue.empty());
7c673cae
FG
12840
12841 if (requeue) {
12842 requeue_ops(rq);
12843 if (!waiting_for_ondisk.empty()) {
11fdf7f2
TL
12844 for (auto& i : waiting_for_ondisk) {
12845 for (auto& j : i.second) {
12846 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12847 << " waiting on " << i.first << dendl;
12848 }
7c673cae 12849 }
11fdf7f2 12850 ceph_assert(waiting_for_ondisk.empty());
7c673cae
FG
12851 }
12852 }
12853
12854 waiting_for_ondisk.clear();
12855}
12856
12857void PrimaryLogPG::on_flushed()
12858{
9f95a23c 12859 requeue_ops(waiting_for_flush);
7c673cae
FG
12860 if (!is_peered() || !is_primary()) {
12861 pair<hobject_t, ObjectContextRef> i;
12862 while (object_contexts.get_next(i.first, &i)) {
11fdf7f2 12863 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
7c673cae 12864 }
11fdf7f2 12865 ceph_assert(object_contexts.empty());
7c673cae 12866 }
7c673cae
FG
12867}
12868
9f95a23c 12869void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
7c673cae 12870{
11fdf7f2 12871 dout(10) << __func__ << dendl;
7c673cae 12872
11fdf7f2 12873 on_shutdown();
9f95a23c
TL
12874
12875 t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
7c673cae
FG
12876}
12877
c07f9fc5
FG
12878void PrimaryLogPG::clear_async_reads()
12879{
12880 dout(10) << __func__ << dendl;
12881 for(auto& i : in_progress_async_reads) {
12882 dout(10) << "clear ctx: "
12883 << "OpRequestRef " << i.first
12884 << " OpContext " << i.second
12885 << dendl;
12886 close_op_ctx(i.second);
12887 }
12888}
12889
11fdf7f2 12890void PrimaryLogPG::clear_cache()
7c673cae 12891{
11fdf7f2
TL
12892 object_contexts.clear();
12893}
7c673cae 12894
11fdf7f2
TL
12895void PrimaryLogPG::on_shutdown()
12896{
12897 dout(10) << __func__ << dendl;
7c673cae 12898
224ce89b
WB
12899 if (recovery_queued) {
12900 recovery_queued = false;
12901 osd->clear_queued_recovery(this);
12902 }
12903
f67539c2 12904 m_scrubber->scrub_clear_state();
20effc67 12905 m_scrubber->rm_from_osd_scrubbing();
94b18763
FG
12906
12907 vector<ceph_tid_t> tids;
12908 cancel_copy_ops(false, &tids);
12909 cancel_flush_ops(false, &tids);
12910 cancel_proxy_ops(false, &tids);
9f95a23c 12911 cancel_manifest_ops(false, &tids);
20effc67 12912 cancel_cls_gather_ops(false, &tids);
94b18763
FG
12913 osd->objecter->op_cancel(tids, -ECANCELED);
12914
7c673cae
FG
12915 apply_and_flush_repops(false);
12916 cancel_log_updates();
31f18b77 12917 // we must remove PGRefs, so do this this prior to release_backoffs() callers
f67539c2 12918 clear_backoffs();
31f18b77
FG
12919 // clean up snap trim references
12920 snap_trimmer_machine.process_event(Reset());
7c673cae
FG
12921
12922 pgbackend->on_change();
12923
12924 context_registry_on_change();
12925 object_contexts.clear();
12926
c07f9fc5
FG
12927 clear_async_reads();
12928
7c673cae
FG
12929 osd->remote_reserver.cancel_reservation(info.pgid);
12930 osd->local_reserver.cancel_reservation(info.pgid);
12931
12932 clear_primary_state();
12933 cancel_recovery();
11fdf7f2
TL
12934
12935 if (is_primary()) {
12936 osd->clear_ready_to_merge(this);
12937 }
7c673cae
FG
12938}
12939
9f95a23c 12940void PrimaryLogPG::on_activate_complete()
7c673cae 12941{
9f95a23c
TL
12942 check_local();
12943 // waiters
12944 if (!recovery_state.needs_flush()) {
12945 requeue_ops(waiting_for_peered);
12946 } else if (!waiting_for_peered.empty()) {
12947 dout(10) << __func__ << " flushes in progress, moving "
12948 << waiting_for_peered.size()
12949 << " items to waiting_for_flush"
12950 << dendl;
12951 ceph_assert(waiting_for_flush.empty());
12952 waiting_for_flush.swap(waiting_for_peered);
12953 }
12954
12955
7c673cae
FG
12956 // all clean?
12957 if (needs_recovery()) {
12958 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12959 queue_peering_event(
11fdf7f2
TL
12960 PGPeeringEventRef(
12961 std::make_shared<PGPeeringEvent>(
12962 get_osdmap_epoch(),
12963 get_osdmap_epoch(),
9f95a23c 12964 PeeringState::DoRecovery())));
7c673cae
FG
12965 } else if (needs_backfill()) {
12966 dout(10) << "activate queueing backfill" << dendl;
12967 queue_peering_event(
11fdf7f2
TL
12968 PGPeeringEventRef(
12969 std::make_shared<PGPeeringEvent>(
12970 get_osdmap_epoch(),
12971 get_osdmap_epoch(),
9f95a23c 12972 PeeringState::RequestBackfill())));
7c673cae
FG
12973 } else {
12974 dout(10) << "activate all replicas clean, no recovery" << dendl;
12975 queue_peering_event(
11fdf7f2
TL
12976 PGPeeringEventRef(
12977 std::make_shared<PGPeeringEvent>(
12978 get_osdmap_epoch(),
12979 get_osdmap_epoch(),
9f95a23c 12980 PeeringState::AllReplicasRecovered())));
7c673cae
FG
12981 }
12982
12983 publish_stats_to_osd();
12984
9f95a23c 12985 if (get_backfill_targets().size()) {
f67539c2 12986 last_backfill_started = recovery_state.earliest_backfill();
7c673cae 12987 new_backfill = true;
11fdf7f2 12988 ceph_assert(!last_backfill_started.is_max());
9f95a23c 12989 dout(5) << __func__ << ": bft=" << get_backfill_targets()
7c673cae 12990 << " from " << last_backfill_started << dendl;
9f95a23c
TL
12991 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12992 i != get_backfill_targets().end();
7c673cae
FG
12993 ++i) {
12994 dout(5) << "target shard " << *i
9f95a23c 12995 << " from " << recovery_state.get_peer_info(*i).last_backfill
7c673cae
FG
12996 << dendl;
12997 }
12998 }
12999
13000 hit_set_setup();
13001 agent_setup();
13002}
13003
/**
 * Reset per-interval state when the PG's interval changes.
 *
 * Requeues every class of waiting op (in the reverse of the order in
 * which they should be re-examined), cancels in-flight objecter-driven
 * operations (copy/flush/proxy/manifest/cls-gather), clears scrub and
 * snap-trimmer state, and drops the object-context cache.
 *
 * @param t transaction handed to the backend/scrubber for their cleanup
 *
 * NOTE: the ordering of the steps below is deliberate; do not reorder
 * without understanding the requeue semantics.
 */
void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
{
  dout(10) << __func__ << dendl;

  // a hit_set that recorded nothing is not worth persisting
  if (hit_set && hit_set->insert_count() == 0) {
    dout(20) << " discarding empty hit_set" << dendl;
    hit_set_clear();
  }

  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  // requeue everything in the reverse order they should be
  // reexamined.
  requeue_ops(waiting_for_peered);
  requeue_ops(waiting_for_flush);
  requeue_ops(waiting_for_active);
  requeue_ops(waiting_for_readable);

  // collect the tids of every objecter-backed op we abort so they can
  // be cancelled in one batch below
  vector<ceph_tid_t> tids;
  cancel_copy_ops(is_primary(), &tids);
  cancel_flush_ops(is_primary(), &tids);
  cancel_proxy_ops(is_primary(), &tids);
  cancel_manifest_ops(is_primary(), &tids);
  cancel_cls_gather_ops(is_primary(), &tids);
  osd->objecter->op_cancel(tids, -ECANCELED);

  // requeue object waiters
  for (auto& p : waiting_for_unreadable_object) {
    release_backoffs(p.first);
  }
  if (is_primary()) {
    requeue_object_waiters(waiting_for_unreadable_object);
  } else {
    // a non-primary cannot service these; drop them
    waiting_for_unreadable_object.clear();
  }
  for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
       p != waiting_for_degraded_object.end();
       waiting_for_degraded_object.erase(p++)) {
    release_backoffs(p->first);
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
    finish_degraded_object(p->first);
  }

  // requeues waiting_for_scrub
  m_scrubber->scrub_clear_state();

  for (auto p = waiting_for_blocked_object.begin();
       p != waiting_for_blocked_object.end();
       waiting_for_blocked_object.erase(p++)) {
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
  }
  for (auto i = callbacks_for_degraded_object.begin();
       i != callbacks_for_degraded_object.end();
    ) {
    finish_degraded_object((i++)->first);
  }
  ceph_assert(callbacks_for_degraded_object.empty());

  if (is_primary()) {
    requeue_ops(waiting_for_cache_not_full);
  } else {
    waiting_for_cache_not_full.clear();
  }
  objects_blocked_on_cache_full.clear();

  // close out async reads that were in flight; only a primary may
  // requeue the originating op
  for (list<pair<OpRequestRef, OpContext*> >::iterator i =
         in_progress_async_reads.begin();
       i != in_progress_async_reads.end();
       in_progress_async_reads.erase(i++)) {
    close_op_ctx(i->second);
    if (is_primary())
      requeue_op(i->first);
  }

  // this will requeue ops we were working on but didn't finish, and
  // any dups
  apply_and_flush_repops(is_primary());
  cancel_log_updates();

  // do this *after* apply_and_flush_repops so that we catch any newly
  // registered watches.
  context_registry_on_change();

  pgbackend->on_change_cleanup(&t);
  m_scrubber->cleanup_store(&t);
  pgbackend->on_change();

  // clear snap_trimmer state
  snap_trimmer_machine.process_event(Reset());

  debug_op_order.clear();
  unstable_stats.clear();

  // we don't want to cache object_contexts through the interval change
  // NOTE: we actually assert that all currently live references are dead
  // by the time the flush for the next interval completes.
  object_contexts.clear();

  // should have been cleared above by finishing all of the degraded objects
  ceph_assert(objects_blocked_on_degraded_snap.empty());
}
13114
9f95a23c 13115void PrimaryLogPG::plpg_on_role_change()
7c673cae 13116{
11fdf7f2 13117 dout(10) << __func__ << dendl;
7c673cae
FG
13118 if (get_role() != 0 && hit_set) {
13119 dout(10) << " clearing hit set" << dendl;
13120 hit_set_clear();
13121 }
13122}
13123
9f95a23c 13124void PrimaryLogPG::plpg_on_pool_change()
7c673cae
FG
13125{
13126 dout(10) << __func__ << dendl;
13127 // requeue cache full waiters just in case the cache_mode is
13128 // changing away from writeback mode. note that if we are not
13129 // active the normal requeuing machinery is sufficient (and properly
13130 // ordered).
13131 if (is_active() &&
13132 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13133 !waiting_for_cache_not_full.empty()) {
13134 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
13135 << dendl;
13136 requeue_ops(waiting_for_cache_not_full);
13137 objects_blocked_on_cache_full.clear();
13138 }
13139 hit_set_setup();
13140 agent_setup();
13141}
13142
13143// clear state. called on recovery completion AND cancellation.
13144void PrimaryLogPG::_clear_recovery_state()
13145{
7c673cae
FG
13146#ifdef DEBUG_RECOVERY_OIDS
13147 recovering_oids.clear();
13148#endif
f67539c2
TL
13149 dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
13150
7c673cae
FG
13151 last_backfill_started = hobject_t();
13152 set<hobject_t>::iterator i = backfills_in_flight.begin();
13153 while (i != backfills_in_flight.end()) {
7c673cae
FG
13154 backfills_in_flight.erase(i++);
13155 }
13156
13157 list<OpRequestRef> blocked_ops;
13158 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
13159 i != recovering.end();
13160 recovering.erase(i++)) {
13161 if (i->second) {
13162 i->second->drop_recovery_read(&blocked_ops);
13163 requeue_ops(blocked_ops);
13164 }
13165 }
11fdf7f2 13166 ceph_assert(backfills_in_flight.empty());
7c673cae 13167 pending_backfill_updates.clear();
11fdf7f2 13168 ceph_assert(recovering.empty());
7c673cae
FG
13169 pgbackend->clear_recovery_state();
13170}
13171
13172void PrimaryLogPG::cancel_pull(const hobject_t &soid)
13173{
13174 dout(20) << __func__ << ": " << soid << dendl;
11fdf7f2 13175 ceph_assert(recovering.count(soid));
7c673cae
FG
13176 ObjectContextRef obc = recovering[soid];
13177 if (obc) {
13178 list<OpRequestRef> blocked_ops;
13179 obc->drop_recovery_read(&blocked_ops);
13180 requeue_ops(blocked_ops);
13181 }
13182 recovering.erase(soid);
13183 finish_recovery_op(soid);
13184 release_backoffs(soid);
13185 if (waiting_for_degraded_object.count(soid)) {
13186 dout(20) << " kicking degraded waiters on " << soid << dendl;
13187 requeue_ops(waiting_for_degraded_object[soid]);
13188 waiting_for_degraded_object.erase(soid);
13189 }
13190 if (waiting_for_unreadable_object.count(soid)) {
13191 dout(20) << " kicking unreadable waiters on " << soid << dendl;
13192 requeue_ops(waiting_for_unreadable_object[soid]);
13193 waiting_for_unreadable_object.erase(soid);
13194 }
13195 if (is_missing_object(soid))
9f95a23c 13196 recovery_state.set_last_requested(0);
7c673cae
FG
13197 finish_degraded_object(soid);
13198}
13199
/**
 * React to an osdmap change that may have invalidated recovery sources.
 * All of the bookkeeping lives in the PG backend; this is a pure
 * delegation.
 */
void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
{
  pgbackend->check_recovery_sources(osdmap);
}
13204
7c673cae
FG
/**
 * Drive one batch of recovery/backfill work for this (primary) PG.
 *
 * @param max         maximum number of recovery ops to start this round
 * @param handle      thread-pool handle (for heartbeat/timeout resets)
 * @param ops_started out: number of ops actually started
 * @return true when no further progress is possible right now but
 *         unfound objects remain (see the return paths below); false
 *         when recovery/backfill is finished or progress was made
 *
 * Order of attempts: replicas first when the primary itself is clean
 * (or all its missing objects are unfound), then the primary's own
 * missing set, then a second replica pass if the unfound count changed.
 * Backfill is only attempted once nothing else is in flight, and may be
 * deferred by the NOBACKFILL/NOREBALANCE osdmap flags or a missing
 * backfill reservation.
 */
bool PrimaryLogPG::start_recovery_ops(
  uint64_t max,
  ThreadPool::TPHandle &handle,
  uint64_t *ops_started)
{
  uint64_t& started = *ops_started;
  started = 0;
  bool work_in_progress = false;
  bool recovery_started = false;
  ceph_assert(is_primary());
  ceph_assert(is_peered());
  ceph_assert(!recovery_state.is_deleting());

  ceph_assert(recovery_queued);
  recovery_queued = false;

  if (!state_test(PG_STATE_RECOVERING) &&
      !state_test(PG_STATE_BACKFILLING)) {
    /* TODO: I think this case is broken and will make do_recovery()
     * unhappy since we're returning false */
    dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
    return have_unfound();
  }

  const auto &missing = recovery_state.get_pg_log().get_missing();

  // snapshot the unfound count so we can detect progress further down
  uint64_t num_unfound = get_num_unfound();

  if (!recovery_state.have_missing()) {
    recovery_state.local_recovery_complete();
  }

  if (!missing.have_missing() || // Primary does not have missing
      // or all of the missing objects are unfound.
      recovery_state.all_missing_unfound()) {
    // Recover the replicas.
    started = recover_replicas(max, handle, &recovery_started);
  }
  if (!started) {
    // We still have missing objects that we should grab from replicas.
    started += recover_primary(max, handle);
  }
  if (!started && num_unfound != get_num_unfound()) {
    // second chance to recovery replicas
    started = recover_replicas(max, handle, &recovery_started);
  }

  if (started || recovery_started)
    work_in_progress = true;

  bool deferred_backfill = false;
  // backfill only runs once recovery proper is idle and budget remains
  if (recovering.empty() &&
      state_test(PG_STATE_BACKFILLING) &&
      !get_backfill_targets().empty() && started < max &&
      missing.num_missing() == 0 &&
      waiting_on_backfill.empty()) {
    if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
      dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
      deferred_backfill = true;
    } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
	       !is_degraded()) {
      dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
      deferred_backfill = true;
    } else if (!recovery_state.is_backfill_reserved()) {
      /* DNMNOTE I think this branch is dead */
      dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
      if (!backfill_reserving) {
	dout(10) << "queueing RequestBackfill" << dendl;
	backfill_reserving = true;
	queue_peering_event(
	  PGPeeringEventRef(
	    std::make_shared<PGPeeringEvent>(
	      get_osdmap_epoch(),
	      get_osdmap_epoch(),
	      PeeringState::RequestBackfill())));
      }
      deferred_backfill = true;
    } else {
      started += recover_backfill(max - started, handle, &work_in_progress);
    }
  }

  dout(10) << " started " << started << dendl;
  osd->logger->inc(l_osd_rop, started);

  // something is still pending — report unfound status only when we made
  // no forward progress this round
  if (!recovering.empty() ||
      work_in_progress || recovery_ops_active > 0 || deferred_backfill)
    return !work_in_progress && have_unfound();

  ceph_assert(recovering.empty());
  ceph_assert(recovery_ops_active == 0);

  dout(10) << __func__ << " needs_recovery: "
	   << recovery_state.get_missing_loc().get_needs_recovery()
	   << dendl;
  dout(10) << __func__ << " missing_loc: "
	   << recovery_state.get_missing_loc().get_missing_locs()
	   << dendl;
  int unfound = get_num_unfound();
  if (unfound) {
    dout(10) << " still have " << unfound << " unfound" << dendl;
    return true;
  }

  if (missing.num_missing() > 0) {
    // this shouldn't happen!
    osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
		       << missing.num_missing() << ": " << missing.get_items();
    return false;
  }

  if (needs_recovery()) {
    // this shouldn't happen!
    // We already checked num_missing() so we must have missing replicas
    osd->clog->error() << info.pgid
		       << " Unexpected Error: recovery ending with missing replicas";
    return false;
  }

  // recovery (or backfill) is fully complete: clear the state flags and
  // notify the peering state machine of the appropriate transition
  if (state_test(PG_STATE_RECOVERING)) {
    state_clear(PG_STATE_RECOVERING);
    state_clear(PG_STATE_FORCED_RECOVERY);
    if (needs_backfill()) {
      dout(10) << "recovery done, queuing backfill" << dendl;
      queue_peering_event(
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
	    PeeringState::RequestBackfill())));
    } else {
      dout(10) << "recovery done, no backfill" << dendl;
      state_clear(PG_STATE_FORCED_BACKFILL);
      queue_peering_event(
	PGPeeringEventRef(
	  std::make_shared<PGPeeringEvent>(
	    get_osdmap_epoch(),
	    get_osdmap_epoch(),
	    PeeringState::AllReplicasRecovered())));
    }
  } else { // backfilling
    state_clear(PG_STATE_BACKFILLING);
    state_clear(PG_STATE_FORCED_BACKFILL);
    state_clear(PG_STATE_FORCED_RECOVERY);
    dout(10) << "recovery done, backfill done" << dendl;
    queue_peering_event(
      PGPeeringEventRef(
	std::make_shared<PGPeeringEvent>(
	  get_osdmap_epoch(),
	  get_osdmap_epoch(),
	  PeeringState::Backfilled())));
  }

  return false;
}
13360
/**
 * Pull missing objects onto the primary itself.
 *
 * Walks the primary's missing set (ordered by version, resuming from
 * log.last_requested), handling LOST_REVERT log entries specially and
 * starting pulls for everything else via recover_missing().
 *
 * @param max    cap on the number of recovery ops to start
 * @param handle thread-pool handle (timeout resets inside the loop)
 * @return number of recovery ops started (NOT a bool — the old
 *         "return true if done" wording was stale)
 */
uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
{
  ceph_assert(is_primary());

  const auto &missing = recovery_state.get_pg_log().get_missing();

  dout(10) << __func__ << " recovering " << recovering.size()
	   << " in pg,"
	   << " missing " << missing << dendl;

  dout(25) << __func__ << " " << missing.get_items() << dendl;

  // look at log!
  pg_log_entry_t *latest = 0;
  unsigned started = 0;
  int skipped = 0;

  PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
  // resume from where the previous pass left off
  map<version_t, hobject_t>::const_iterator p =
    missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
  while (p != missing.get_rmissing().end()) {
    handle.reset_tp_timeout();
    hobject_t soid;
    version_t v = p->first;

    // prefer the log's view of the object (gives us the latest entry);
    // fall back to the missing-set oid when the log has no record
    auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
    if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
      latest = it_objects->second;
      ceph_assert(latest->is_update() || latest->is_delete());
      soid = latest->soid;
    } else {
      latest = 0;
      soid = p->second;
    }
    const pg_missing_item& item = missing.get_items().find(p->second)->second;
    ++p;

    hobject_t head = soid.get_head();

    eversion_t need = item.need;

    dout(10) << __func__ << " "
	     << soid << " " << item.need
	     << (missing.is_missing(soid) ? " (missing)":"")
	     << (missing.is_missing(head) ? " (missing head)":"")
	     << (recovering.count(soid) ? " (recovering)":"")
	     << (recovering.count(head) ? " (recovering head)":"")
	     << dendl;

    if (latest) {
      switch (latest->op) {
      case pg_log_entry_t::CLONE:
	/*
	 * Handling for this special case removed for now, until we
	 * can correctly construct an accurate SnapSet from the old
	 * one.
	 */
	break;

      case pg_log_entry_t::LOST_REVERT:
	{
	  if (item.have == latest->reverting_to) {
	    // we already hold the version being reverted to: the revert
	    // is purely a local metadata update (rewrite the OI attr)
	    ObjectContextRef obc = get_object_context(soid, true);

	    if (obc->obs.oi.version == latest->version) {
	      // I'm already reverting
	      dout(10) << " already reverting " << soid << dendl;
	    } else {
	      dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
	      obc->obs.oi.version = latest->version;

	      ObjectStore::Transaction t;
	      bufferlist b2;
	      obc->obs.oi.encode(
		b2,
		get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
	      ceph_assert(!pool.info.require_rollback());
	      t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);

	      recovery_state.recover_got(
		soid,
		latest->version,
		false,
		t);

	      ++active_pushes;

	      t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
	      t.register_on_commit(new C_OSD_CommittedPushedObject(
				     this,
				     get_osdmap_epoch(),
				     info.last_complete));
	      osd->store->queue_transaction(ch, std::move(t));
	      continue;
	    }
	  } else {
	    /*
	     * Pull the old version of the object.  Update missing_loc here to have the location
	     * of the version we want.
	     *
	     * This doesn't use the usual missing_loc paths, but that's okay:
	     *  - if we have it locally, we hit the case above, and go from there.
	     *  - if we don't, we always pass through this case during recovery and set up the location
	     *    properly.
	     *  - this way we don't need to mangle the missing code to be general about needing an old
	     *    version...
	     */
	    eversion_t alternate_need = latest->reverting_to;
	    dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;

	    set<pg_shard_t> good_peers;
	    for (auto p = recovery_state.get_peer_missing().begin();
		 p != recovery_state.get_peer_missing().end();
		 ++p) {
	      if (p->second.is_missing(soid, need) &&
		  p->second.get_items().at(soid).have == alternate_need) {
		good_peers.insert(p->first);
	      }
	    }
	    recovery_state.set_revert_with_targets(
	      soid,
	      good_peers);
	    dout(10) << " will pull " << alternate_need << " or " << need
		     << " from one of "
		     << recovery_state.get_missing_loc().get_locations(soid)
		     << dendl;
	  }
	}
	break;
      }
    }

    if (!recovering.count(soid)) {
      if (recovering.count(head)) {
	++skipped;
      } else {
	int r = recover_missing(
	  soid, need, recovery_state.get_recovery_op_priority(), h);
	switch (r) {
	case PULL_YES:
	  ++started;
	  break;
	case PULL_HEAD:
	  ++started;
	  // deliberate fall-through: a head pull also counts as a skip,
	  // which (below) keeps last_requested from advancing
	case PULL_NONE:
	  ++skipped;
	  break;
	default:
	  ceph_abort();
	}
	if (started >= max)
	  break;
      }
    }

    // only advance last_requested if we haven't skipped anything
    if (!skipped)
      recovery_state.set_last_requested(v);
  }

  pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority());
  return started;
}
13528
224ce89b
WB
13529bool PrimaryLogPG::primary_error(
13530 const hobject_t& soid, eversion_t v)
13531{
9f95a23c
TL
13532 recovery_state.force_object_missing(pg_whoami, soid, v);
13533 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
224ce89b 13534 if (uhoh)
9f95a23c
TL
13535 osd->clog->error() << info.pgid << " missing primary copy of "
13536 << soid << ", unfound";
224ce89b 13537 else
9f95a23c
TL
13538 osd->clog->error() << info.pgid << " missing primary copy of "
13539 << soid
13540 << ", will try copies on "
13541 << recovery_state.get_missing_loc().get_locations(soid);
224ce89b
WB
13542 return uhoh;
13543}
13544
c07f9fc5
FG
/**
 * Queue deletion of @p soid on replicas that still hold it.
 *
 * @param soid         object to delete on replicas
 * @param v            version associated with the delete
 * @param h            recovery handle the delete is batched into
 * @param work_started out: set to true when we made progress by taking
 *                     (or failing to take) the recovery read lock
 * @return number of ops started (0 when delayed on the lock, else 1)
 */
int PrimaryLogPG::prep_object_replica_deletes(
  const hobject_t& soid, eversion_t v,
  PGBackend::RecoveryHandle *h,
  bool *work_started)
{
  ceph_assert(is_primary());
  dout(10) << __func__ << ": on " << soid << dendl;

  // the object may no longer exist locally; only lock it if we have a
  // context for it
  ObjectContextRef obc = get_object_context(soid, false);
  if (obc) {
    if (!obc->get_recovery_read()) {
      dout(20) << "replica delete delayed on " << soid
	       << "; could not get rw_manager lock" << dendl;
      *work_started = true;
      return 0;
    } else {
      dout(20) << "replica delete got recovery read lock on " << soid
	       << dendl;
    }
  }

  start_recovery_op(soid);
  ceph_assert(!recovering.count(soid));
  // record the op in `recovering`, with a null context when we had none
  if (!obc)
    recovering.insert(make_pair(soid, ObjectContextRef()));
  else
    recovering.insert(make_pair(soid, obc));

  pgbackend->recover_delete_object(soid, v, h);
  return 1;
}
13576
7c673cae
FG
/**
 * Queue a push of @p soid (at version @p v) to replicas missing it.
 *
 * For a clone, the head must be recovered first; if the head is still
 * missing on the primary we pull it instead and try the clone later.
 *
 * @param soid         object to push
 * @param v            version to push
 * @param h            recovery handle the push is batched into
 * @param work_started out: set to true when delayed on the recovery
 *                     read lock (progress was still made)
 * @return number of ops started (0 or 1)
 */
int PrimaryLogPG::prep_object_replica_pushes(
  const hobject_t& soid, eversion_t v,
  PGBackend::RecoveryHandle *h,
  bool *work_started)
{
  ceph_assert(is_primary());
  dout(10) << __func__ << ": on " << soid << dendl;

  if (soid.snap && soid.snap < CEPH_NOSNAP) {
    // do we have the head and/or snapdir?
    hobject_t head = soid.get_head();
    if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
      if (recovering.count(head)) {
	dout(10) << " missing but already recovering head " << head << dendl;
	return 0;
      } else {
	// pull the head first; the clone will be retried on a later pass
	int r = recover_missing(
	  head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
	  recovery_state.get_recovery_op_priority(), h);
	if (r != PULL_NONE)
	  return 1;
	return 0;
      }
    }
  }

  // NOTE: we know we will get a valid oloc off of disk here.
  ObjectContextRef obc = get_object_context(soid, false);
  if (!obc) {
    // local copy is unreadable: record the primary error and bail
    primary_error(soid, v);
    return 0;
  }

  if (!obc->get_recovery_read()) {
    dout(20) << "recovery delayed on " << soid
	     << "; could not get rw_manager lock" << dendl;
    *work_started = true;
    return 0;
  } else {
    dout(20) << "recovery got recovery read lock on " << soid
	     << dendl;
  }

  start_recovery_op(soid);
  ceph_assert(!recovering.count(soid));
  recovering.insert(make_pair(soid, obc));

  int r = pgbackend->recover_object(
    soid,
    v,
    ObjectContextRef(),
    obc, // has snapset context
    h);
  if (r < 0) {
    dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
    on_failed_pull({ pg_whoami }, soid, v);
    return 0;
  }
  return 1;
}
13637
11fdf7f2
TL
/**
 * Push missing objects out to replica/async-recovery targets.
 *
 * Peers are processed acting-set first, each group ordered by ascending
 * missing-count so the least-damaged replica returns to normal soonest;
 * within a peer, objects are recovered oldest-version first.
 *
 * @param max          cap on the number of recovery ops to start
 * @param handle       thread-pool handle (timeout resets in the loop)
 * @param work_started out: set by the prep_* helpers when work happened
 *                     even though no op was counted (e.g. lock delays)
 * @return number of recovery ops started
 */
uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
					bool *work_started)
{
  dout(10) << __func__ << "(" << max << ")" << dendl;
  uint64_t started = 0;

  PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();

  // this is FAR from an optimal recovery order.  pretty lame, really.
  ceph_assert(!get_acting_recovery_backfill().empty());
  // choose replicas to recover, replica has the shortest missing list first
  // so we can bring it back to normal ASAP
  std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
    async_by_num_missing;
  replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
  for (auto &p: get_acting_recovery_backfill()) {
    if (p == get_primary()) {
      continue;
    }
    auto pm = recovery_state.get_peer_missing().find(p);
    ceph_assert(pm != recovery_state.get_peer_missing().end());
    auto nm = pm->second.num_missing();
    if (nm != 0) {
      // partition peers: async recovery targets are handled after the
      // regular acting-set replicas
      if (is_async_recovery_target(p)) {
	async_by_num_missing.push_back(make_pair(nm, p));
      } else {
	replicas_by_num_missing.push_back(make_pair(nm, p));
      }
    }
  }
  // sort by number of missing objects, in ascending order.
  auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
		 const std::pair<unsigned int, pg_shard_t> &rhs) {
    return lhs.first < rhs.first;
  };
  // acting goes first
  std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
  // then async_recovery_targets
  std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
  replicas_by_num_missing.insert(replicas_by_num_missing.end(),
    async_by_num_missing.begin(), async_by_num_missing.end());
  for (auto &replica: replicas_by_num_missing) {
    pg_shard_t &peer = replica.second;
    ceph_assert(peer != get_primary());
    auto pm = recovery_state.get_peer_missing().find(peer);
    ceph_assert(pm != recovery_state.get_peer_missing().end());
    size_t m_sz = pm->second.num_missing();

    dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
    dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;

    // oldest first!
    const pg_missing_t &m(pm->second);
    for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
	 p != m.get_rmissing().end() && started < max;
	 ++p) {
      handle.reset_tp_timeout();
      const hobject_t soid(p->second);

      if (recovery_state.get_missing_loc().is_unfound(soid)) {
	dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
	continue;
      }

      // objects past the peer's last_backfill belong to backfill, not
      // log-based recovery; they must already be tracked in `recovering`
      const pg_info_t &pi = recovery_state.get_peer_info(peer);
      if (soid > pi.last_backfill) {
	if (!recovering.count(soid)) {
	  derr << __func__ << ": object " << soid << " last_backfill "
	       << pi.last_backfill << dendl;
	  derr << __func__ << ": object added to missing set for backfill, but "
	       << "is not in recovering, error!" << dendl;
	  ceph_abort();
	}
	continue;
      }

      if (recovering.count(soid)) {
	dout(10) << __func__ << ": already recovering " << soid << dendl;
	continue;
      }

      if (recovery_state.get_missing_loc().is_deleted(soid)) {
	dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
	map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
	started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
	continue;
      }

      // a clone cannot be pushed while its head is missing on the primary
      if (soid.is_snap() &&
	  recovery_state.get_pg_log().get_missing().is_missing(
	    soid.get_head())) {
	dout(10) << __func__ << ": " << soid.get_head()
		 << " still missing on primary" << dendl;
	continue;
      }

      if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
	dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
	continue;
      }

      dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
      map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
      started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
    }
  }

  pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority());
  return started;
}
13748
13749hobject_t PrimaryLogPG::earliest_peer_backfill() const
13750{
13751 hobject_t e = hobject_t::get_max();
9f95a23c
TL
13752 for (const pg_shard_t& peer : get_backfill_targets()) {
13753 const auto iter = peer_backfill_info.find(peer);
11fdf7f2 13754 ceph_assert(iter != peer_backfill_info.end());
9f95a23c 13755 e = std::min(e, iter->second.begin);
7c673cae
FG
13756 }
13757 return e;
13758}
13759
13760bool PrimaryLogPG::all_peer_done() const
13761{
13762 // Primary hasn't got any more objects
11fdf7f2 13763 ceph_assert(backfill_info.empty());
7c673cae 13764
9f95a23c
TL
13765 for (const pg_shard_t& bt : get_backfill_targets()) {
13766 const auto piter = peer_backfill_info.find(bt);
11fdf7f2 13767 ceph_assert(piter != peer_backfill_info.end());
7c673cae
FG
13768 const BackfillInterval& pbi = piter->second;
13769 // See if peer has more to process
13770 if (!pbi.extends_to_end() || !pbi.empty())
13771 return false;
13772 }
13773 return true;
13774}
13775
13776/**
13777 * recover_backfill
13778 *
13779 * Invariants:
13780 *
13781 * backfilled: fully pushed to replica or present in replica's missing set (both
13782 * our copy and theirs).
13783 *
13784 * All objects on a backfill_target in
13785 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13786 * objects have been actually deleted and all logically-valid objects are replicated.
13787 * There may be PG objects in this interval yet to be backfilled.
13788 *
13789 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13790 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13791 *
11fdf7f2 13792 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
7c673cae
FG
13793 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13794 * interval remain on the backfill target.
13795 *
13796 * For a backfill target, all objects <= peer_info[target].last_backfill
13797 * have been backfilled to target
13798 *
13799 * There *MAY* be missing/outdated objects between last_backfill_started and
11fdf7f2 13800 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
7c673cae
FG
13801 * io created objects since the last scan. For this reason, we call
13802 * update_range() again before continuing backfill.
13803 */
13804uint64_t PrimaryLogPG::recover_backfill(
13805 uint64_t max,
13806 ThreadPool::TPHandle &handle, bool *work_started)
13807{
11fdf7f2 13808 dout(10) << __func__ << " (" << max << ")"
9f95a23c 13809 << " bft=" << get_backfill_targets()
7c673cae
FG
13810 << " last_backfill_started " << last_backfill_started
13811 << (new_backfill ? " new_backfill":"")
13812 << dendl;
9f95a23c 13813 ceph_assert(!get_backfill_targets().empty());
7c673cae
FG
13814
13815 // Initialize from prior backfill state
13816 if (new_backfill) {
13817 // on_activate() was called prior to getting here
f67539c2 13818 ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
7c673cae
FG
13819 new_backfill = false;
13820
13821 // initialize BackfillIntervals
9f95a23c
TL
13822 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13823 i != get_backfill_targets().end();
7c673cae 13824 ++i) {
9f95a23c
TL
13825 peer_backfill_info[*i].reset(
13826 recovery_state.get_peer_info(*i).last_backfill);
7c673cae
FG
13827 }
13828 backfill_info.reset(last_backfill_started);
13829
13830 backfills_in_flight.clear();
13831 pending_backfill_updates.clear();
13832 }
13833
9f95a23c
TL
13834 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13835 i != get_backfill_targets().end();
7c673cae
FG
13836 ++i) {
13837 dout(10) << "peer osd." << *i
9f95a23c 13838 << " info " << recovery_state.get_peer_info(*i)
7c673cae
FG
13839 << " interval " << peer_backfill_info[*i].begin
13840 << "-" << peer_backfill_info[*i].end
13841 << " " << peer_backfill_info[*i].objects.size() << " objects"
13842 << dendl;
13843 }
13844
13845 // update our local interval to cope with recent changes
13846 backfill_info.begin = last_backfill_started;
13847 update_range(&backfill_info, handle);
13848
13849 unsigned ops = 0;
7c673cae
FG
13850 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13851 set<hobject_t> add_to_stat;
13852
9f95a23c
TL
13853 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13854 i != get_backfill_targets().end();
7c673cae
FG
13855 ++i) {
13856 peer_backfill_info[*i].trim_to(
9f95a23c
TL
13857 std::max(
13858 recovery_state.get_peer_info(*i).last_backfill,
13859 last_backfill_started));
7c673cae
FG
13860 }
13861 backfill_info.trim_to(last_backfill_started);
13862
224ce89b 13863 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
7c673cae
FG
13864 while (ops < max) {
13865 if (backfill_info.begin <= earliest_peer_backfill() &&
13866 !backfill_info.extends_to_end() && backfill_info.empty()) {
13867 hobject_t next = backfill_info.end;
13868 backfill_info.reset(next);
13869 backfill_info.end = hobject_t::get_max();
13870 update_range(&backfill_info, handle);
13871 backfill_info.trim();
13872 }
13873
13874 dout(20) << " my backfill interval " << backfill_info << dendl;
13875
13876 bool sent_scan = false;
9f95a23c
TL
13877 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13878 i != get_backfill_targets().end();
7c673cae
FG
13879 ++i) {
13880 pg_shard_t bt = *i;
13881 BackfillInterval& pbi = peer_backfill_info[bt];
13882
13883 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13884 if (pbi.begin <= backfill_info.begin &&
13885 !pbi.extends_to_end() && pbi.empty()) {
13886 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11fdf7f2 13887 epoch_t e = get_osdmap_epoch();
7c673cae 13888 MOSDPGScan *m = new MOSDPGScan(
9f95a23c 13889 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
7c673cae
FG
13890 spg_t(info.pgid.pgid, bt.shard),
13891 pbi.end, hobject_t());
1e59de90
TL
13892
13893 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
13894 /* This guard preserves legacy WeightedPriorityQueue behavior for
13895 * now, but should be removed after Reef */
13896 m->set_priority(recovery_state.get_recovery_op_priority());
13897 }
11fdf7f2
TL
13898 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13899 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
7c673cae
FG
13900 waiting_on_backfill.insert(bt);
13901 sent_scan = true;
13902 }
13903 }
13904
13905 // Count simultaneous scans as a single op and let those complete
13906 if (sent_scan) {
13907 ops++;
13908 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13909 break;
13910 }
13911
13912 if (backfill_info.empty() && all_peer_done()) {
13913 dout(10) << " reached end for both local and all peers" << dendl;
13914 break;
13915 }
13916
13917 // Get object within set of peers to operate on and
13918 // the set of targets for which that object applies.
13919 hobject_t check = earliest_peer_backfill();
13920
13921 if (check < backfill_info.begin) {
13922
13923 set<pg_shard_t> check_targets;
9f95a23c
TL
13924 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13925 i != get_backfill_targets().end();
7c673cae
FG
13926 ++i) {
13927 pg_shard_t bt = *i;
13928 BackfillInterval& pbi = peer_backfill_info[bt];
13929 if (pbi.begin == check)
13930 check_targets.insert(bt);
13931 }
11fdf7f2 13932 ceph_assert(!check_targets.empty());
7c673cae
FG
13933
13934 dout(20) << " BACKFILL removing " << check
13935 << " from peers " << check_targets << dendl;
13936 for (set<pg_shard_t>::iterator i = check_targets.begin();
13937 i != check_targets.end();
13938 ++i) {
13939 pg_shard_t bt = *i;
13940 BackfillInterval& pbi = peer_backfill_info[bt];
11fdf7f2 13941 ceph_assert(pbi.begin == check);
7c673cae
FG
13942
13943 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13944 pbi.pop_front();
13945 }
13946
11fdf7f2 13947 last_backfill_started = check;
7c673cae
FG
13948
13949 // Don't increment ops here because deletions
13950 // are cheap and not replied to unlike real recovery_ops,
13951 // and we can't increment ops without requeueing ourself
13952 // for recovery.
13953 } else {
13954 eversion_t& obj_v = backfill_info.objects.begin()->second;
13955
13956 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
9f95a23c
TL
13957 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13958 i != get_backfill_targets().end();
7c673cae
FG
13959 ++i) {
13960 pg_shard_t bt = *i;
13961 BackfillInterval& pbi = peer_backfill_info[bt];
13962 // Find all check peers that have the wrong version
13963 if (check == backfill_info.begin && check == pbi.begin) {
13964 if (pbi.objects.begin()->second != obj_v) {
13965 need_ver_targs.push_back(bt);
13966 } else {
13967 keep_ver_targs.push_back(bt);
13968 }
13969 } else {
9f95a23c 13970 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
7c673cae
FG
13971
13972 // Only include peers that we've caught up to their backfill line
13973 // otherwise, they only appear to be missing this object
13974 // because their pbi.begin > backfill_info.begin.
13975 if (backfill_info.begin > pinfo.last_backfill)
13976 missing_targs.push_back(bt);
13977 else
13978 skip_targs.push_back(bt);
13979 }
13980 }
13981
13982 if (!keep_ver_targs.empty()) {
13983 // These peers have version obj_v
13984 dout(20) << " BACKFILL keeping " << check
13985 << " with ver " << obj_v
13986 << " on peers " << keep_ver_targs << dendl;
13987 //assert(!waiting_for_degraded_object.count(check));
13988 }
13989 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13990 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
11fdf7f2 13991 ceph_assert(obc);
7c673cae
FG
13992 if (obc->get_recovery_read()) {
13993 if (!need_ver_targs.empty()) {
13994 dout(20) << " BACKFILL replacing " << check
13995 << " with ver " << obj_v
13996 << " to peers " << need_ver_targs << dendl;
13997 }
13998 if (!missing_targs.empty()) {
13999 dout(20) << " BACKFILL pushing " << backfill_info.begin
14000 << " with ver " << obj_v
14001 << " to peers " << missing_targs << dendl;
14002 }
14003 vector<pg_shard_t> all_push = need_ver_targs;
14004 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
14005
224ce89b
WB
14006 handle.reset_tp_timeout();
14007 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
14008 if (r < 0) {
14009 *work_started = true;
14010 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
14011 break;
14012 }
7c673cae
FG
14013 ops++;
14014 } else {
14015 *work_started = true;
14016 dout(20) << "backfill blocking on " << backfill_info.begin
14017 << "; could not get rw_manager lock" << dendl;
14018 break;
14019 }
14020 }
14021 dout(20) << "need_ver_targs=" << need_ver_targs
14022 << " keep_ver_targs=" << keep_ver_targs << dendl;
9f95a23c 14023 dout(20) << "backfill_targets=" << get_backfill_targets()
7c673cae
FG
14024 << " missing_targs=" << missing_targs
14025 << " skip_targs=" << skip_targs << dendl;
14026
14027 last_backfill_started = backfill_info.begin;
14028 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
14029 backfill_info.pop_front();
14030 vector<pg_shard_t> check_targets = need_ver_targs;
14031 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
14032 for (vector<pg_shard_t>::iterator i = check_targets.begin();
14033 i != check_targets.end();
14034 ++i) {
14035 pg_shard_t bt = *i;
14036 BackfillInterval& pbi = peer_backfill_info[bt];
14037 pbi.pop_front();
14038 }
14039 }
14040 }
14041
7c673cae
FG
14042 for (set<hobject_t>::iterator i = add_to_stat.begin();
14043 i != add_to_stat.end();
14044 ++i) {
14045 ObjectContextRef obc = get_object_context(*i, false);
11fdf7f2 14046 ceph_assert(obc);
7c673cae
FG
14047 pg_stat_t stat;
14048 add_object_context_to_pg_stat(obc, &stat);
14049 pending_backfill_updates[*i] = stat;
14050 }
11fdf7f2
TL
14051 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
14052 for (unsigned i = 0; i < to_remove.size(); ++i) {
14053 handle.reset_tp_timeout();
14054 const hobject_t& oid = to_remove[i].get<0>();
14055 eversion_t v = to_remove[i].get<1>();
14056 pg_shard_t peer = to_remove[i].get<2>();
14057 MOSDPGBackfillRemove *m;
14058 auto it = reqs.find(peer);
14059 if (it != reqs.end()) {
14060 m = it->second;
14061 } else {
14062 m = reqs[peer] = new MOSDPGBackfillRemove(
14063 spg_t(info.pgid.pgid, peer.shard),
14064 get_osdmap_epoch());
1e59de90
TL
14065 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
14066 /* This guard preserves legacy WeightedPriorityQueue behavior for
14067 * now, but should be removed after Reef */
14068 m->set_priority(recovery_state.get_recovery_op_priority());
14069 }
7c673cae 14070 }
11fdf7f2 14071 m->ls.push_back(make_pair(oid, v));
7c673cae 14072
11fdf7f2
TL
14073 if (oid <= last_backfill_started)
14074 pending_backfill_updates[oid]; // add empty stat!
14075 }
14076 for (auto p : reqs) {
14077 osd->send_message_osd_cluster(p.first.osd, p.second,
14078 get_osdmap_epoch());
7c673cae
FG
14079 }
14080
1e59de90 14081 pgbackend->run_recovery_op(h, recovery_state.get_recovery_op_priority());
7c673cae 14082
f67539c2
TL
14083 hobject_t backfill_pos =
14084 std::min(backfill_info.begin, earliest_peer_backfill());
7c673cae
FG
14085 dout(5) << "backfill_pos is " << backfill_pos << dendl;
14086 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
14087 i != backfills_in_flight.end();
14088 ++i) {
14089 dout(20) << *i << " is still in flight" << dendl;
14090 }
14091
14092 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
14093 backfill_pos : *(backfills_in_flight.begin());
f67539c2 14094 hobject_t new_last_backfill = recovery_state.earliest_backfill();
7c673cae
FG
14095 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
14096 for (map<hobject_t, pg_stat_t>::iterator i =
14097 pending_backfill_updates.begin();
14098 i != pending_backfill_updates.end() &&
14099 i->first < next_backfill_to_complete;
14100 pending_backfill_updates.erase(i++)) {
14101 dout(20) << " pending_backfill_update " << i->first << dendl;
11fdf7f2 14102 ceph_assert(i->first > new_last_backfill);
f67539c2
TL
14103 // carried from a previous round – if we are here, then we had to
14104 // be requeued (by e.g. on_global_recover()) and those operations
14105 // are done.
9f95a23c
TL
14106 recovery_state.update_complete_backfill_object_stats(
14107 i->first,
14108 i->second);
7c673cae
FG
14109 new_last_backfill = i->first;
14110 }
14111 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
14112
11fdf7f2 14113 ceph_assert(!pending_backfill_updates.empty() ||
7c673cae
FG
14114 new_last_backfill == last_backfill_started);
14115 if (pending_backfill_updates.empty() &&
14116 backfill_pos.is_max()) {
11fdf7f2 14117 ceph_assert(backfills_in_flight.empty());
7c673cae
FG
14118 new_last_backfill = backfill_pos;
14119 last_backfill_started = backfill_pos;
14120 }
14121 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
14122
14123 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
14124 // all the backfill targets. Otherwise, we will move last_backfill up on
14125 // those targets need it and send OP_BACKFILL_PROGRESS to them.
9f95a23c
TL
14126 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
14127 i != get_backfill_targets().end();
7c673cae
FG
14128 ++i) {
14129 pg_shard_t bt = *i;
9f95a23c 14130 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
7c673cae
FG
14131
14132 if (new_last_backfill > pinfo.last_backfill) {
9f95a23c 14133 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
11fdf7f2 14134 epoch_t e = get_osdmap_epoch();
7c673cae
FG
14135 MOSDPGBackfill *m = NULL;
14136 if (pinfo.last_backfill.is_max()) {
14137 m = new MOSDPGBackfill(
14138 MOSDPGBackfill::OP_BACKFILL_FINISH,
14139 e,
9f95a23c 14140 get_last_peering_reset(),
7c673cae
FG
14141 spg_t(info.pgid.pgid, bt.shard));
14142 // Use default priority here, must match sub_op priority
7c673cae
FG
14143 start_recovery_op(hobject_t::get_max());
14144 } else {
14145 m = new MOSDPGBackfill(
14146 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
14147 e,
9f95a23c 14148 get_last_peering_reset(),
7c673cae
FG
14149 spg_t(info.pgid.pgid, bt.shard));
14150 // Use default priority here, must match sub_op priority
14151 }
14152 m->last_backfill = pinfo.last_backfill;
14153 m->stats = pinfo.stats;
1e59de90
TL
14154
14155 if (cct->_conf->osd_op_queue == "mclock_scheduler") {
14156 /* This guard preserves legacy WeightedPriorityQueue behavior for
14157 * now, but should be removed after Reef */
14158 m->set_priority(recovery_state.get_recovery_op_priority());
14159 }
14160
11fdf7f2 14161 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
7c673cae
FG
14162 dout(10) << " peer " << bt
14163 << " num_objects now " << pinfo.stats.stats.sum.num_objects
14164 << " / " << info.stats.stats.sum.num_objects << dendl;
14165 }
14166 }
14167
14168 if (ops)
14169 *work_started = true;
14170 return ops;
14171}
14172
224ce89b 14173int PrimaryLogPG::prep_backfill_object_push(
7c673cae
FG
14174 hobject_t oid, eversion_t v,
14175 ObjectContextRef obc,
14176 vector<pg_shard_t> peers,
14177 PGBackend::RecoveryHandle *h)
14178{
224ce89b 14179 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
11fdf7f2 14180 ceph_assert(!peers.empty());
7c673cae
FG
14181
14182 backfills_in_flight.insert(oid);
9f95a23c 14183 recovery_state.prepare_backfill_for_missing(oid, v, peers);
7c673cae 14184
11fdf7f2 14185 ceph_assert(!recovering.count(oid));
7c673cae
FG
14186
14187 start_recovery_op(oid);
14188 recovering.insert(make_pair(oid, obc));
14189
224ce89b 14190 int r = pgbackend->recover_object(
7c673cae
FG
14191 oid,
14192 v,
14193 ObjectContextRef(),
14194 obc,
14195 h);
224ce89b
WB
14196 if (r < 0) {
14197 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
9f95a23c 14198 on_failed_pull({ pg_whoami }, oid, v);
224ce89b
WB
14199 }
14200 return r;
7c673cae
FG
14201}
14202
/**
 * Bring a BackfillInterval up to date with recent writes.
 *
 * If the interval's version predates the log tail it must be rescanned
 * from disk; otherwise the pg log (and projected log) entries since
 * bi->version are replayed onto the interval's object map, and the
 * interval is stamped with projected_last_update.
 *
 * @param bi      interval to update (begin/end/objects/version mutated in place)
 * @param handle  thread-pool handle for heartbeat/timeout resets during scans
 */
void PrimaryLogPG::update_range(
  BackfillInterval *bi,
  ThreadPool::TPHandle &handle)
{
  int local_min = cct->_conf->osd_backfill_scan_min;
  int local_max = cct->_conf->osd_backfill_scan_max;

  // Interval is older than anything the log remembers: rescan the
  // range from the object store.
  if (bi->version < info.log_tail) {
    dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
             << dendl;
    bi->version = info.last_update;
    scan_range(local_min, local_max, bi, handle);
  }

  if (bi->version >= projected_last_update) {
    dout(10) << __func__<< ": bi is current " << dendl;
    ceph_assert(bi->version == projected_last_update);
  } else if (bi->version >= info.log_tail) {
    if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
      /* Because we don't move log_tail on split, the log might be
       * empty even if log_tail != last_update.  However, the only
       * way to get here with an empty log is if log_tail is actually
       * eversion_t(), because otherwise the entry which changed
       * last_update since the last scan would have to be present.
       */
      ceph_assert(bi->version == eversion_t());
      return;
    }

    dout(10) << __func__<< ": bi is old, (" << bi->version
             << ") can be updated with log to projected_last_update "
             << projected_last_update << dendl;

    // Apply a single log entry to the interval: updates overwrite the
    // recorded version, deletes drop the object. Entries outside
    // [bi->begin, bi->end) are ignored.
    auto func = [&](const pg_log_entry_t &e) {
      dout(10) << __func__ << ": updating from version " << e.version
               << dendl;
      const hobject_t &soid = e.soid;
      if (soid >= bi->begin &&
          soid < bi->end) {
        if (e.is_update()) {
          dout(10) << __func__ << ": " << e.soid << " updated to version "
                   << e.version << dendl;
          bi->objects.erase(e.soid);
          bi->objects.insert(
            make_pair(
              e.soid,
              e.version));
        } else if (e.is_delete()) {
          dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
          bi->objects.erase(e.soid);
        }
      }
    };
    dout(10) << "scanning pg log first" << dendl;
    recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
    dout(10) << "scanning projected log" << dendl;
    projected_log.scan_log_after(bi->version, func);
    bi->version = projected_last_update;
  } else {
    // scan_range() above set bi->version = last_update, so reaching
    // this branch means the invariant was violated.
    ceph_abort_msg("scan_range should have raised bi->version past log_tail");
  }
}
14265
/**
 * Scan a range of objects from the local store into a BackfillInterval.
 *
 * Lists between @p min and @p max objects starting at bi->begin,
 * records each object's version in bi->objects, and sets bi->end to
 * the next unscanned object. Versions come from the in-memory object
 * context when available, otherwise from the on-disk OI attribute.
 *
 * @param min     minimum number of objects to list
 * @param max     maximum number of objects to list
 * @param bi      interval to fill (objects cleared first; end updated)
 * @param handle  thread-pool handle, reset per object to avoid timeouts
 */
void PrimaryLogPG::scan_range(
  int min, int max, BackfillInterval *bi,
  ThreadPool::TPHandle &handle)
{
  ceph_assert(is_locked());
  dout(10) << "scan_range from " << bi->begin << dendl;
  bi->clear_objects();

  vector<hobject_t> ls;
  ls.reserve(max);
  int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
  ceph_assert(r >= 0);
  dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
  dout(20) << ls << dendl;

  for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
    handle.reset_tp_timeout();
    ObjectContextRef obc;
    // Only the primary keeps an object context cache worth consulting.
    if (is_primary())
      obc = object_contexts.lookup(*p);
    if (obc) {
      if (!obc->obs.exists) {
        /* If the object does not exist here, it must have been removed
         * between the collection_list_partial and here.  This can happen
         * for the first item in the range, which is usually last_backfill.
         */
        continue;
      }
      bi->objects[*p] = obc->obs.oi.version;
      dout(20) << "  " << *p << " " << obc->obs.oi.version << dendl;
    } else {
      // No cached context: read the object_info_t attribute directly.
      bufferlist bl;
      int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
      /* If the object does not exist here, it must have been removed
       * between the collection_list_partial and here.  This can happen
       * for the first item in the range, which is usually last_backfill.
       */
      if (r == -ENOENT)
        continue;

      ceph_assert(r >= 0);
      object_info_t oi(bl);
      bi->objects[*p] = oi.version;
      dout(20) << "  " << *p << " " << oi.version << dendl;
    }
  }
}
14313
14314
/** check_local
 *
 * Debug-only verification that objects the log says were deleted are
 * in fact absent from the local store.  Aborts the OSD on a stray
 * object.  Gated on osd_debug_verify_stray_on_activate; a no-op in
 * normal operation.
 */
void PrimaryLogPG::check_local()
{
  dout(10) << __func__ << dendl;

  ceph_assert(
    info.last_update >=
    recovery_state.get_pg_log().get_tail()); // otherwise we need some help!

  if (!cct->_conf->osd_debug_verify_stray_on_activate)
    return;

  // just scan the log.
  // Walk newest-to-oldest so `did` records only each object's most
  // recent log entry; older entries for the same object are skipped.
  set<hobject_t> did;
  for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
       p != recovery_state.get_pg_log().get_log().log.rend();
       ++p) {
    if (did.count(p->soid))
      continue;
    did.insert(p->soid);

    if (p->is_delete() && !is_missing_object(p->soid)) {
      dout(10) << " checking " << p->soid
               << " at " << p->version << dendl;
      struct stat st;
      int r = osd->store->stat(
        ch,
        ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
        &st);
      // The most recent entry was a delete and the object is not
      // missing, so a successful stat means it was never removed.
      if (r != -ENOENT) {
        derr << __func__ << " " << p->soid << " exists, but should have been "
             << "deleted" << dendl;
        ceph_abort_msg("erroneously present object");
      }
    } else {
      // ignore old(+missing) objects
    }
  }
}
14357
14358
14359
14360// ===========================
14361// hit sets
14362
14363hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
14364{
14365 ostringstream ss;
14366 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
14367 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
14368 info.pgid.ps(), info.pgid.pool(),
14369 cct->_conf->osd_hit_set_namespace);
14370 dout(20) << __func__ << " " << hoid << dendl;
14371 return hoid;
14372}
14373
/**
 * Build the hobject_t naming an archived hit set covering [start, end].
 *
 * The name embeds the pgid and both timestamps, rendered in GMT or
 * local time per @p using_gmt.  The legacy pre-octopus time format is
 * used deliberately so names match objects written by older OSDs.
 */
hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
                                                   utime_t end,
                                                   bool using_gmt)
{
  ostringstream ss;
  ss << "hit_set_" << info.pgid.pgid << "_archive_";
  if (using_gmt) {
    start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
    end.gmtime(ss, true /* legacy pre-octopus form */);
  } else {
    start.localtime(ss, true /* legacy pre-octopus form */) << "_";
    end.localtime(ss, true /* legacy pre-octopus form */);
  }
  hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
                 info.pgid.ps(), info.pgid.pool(),
                 cct->_conf->osd_hit_set_namespace);
  dout(20) << __func__ << " " << hoid << dendl;
  return hoid;
}
14393
14394void PrimaryLogPG::hit_set_clear()
14395{
14396 dout(20) << __func__ << dendl;
14397 hit_set.reset();
14398 hit_set_start_stamp = utime_t();
14399}
14400
14401void PrimaryLogPG::hit_set_setup()
14402{
14403 if (!is_active() ||
14404 !is_primary()) {
14405 hit_set_clear();
14406 return;
14407 }
14408
14409 if (is_active() && is_primary() &&
14410 (!pool.info.hit_set_count ||
14411 !pool.info.hit_set_period ||
14412 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
14413 hit_set_clear();
14414
14415 // only primary is allowed to remove all the hit set objects
14416 hit_set_remove_all();
14417 return;
14418 }
14419
14420 // FIXME: discard any previous data for now
14421 hit_set_create();
14422
14423 // include any writes we know about from the pg log. this doesn't
14424 // capture reads, but it is better than nothing!
14425 hit_set_apply_log();
14426}
14427
/**
 * Remove every persisted hit set object for this PG.
 *
 * Bails out without doing anything if any archived hit set object is
 * degraded/backfilling or blocked by scrub.  Removal is done by
 * submitting a trim-to-zero transaction anchored on the newest archive
 * object; the in-memory hit set history and agent copies are cleared
 * as well.
 */
void PrimaryLogPG::hit_set_remove_all()
{
  // If any archives are degraded we skip this
  for (auto p = info.hit_set.history.begin();
       p != info.hit_set.history.end();
       ++p) {
    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

    // Once we hit a degraded object just skip
    if (is_degraded_or_backfilling_object(aoid))
      return;
    if (m_scrubber->write_blocked_by_scrub(aoid))
      return;
  }

  if (!info.hit_set.history.empty()) {
    // Anchor the removal op on the most recent archive object.
    auto p = info.hit_set.history.rbegin();
    ceph_assert(p != info.hit_set.history.rend());
    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
    ceph_assert(!is_degraded_or_backfilling_object(oid));
    ObjectContextRef obc = get_object_context(oid, false);
    ceph_assert(obc);

    OpContextUPtr ctx = simple_opc_create(obc);
    ctx->at_version = get_next_version();
    ctx->updated_hset_history = info.hit_set;
    utime_t now = ceph_clock_now();
    ctx->mtime = now;
    // Trimming to max == 0 deletes every entry in the history.
    hit_set_trim(ctx, 0);
    simple_opc_submit(std::move(ctx));
  }

  // Reset the recorded history to empty.
  recovery_state.update_hset(pg_hit_set_history_t());
  if (agent_state) {
    agent_state->discard_hit_sets();
  }
}
14465
/**
 * Start a fresh in-memory hit set for the next interval.
 *
 * For bloom-type hit sets the pool parameters are adjusted per
 * interval: the false-positive rate is spread across the full period,
 * and when no target size is configured it is estimated from the
 * unique-insert rate of the previous set, clamped to the configured
 * min/max.  Records the interval start time in hit_set_start_stamp.
 */
void PrimaryLogPG::hit_set_create()
{
  utime_t now = ceph_clock_now();
  // make a copy of the params to modify
  HitSet::Params params(pool.info.hit_set_params);

  dout(20) << __func__ << " " << params << dendl;
  if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
    BloomHitSet::Params *p =
      static_cast<BloomHitSet::Params*>(params.impl.get());

    // convert false positive rate so it holds up across the full period
    p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
    if (p->get_fpp() <= 0.0)
      p->set_fpp(.01);  // fpp cannot be zero!

    // if we don't have specified size, estimate target size based on the
    // previous bin!
    if (p->target_size == 0 && hit_set) {
      utime_t dur = now - hit_set_start_stamp;
      unsigned unique = hit_set->approx_unique_insert_count();
      dout(20) << __func__ << " previous set had approx " << unique
               << " unique items over " << dur << " seconds" << dendl;
      // NOTE(review): assumes dur is nonzero here (a zero-length
      // previous interval would divide by zero) — verify with callers.
      p->target_size = (double)unique * (double)pool.info.hit_set_period
                     / (double)dur;
    }
    // Clamp the estimate to the configured bounds.
    if (p->target_size <
        static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
      p->target_size = cct->_conf->osd_hit_set_min_size;

    if (p->target_size
        > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
      p->target_size = cct->_conf->osd_hit_set_max_size;

    // Vary the hash seed per interval.
    p->seed = now.sec();

    dout(10) << __func__ << " target_size " << p->target_size
             << " fpp " << p->get_fpp() << dendl;
  }
  hit_set.reset(new HitSet(params));
  hit_set_start_stamp = now;
}
14508
/**
 * apply log entries to set
 *
 * this would only happen after peering, to at least capture writes
 * during an interval that was potentially lost.
 *
 * Inserts the objects of all pg log entries in the version range
 * (info.hit_set.current_last_update, info.last_update] into the
 * current in-memory hit set.
 *
 * @return true if any entries were applied, false if there is no hit
 *         set or nothing newer than the last recorded update.
 */
bool PrimaryLogPG::hit_set_apply_log()
{
  if (!hit_set)
    return false;

  eversion_t to = info.last_update;
  eversion_t from = info.hit_set.current_last_update;
  if (to <= from) {
    dout(20) << __func__ << " no update" << dendl;
    return false;
  }

  // NOTE(review): this prints `to .. last_update`, which are the same
  // value; `from .. to` was likely intended.
  dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
  // Walk the log newest-to-oldest: first skip entries newer than `to`,
  // then insert everything down to (but not including) `from`.
  list<pg_log_entry_t>::const_reverse_iterator p =
    recovery_state.get_pg_log().get_log().log.rbegin();
  while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
    ++p;
  while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
    hit_set->insert(p->soid);
    ++p;
  }

  return true;
}
14539
/**
 * Seal the current in-memory hit set and persist it as an archive
 * object, then start a fresh set.
 *
 * Backs off (returns without persisting) when any existing archive —
 * or the new archive object itself — is degraded/backfilling or
 * blocked by scrub, or when a backfill target has not progressed past
 * the hash position where hit_set_* objects sort.  On success, submits
 * a single transaction that creates the archive object, updates the
 * hit set history, and trims old archives down to the pool's
 * hit_set_count.
 */
void PrimaryLogPG::hit_set_persist()
{
  dout(10) << __func__ << dendl;
  bufferlist bl;
  unsigned max = pool.info.hit_set_count;

  utime_t now = ceph_clock_now();
  hobject_t oid;

  // If any archives are degraded we skip this persist request
  // account for the additional entry being added below
  for (auto p = info.hit_set.history.begin();
       p != info.hit_set.history.end();
       ++p) {
    hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

    // Once we hit a degraded object just skip further trim
    if (is_degraded_or_backfilling_object(aoid))
      return;
    if (m_scrubber->write_blocked_by_scrub(aoid))
      return;
  }

  // If backfill is in progress and we could possibly overlap with the
  // hit_set_* objects, back off.  Since these all have
  // hobject_t::hash set to pgid.ps(), and those sort first, we can
  // look just at that.  This is necessary because our transactions
  // may include a modify of the new hit_set *and* a delete of the
  // old one, and this may span the backfill boundary.
  for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
       p != get_backfill_targets().end();
       ++p) {
    const pg_info_t& pi = recovery_state.get_peer_info(*p);
    if (pi.last_backfill == hobject_t() ||
        pi.last_backfill.get_hash() == info.pgid.ps()) {
      dout(10) << __func__ << " backfill target osd." << *p
               << " last_backfill has not progressed past pgid ps"
               << dendl;
      return;
    }
  }


  // New history entry covering [start_stamp, now].
  pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
  new_hset.begin = hit_set_start_stamp;
  new_hset.end = now;
  oid = get_hit_set_archive_object(
    new_hset.begin,
    new_hset.end,
    new_hset.using_gmt);

  // If the current object is degraded we skip this persist request
  if (m_scrubber->write_blocked_by_scrub(oid))
    return;

  // Freeze and serialize the set; bl becomes the archive object data.
  hit_set->seal();
  encode(*hit_set, bl);
  dout(20) << __func__ << " archive " << oid << dendl;

  if (agent_state) {
    // Keep an in-memory copy for the tier agent, trimming to at most
    // hit_set_count - 1 older sets.
    agent_state->add_hit_set(new_hset.begin, hit_set);
    uint32_t size = agent_state->hit_set_map.size();
    if (size >= pool.info.hit_set_count) {
      size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
    }
    hit_set_in_memory_trim(size);
  }

  ObjectContextRef obc = get_object_context(oid, true);
  OpContextUPtr ctx = simple_opc_create(obc);

  ctx->at_version = get_next_version();
  ctx->updated_hset_history = info.hit_set;
  pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);

  updated_hit_set_hist.current_last_update = info.last_update;
  new_hset.version = ctx->at_version;

  updated_hit_set_hist.history.push_back(new_hset);
  // Start the next interval's in-memory set.
  hit_set_create();

  // fabricate an object_info_t and SnapSet
  obc->obs.oi.version = ctx->at_version;
  obc->obs.oi.mtime = now;
  obc->obs.oi.size = bl.length();
  obc->obs.exists = true;
  obc->obs.oi.set_data_digest(bl.crc32c(-1));

  ctx->new_obs = obc->obs;

  ctx->new_snapset = obc->ssc->snapset;

  // Account for the new archive object in the PG stats.
  ctx->delta_stats.num_objects++;
  ctx->delta_stats.num_objects_hit_set_archive++;

  ctx->delta_stats.num_bytes += bl.length();
  ctx->delta_stats.num_bytes_hit_set_archive += bl.length();

  bufferlist bss;
  encode(ctx->new_snapset, bss);
  bufferlist boi(sizeof(ctx->new_obs.oi));
  encode(ctx->new_obs.oi, boi,
         get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));

  ctx->op_t->create(oid);
  if (bl.length()) {
    ctx->op_t->write(oid, 0, bl.length(), bl, 0);
    write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
        0, bl.length());
    ctx->clean_regions.mark_data_region_dirty(0, bl.length());
  }
  map<string, bufferlist, std::less<>> attrs = {
    {OI_ATTR, std::move(boi)},
    {SS_ATTR, std::move(bss)}
  };
  setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
  ctx->log.push_back(
    pg_log_entry_t(
      pg_log_entry_t::MODIFY,
      oid,
      ctx->at_version,
      eversion_t(),
      0,
      osd_reqid_t(),
      ctx->mtime,
      0)
    );
  ctx->log.back().clean_regions = ctx->clean_regions;

  // Trim old archives (and their history entries) in the same op.
  hit_set_trim(ctx, max);

  simple_opc_submit(std::move(ctx));
}
14673
/**
 * Trim the hit set history (and its archive objects) down to @p max
 * entries, appending the deletes to an already-open op context.
 *
 * For each entry removed, a DELETE log entry and a transaction remove
 * are added to @p ctx, and PG stats are decremented accordingly.
 * Requires ctx->updated_hset_history to be set (the caller's working
 * copy of the history).
 */
void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
{
  ceph_assert(ctx->updated_hset_history);
  pg_hit_set_history_t &updated_hit_set_hist =
    *(ctx->updated_hset_history);
  // Remove oldest entries first until the history fits within max.
  for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
    list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
    ceph_assert(p != updated_hit_set_hist.history.end());
    hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);

    // Callers (hit_set_persist/hit_set_remove_all) already verified
    // none of the archive objects are degraded or backfilling.
    ceph_assert(!is_degraded_or_backfilling_object(oid));

    dout(20) << __func__ << " removing " << oid << dendl;
    ++ctx->at_version.version;
    ctx->log.push_back(
        pg_log_entry_t(pg_log_entry_t::DELETE,
                       oid,
                       ctx->at_version,
                       p->version,
                       0,
                       osd_reqid_t(),
                       ctx->mtime,
                       0));

    ctx->op_t->remove(oid);
    updated_hit_set_hist.history.pop_front();

    // Reverse the stats accounting done when the archive was persisted.
    ObjectContextRef obc = get_object_context(oid, false);
    ceph_assert(obc);
    --ctx->delta_stats.num_objects;
    --ctx->delta_stats.num_objects_hit_set_archive;
    ctx->delta_stats.num_bytes -= obc->obs.oi.size;
    ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
  }
}
14709
14710void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
14711{
14712 while (agent_state->hit_set_map.size() > max_in_memory) {
14713 agent_state->remove_oldest_hit_set();
14714 }
14715}
14716
14717
14718// =======================================
14719// cache agent
14720
/**
 * Initialize (or tear down) the cache tier agent for this PG.
 *
 * The agent only runs on an active primary of a cache-tier pool that
 * is not pre-merging; otherwise any existing agent state is cleared.
 * On first setup a random starting position in the PG's hash space is
 * chosen so agents across PGs don't scan in lockstep.  Ends by
 * recomputing the agent's flush/evict mode.
 */
void PrimaryLogPG::agent_setup()
{
  ceph_assert(is_locked());
  if (!is_active() ||
      !is_primary() ||
      state_test(PG_STATE_PREMERGE) ||
      pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
      pool.info.tier_of < 0 ||
      !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
    agent_clear();
    return;
  }
  if (!agent_state) {
    agent_state.reset(new TierAgentState);

    // choose random starting position
    agent_state->position = hobject_t();
    agent_state->position.pool = info.pgid.pool();
    agent_state->position.set_hash(pool.info.get_random_pg_position(
      info.pgid.pgid,
      rand()));
    agent_state->start = agent_state->position;

    dout(10) << __func__ << " allocated new state, position "
             << agent_state->position << dendl;
  } else {
    dout(10) << __func__ << " keeping existing state" << dendl;
  }

  if (info.stats.stats_invalid) {
    osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
  }

  agent_choose_mode();
}
14756
14757void PrimaryLogPG::agent_clear()
14758{
14759 agent_stop();
14760 agent_state.reset(NULL);
14761}
14762
// Do one batch of tier-agent work on this PG: list up to ls_max objects
// starting at agent_state->position and attempt to evict and/or flush the
// eligible ones, bounded by start_max operations and agent_flush_quota
// flushes.  Returns false only when a full wrap of the object hash space
// completed without operating on any object (the caller should delay);
// returns true otherwise.
bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
{
  std::scoped_lock locker{*this};
  if (!agent_state) {
    dout(10) << __func__ << " no agent state, stopping" << dendl;
    return true;
  }

  ceph_assert(!recovery_state.is_deleting());

  if (agent_state->is_idle()) {
    dout(10) << __func__ << " idle, stopping" << dendl;
    return true;
  }

  osd->logger->inc(l_osd_agent_wake);

  dout(10) << __func__
	   << " max " << start_max
	   << ", flush " << agent_state->get_flush_mode_name()
	   << ", evict " << agent_state->get_evict_mode_name()
	   << ", pos " << agent_state->position
	   << dendl;
  ceph_assert(is_primary());
  ceph_assert(is_active());

  // make sure the hit sets we consult for temperature are loaded
  agent_load_hit_sets();

  const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
  ceph_assert(base_pool);

  int ls_min = 1;
  int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;

  // list some objects. this conveniently lists clones (oldest to
  // newest) before heads... the same order we want to flush in.
  //
  // NOTE: do not flush the Sequencer. we will assume that the
  // listing we get back is imprecise.
  vector<hobject_t> ls;
  hobject_t next;
  int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
					  &ls, &next);
  ceph_assert(r >= 0);
  dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
  int started = 0;
  for (vector<hobject_t>::iterator p = ls.begin();
       p != ls.end();
       ++p) {
    // Each guard below skips an object the agent must not touch; every
    // skip is counted so the admin can see why the agent makes no progress.
    if (p->nspace == cct->_conf->osd_hit_set_namespace) {
      dout(20) << __func__ << " skip (hit set) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (is_degraded_or_backfilling_object(*p)) {
      dout(20) << __func__ << " skip (degraded) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (is_missing_object(p->get_head())) {
      dout(20) << __func__ << " skip (missing head) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    ObjectContextRef obc = get_object_context(*p, false, NULL);
    if (!obc) {
      // we didn't flush; we may miss something here.
      dout(20) << __func__ << " skip (no obc) " << *p << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (!obc->obs.exists) {
      dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
					   obc->obs.oi.soid.get_head())) {
      dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (obc->is_blocked()) {
      dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }
    if (obc->is_request_pending()) {
      dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }

    // be careful flushing omap to an EC pool.
    if (!base_pool->supports_omap() &&
	obc->obs.oi.is_omap()) {
      dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      continue;
    }

    // eviction takes precedence over flushing; a flush consumes quota
    if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
	agent_maybe_evict(obc, false))
      ++started;
    else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
             agent_flush_quota > 0 && agent_maybe_flush(obc)) {
      ++started;
      --agent_flush_quota;
    }
    if (started >= start_max) {
      // If finishing early, set "next" to the next object
      if (++p != ls.end())
	next = *p;
      break;
    }
  }

  if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
    dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
    agent_state->hist_age = 0;
    agent_state->temp_hist.decay();
  }

  // Total objects operated on so far
  int total_started = agent_state->started + started;
  bool need_delay = false;

  dout(20) << __func__ << " start pos " << agent_state->position
    << " next start pos " << next
    << " started " << total_started << dendl;

  // See if we've made a full pass over the object hash space
  // This might check at most ls_max objects a second time to notice that
  // we've checked every objects at least once.
  if (agent_state->position < agent_state->start &&
      next >= agent_state->start) {
    dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
    if (total_started == 0)
      need_delay = true;
    else
      total_started = 0;
    agent_state->start = next;
  }
  agent_state->started = total_started;

  // See if we are starting from beginning
  if (next.is_max())
    agent_state->position = hobject_t();
  else
    agent_state->position = next;

  // Discard old in memory HitSets
  hit_set_in_memory_trim(pool.info.hit_set_count);

  if (need_delay) {
    ceph_assert(agent_state->delaying == false);
    agent_delay();
    return false;
  }
  agent_choose_mode();
  return true;
}
14926
// Load any archived HitSets listed in info.hit_set.history that are not yet
// present in agent_state->hit_set_map, reading them from the local store.
// Only needed when eviction is active (temperature estimation consults them).
// On any per-hitset problem we stop early and retry on a later agent pass.
void PrimaryLogPG::agent_load_hit_sets()
{
  if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
    return;
  }

  // anything missing from the in-memory map?
  if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
    dout(10) << __func__ << dendl;
    for (auto p = info.hit_set.history.begin();
	 p != info.hit_set.history.end(); ++p) {
      if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
	dout(10) << __func__ << " loading " << p->begin << "-"
		 << p->end << dendl;
	if (!pool.info.is_replicated()) {
	  // FIXME: EC not supported here yet
	  derr << __func__ << " on non-replicated pool" << dendl;
	  break;
	}

	hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
	if (is_unreadable_object(oid)) {
	  dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
	  break;
	}

	ObjectContextRef obc = get_object_context(oid, false);
	if (!obc) {
	  derr << __func__ << ": could not load hitset " << oid << dendl;
	  break;
	}

	// read the full archived object (len 0 == whole object)
	bufferlist bl;
	{
	  int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
	  ceph_assert(r >= 0);
	}
	HitSetRef hs(new HitSet);
	bufferlist::const_iterator pbl = bl.begin();
	decode(*hs, pbl);
	agent_state->add_hit_set(p->begin.sec(), hs);
      }
    }
  }
}
14971
// Consider flushing one dirty cache-tier object to the base pool.
// Returns true if an async flush was successfully started (counted against
// the caller's flush quota), false if the object was skipped for any reason.
bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
{
  if (!obc->obs.oi.is_dirty()) {
    dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }
  if (obc->obs.oi.is_cache_pinned()) {
    dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  // prefer local_mtime (set by us) over the client-supplied mtime
  utime_t now = ceph_clock_now();
  utime_t ob_local_mtime;
  if (obc->obs.oi.local_mtime != utime_t()) {
    ob_local_mtime = obc->obs.oi.local_mtime;
  } else {
    ob_local_mtime = obc->obs.oi.mtime;
  }
  // honor cache_min_flush_age unless we are in FULL evict mode (then we
  // need to get data out regardless of age)
  bool evict_mode_full =
    (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
  if (!evict_mode_full &&
      obc->obs.oi.soid.snap == CEPH_NOSNAP &&  // snaps immutable; don't delay
      (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
    dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
    dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;

  // FIXME: flush anything dirty, regardless of what distribution of
  // ages we expect.

  hobject_t oid = obc->obs.oi.soid;
  osd->agent_start_op(oid);
  // no need to capture a pg ref, can't outlive fop or ctx
  std::function<void()> on_flush = [this, oid]() {
    osd->agent_finish_op(oid);
  };

  int result = start_flush(
    OpRequestRef(), obc, false, NULL,
    on_flush);
  if (result != -EINPROGRESS) {
    // flush did not start; release the active-op registration ourselves
    on_flush();
    dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
	     << " with " << result << dendl;
    osd->logger->inc(l_osd_agent_skip);
    return false;
  }

  osd->logger->inc(l_osd_agent_flush);
  return true;
}
15034
// Consider evicting (deleting from the cache tier) one clean object.
// after_flush indicates we were called from a flush completion rather than
// from agent_work(), which changes which preconditions must be re-checked.
// Returns true if an evict op was submitted, false if the object was skipped.
bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
{
  const hobject_t& soid = obc->obs.oi.soid;
  if (!after_flush && obc->obs.oi.is_dirty()) {
    dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
    return false;
  }
  // This is already checked by agent_work() which passes after_flush = false
  if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
    dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
    return false;
  }
  if (!obc->obs.oi.watchers.empty()) {
    dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
    return false;
  }
  if (obc->is_blocked()) {
    dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
    return false;
  }
  if (obc->obs.oi.is_cache_pinned()) {
    dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
    return false;
  }

  // a head with clones cannot be evicted on its own
  if (soid.snap == CEPH_NOSNAP) {
    int result = _verify_no_head_clones(soid, obc->ssc->snapset);
    if (result < 0) {
      dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
      return false;
    }
  }

  if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
    // is this object older than cache_min_evict_age?
    utime_t now = ceph_clock_now();
    utime_t ob_local_mtime;
    if (obc->obs.oi.local_mtime != utime_t()) {
      ob_local_mtime = obc->obs.oi.local_mtime;
    } else {
      ob_local_mtime = obc->obs.oi.mtime;
    }
    if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
      dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
      osd->logger->inc(l_osd_agent_skip);
      return false;
    }
    // is this object old and/or cold enough?
    int temp = 0;
    uint64_t temp_upper = 0, temp_lower = 0;
    if (hit_set)
      agent_estimate_temp(soid, &temp);
    agent_state->temp_hist.add(temp);
    agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);

    dout(20) << __func__
	     << " temp " << temp
	     << " pos " << temp_lower << "-" << temp_upper
	     << ", evict_effort " << agent_state->evict_effort
	     << dendl;
    dout(30) << "agent_state:\n";
    auto f = Formatter::create_unique("");
    f->open_object_section("agent_state");
    agent_state->dump(f.get());
    f->close_section();
    f->flush(*_dout);
    *_dout << dendl;

    // only evict objects whose temperature percentile falls within the
    // current evict_effort fraction (coldest objects first)
    if (1000000 - temp_upper >= agent_state->evict_effort)
      return false;
  }

  dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
  OpContextUPtr ctx = simple_opc_create(obc);

  auto null_op_req = OpRequestRef();
  if (!ctx->lock_manager.get_lock_type(
	RWState::RWWRITE,
	obc->obs.oi.soid,
	obc,
	null_op_req)) {
    close_op_ctx(ctx.release());
    dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
    return false;
  }

  osd->agent_start_evict_op();
  ctx->register_on_finish(
    [this]() {
      osd->agent_finish_evict_op();
    });

  // eviction is a logged delete of the cached copy
  ctx->at_version = get_next_version();
  ceph_assert(ctx->new_obs.exists);
  int r = _delete_oid(ctx.get(), true, false);
  if (obc->obs.oi.is_omap())
    ctx->delta_stats.num_objects_omap--;
  ctx->delta_stats.num_evict++;
  ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
  if (obc->obs.oi.is_dirty())
    --ctx->delta_stats.num_objects_dirty;
  ceph_assert(r == 0);
  finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
  simple_opc_submit(std::move(ctx));
  osd->logger->inc(l_osd_tier_evict);
  osd->logger->inc(l_osd_agent_evict);
  return true;
}
15143
15144void PrimaryLogPG::agent_stop()
15145{
15146 dout(20) << __func__ << dendl;
15147 if (agent_state && !agent_state->is_idle()) {
15148 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
15149 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
15150 osd->agent_disable_pg(this, agent_state->evict_effort);
15151 }
15152}
15153
15154void PrimaryLogPG::agent_delay()
15155{
15156 dout(20) << __func__ << dendl;
15157 if (agent_state && !agent_state->is_idle()) {
11fdf7f2 15158 ceph_assert(agent_state->delaying == false);
7c673cae
FG
15159 agent_state->delaying = true;
15160 osd->agent_disable_pg(this, agent_state->evict_effort);
15161 }
15162}
15163
15164void PrimaryLogPG::agent_choose_mode_restart()
15165{
15166 dout(20) << __func__ << dendl;
9f95a23c 15167 std::scoped_lock locker{*this};
7c673cae
FG
15168 if (agent_state && agent_state->delaying) {
15169 agent_state->delaying = false;
15170 agent_choose_mode(true);
15171 }
7c673cae
FG
15172}
15173
// Recompute the agent's flush/evict modes and evict effort from current pool
// stats and targets, publish mode changes into the PG stats, and (de)register
// the PG with the OSD agent queue as needed.  restart forces re-enable
// semantics after a delay; op, if provided, is requeued when leaving FULL
// evict mode.  Returns true if any waiting ops were requeued.
bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
{
  bool requeued = false;
  // Let delay play out
  if (agent_state->delaying) {
    dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
    return requeued;
  }

  TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
  TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
  unsigned evict_effort = 0;

  if (info.stats.stats_invalid) {
    // idle; stats can't be trusted until we scrub.
    dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
    goto skip_calc;
  }

  {
    uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
    ceph_assert(divisor > 0);

    // adjust (effective) user objects down based on the number
    // of HitSet objects, which should not count toward our total since
    // they cannot be flushed.
    uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;

    // also exclude omap objects if ec backing pool
    const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
    ceph_assert(base_pool);
    if (!base_pool->supports_omap())
      unflushable += info.stats.stats.sum.num_objects_omap;

    uint64_t num_user_objects = info.stats.stats.sum.num_objects;
    if (num_user_objects > unflushable)
      num_user_objects -= unflushable;
    else
      num_user_objects = 0;

    uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
    uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
    num_user_bytes -= unflushable_bytes;
    uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
    num_user_bytes += num_overhead_bytes;

    // also reduce the num_dirty by num_objects_omap
    int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
    if (!base_pool->supports_omap()) {
      if (num_dirty > info.stats.stats.sum.num_objects_omap)
	num_dirty -= info.stats.stats.sum.num_objects_omap;
      else
	num_dirty = 0;
    }

    dout(10) << __func__
	     << " flush_mode: "
	     << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
	     << " evict_mode: "
	     << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
	     << " num_objects: " << info.stats.stats.sum.num_objects
	     << " num_bytes: " << info.stats.stats.sum.num_bytes
	     << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
	     << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
	     << " num_dirty: " << num_dirty
	     << " num_user_objects: " << num_user_objects
	     << " num_user_bytes: " << num_user_bytes
	     << " num_overhead_bytes: " << num_overhead_bytes
	     << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
	     << " pool.info.target_max_objects: " << pool.info.target_max_objects
	     << dendl;

    // get dirty, full ratios (as parts-per-million of the per-PG target)
    uint64_t dirty_micro = 0;
    uint64_t full_micro = 0;
    if (pool.info.target_max_bytes && num_user_objects > 0) {
      uint64_t avg_size = num_user_bytes / num_user_objects;
      dirty_micro =
	num_dirty * avg_size * 1000000 /
	std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
      full_micro =
	num_user_objects * avg_size * 1000000 /
	std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
    }
    if (pool.info.target_max_objects > 0) {
      // take whichever of the byte/object ratios is more constraining
      uint64_t dirty_objects_micro =
	num_dirty * 1000000 /
	std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
      if (dirty_objects_micro > dirty_micro)
	dirty_micro = dirty_objects_micro;
      uint64_t full_objects_micro =
	num_user_objects * 1000000 /
	std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
      if (full_objects_micro > full_micro)
	full_micro = full_objects_micro;
    }
    dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
	     << " full " << ((float)full_micro / 1000000.0)
	     << dendl;

    // flush mode (slop provides hysteresis around the configured targets)
    uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
    uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
    uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
    if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
      flush_target += flush_slop;
      flush_high_target += flush_slop;
    } else {
      flush_target -= std::min(flush_target, flush_slop);
      flush_high_target -= std::min(flush_high_target, flush_slop);
    }

    if (dirty_micro > flush_high_target) {
      flush_mode = TierAgentState::FLUSH_MODE_HIGH;
    } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
      flush_mode = TierAgentState::FLUSH_MODE_LOW;
    }

    // evict mode (same hysteresis treatment)
    uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
    uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
    if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
      evict_target += evict_slop;
    else
      evict_target -= std::min(evict_target, evict_slop);

    if (full_micro > 1000000) {
      // evict anything clean
      evict_mode = TierAgentState::EVICT_MODE_FULL;
      evict_effort = 1000000;
    } else if (full_micro > evict_target) {
      // set effort in [0..1] range based on where we are between
      evict_mode = TierAgentState::EVICT_MODE_SOME;
      uint64_t over = full_micro - evict_target;
      uint64_t span = 1000000 - evict_target;
      evict_effort = std::max(over * 1000000 / span,
			      uint64_t(1000000.0 *
				       cct->_conf->osd_agent_min_evict_effort));

      // quantize effort to avoid too much reordering in the agent_queue.
      uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
      ceph_assert(inc > 0);
      uint64_t was = evict_effort;
      evict_effort -= evict_effort % inc;
      if (evict_effort < inc)
	evict_effort = inc;
      ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
      dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
    }
  }

 skip_calc:
  bool old_idle = agent_state->is_idle();
  if (flush_mode != agent_state->flush_mode) {
    dout(5) << __func__ << " flush_mode "
	    << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
	    << " -> "
	    << TierAgentState::get_flush_mode_name(flush_mode)
	    << dendl;
    // publish the mode flip into the PG stats (set new, clear old)
    recovery_state.update_stats(
      [=, this](auto &history, auto &stats) {
	if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
	  osd->agent_inc_high_count();
	  stats.stats.sum.num_flush_mode_high = 1;
	} else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
	  stats.stats.sum.num_flush_mode_low = 1;
	}
	if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
	  osd->agent_dec_high_count();
	  stats.stats.sum.num_flush_mode_high = 0;
	} else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
	  stats.stats.sum.num_flush_mode_low = 0;
	}
	return false;
      });
    agent_state->flush_mode = flush_mode;
  }
  if (evict_mode != agent_state->evict_mode) {
    dout(5) << __func__ << " evict_mode "
	    << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
	    << " -> "
	    << TierAgentState::get_evict_mode_name(evict_mode)
	    << dendl;
    // leaving FULL mode: ops that were blocked on "cache full" can now run
    if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
	is_active()) {
      if (op)
	requeue_op(op);
      requeue_ops(waiting_for_flush);
      requeue_ops(waiting_for_active);
      requeue_ops(waiting_for_readable);
      requeue_ops(waiting_for_scrub);
      requeue_ops(waiting_for_cache_not_full);
      objects_blocked_on_cache_full.clear();
      requeued = true;
    }
    recovery_state.update_stats(
      [=, this](auto &history, auto &stats) {
	if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
	  stats.stats.sum.num_evict_mode_some = 1;
	} else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
	  stats.stats.sum.num_evict_mode_full = 1;
	}
	if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
	  stats.stats.sum.num_evict_mode_some = 0;
	} else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
	  stats.stats.sum.num_evict_mode_full = 0;
	}
	return false;
      });
    agent_state->evict_mode = evict_mode;
  }
  uint64_t old_effort = agent_state->evict_effort;
  if (evict_effort != agent_state->evict_effort) {
    dout(5) << __func__ << " evict_effort "
	    << ((float)agent_state->evict_effort / 1000000.0)
	    << " -> "
	    << ((float)evict_effort / 1000000.0)
	    << dendl;
    agent_state->evict_effort = evict_effort;
  }

  // NOTE: we are using evict_effort as a proxy for *all* agent effort
  // (including flush). This is probably fine (they should be
  // correlated) but it is not precisely correct.
  if (agent_state->is_idle()) {
    if (!restart && !old_idle) {
      osd->agent_disable_pg(this, old_effort);
    }
  } else {
    if (restart || old_idle) {
      osd->agent_enable_pg(this, agent_state->evict_effort);
    } else if (old_effort != agent_state->evict_effort) {
      osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
    }
  }
  return requeued;
}
15411
15412void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
15413{
11fdf7f2
TL
15414 ceph_assert(hit_set);
15415 ceph_assert(temp);
7c673cae
FG
15416 *temp = 0;
15417 if (hit_set->contains(oid))
15418 *temp = 1000000;
15419 unsigned i = 0;
15420 int last_n = pool.info.hit_set_search_last_n;
15421 for (map<time_t,HitSetRef>::reverse_iterator p =
15422 agent_state->hit_set_map.rbegin(); last_n > 0 &&
15423 p != agent_state->hit_set_map.rend(); ++p, ++i) {
15424 if (p->second->contains(oid)) {
15425 *temp += pool.info.get_grade(i);
15426 --last_n;
15427 }
15428 }
15429}
15430
15431// Dup op detection
15432
15433bool PrimaryLogPG::already_complete(eversion_t v)
15434{
15435 dout(20) << __func__ << ": " << v << dendl;
15436 for (xlist<RepGather*>::iterator i = repop_queue.begin();
15437 !i.end();
15438 ++i) {
15439 dout(20) << __func__ << ": " << **i << dendl;
15440 // skip copy from temp object ops
15441 if ((*i)->v == eversion_t()) {
15442 dout(20) << __func__ << ": " << **i
15443 << " version is empty" << dendl;
15444 continue;
15445 }
15446 if ((*i)->v > v) {
15447 dout(20) << __func__ << ": " << **i
15448 << " (*i)->v past v" << dendl;
15449 break;
15450 }
15451 if (!(*i)->all_committed) {
15452 dout(20) << __func__ << ": " << **i
15453 << " not committed, returning false"
15454 << dendl;
15455 return false;
15456 }
15457 }
15458 dout(20) << __func__ << ": returning true" << dendl;
15459 return true;
15460}
15461
7c673cae
FG
15462
15463// ==========================================================================================
15464// SCRUB
15465
f67539c2
TL
15466void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
15467{
20effc67 15468 dout(15) << __func__ << " is scrub active? " << is_scrub_active() << dendl;
f67539c2
TL
15469 op->mark_started();
15470
20effc67 15471 if (!is_scrub_active()) {
f67539c2
TL
15472 dout(10) << __func__ << " scrub isn't active" << dendl;
15473 return;
15474 }
15475 m_scrubber->map_from_replica(op);
15476}
7c673cae 15477
f67539c2
TL
15478bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
15479 const hobject_t& end)
7c673cae
FG
15480{
15481 pair<hobject_t, ObjectContextRef> next;
15482 next.second = object_contexts.lookup(begin);
15483 next.first = begin;
15484 bool more = true;
15485 while (more && next.first < end) {
15486 if (next.second && next.second->is_blocked()) {
15487 next.second->requeue_scrub_on_unblock = true;
15488 dout(10) << __func__ << ": scrub delayed, "
15489 << next.first << " is blocked"
15490 << dendl;
15491 return false;
15492 }
15493 more = object_contexts.get_next(next.first, &next);
15494 }
15495 return true;
15496}
15497
7c673cae 15498
11fdf7f2 15499int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
224ce89b 15500{
11fdf7f2 15501 OpRequestRef op = ctx->op;
224ce89b 15502 // Only supports replicated pools
11fdf7f2
TL
15503 ceph_assert(!pool.info.is_erasure());
15504 ceph_assert(is_primary());
224ce89b
WB
15505
15506 dout(10) << __func__ << " " << soid
9f95a23c 15507 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
224ce89b
WB
15508
15509 if (!is_clean()) {
15510 block_for_clean(soid, op);
15511 return -EAGAIN;
15512 }
15513
9f95a23c 15514 ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
11fdf7f2
TL
15515 auto& oi = ctx->new_obs.oi;
15516 eversion_t v = oi.version;
224ce89b 15517
224ce89b
WB
15518 if (primary_error(soid, v)) {
15519 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15520 // XXX: If we knew that there is no down osd which could include this
15521 // object, it would be nice if we could return EIO here.
15522 // If a "never fail" flag was available, that could be used
15523 // for rbd to NOT return EIO until object marked lost.
15524
15525 // Drop through to save this op in case an osd comes up with the object.
15526 }
15527
15528 // Restart the op after object becomes readable again
15529 waiting_for_unreadable_object[soid].push_back(op);
15530 op->mark_delayed("waiting for missing object");
15531
f67539c2
TL
15532 ceph_assert(is_clean());
15533 state_set(PG_STATE_REPAIR);
15534 state_clear(PG_STATE_CLEAN);
15535 queue_peering_event(
15536 PGPeeringEventRef(
15537 std::make_shared<PGPeeringEvent>(
15538 get_osdmap_epoch(),
15539 get_osdmap_epoch(),
15540 PeeringState::DoRecovery())));
224ce89b
WB
15541
15542 return -EAGAIN;
15543}
15544
7c673cae
FG
15545/*---SnapTrimmer Logging---*/
15546#undef dout_prefix
11fdf7f2 15547#define dout_prefix pg->gen_prefix(*_dout)
7c673cae
FG
15548
// Trace entry into a snap-trimmer statechart state.
void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
{
  ldout(pg->cct, 20) << "enter " << state_name << dendl;
}
15553
// Trace exit from a snap-trimmer statechart state (enter_time currently
// unused here; kept for symmetry with other NamedState logging).
void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
{
  ldout(pg->cct, 20) << "exit " << state_name << dendl;
}
15558
f67539c2
TL
15559bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15560 return
15561 pg->is_clean() &&
20effc67 15562 !pg->is_scrub_queued_or_active() &&
f67539c2
TL
15563 !pg->snap_trimq.empty();
15564}
15565
7c673cae
FG
15566/*---SnapTrimmer states---*/
15567#undef dout_prefix
11fdf7f2 15568#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
7c673cae
FG
15569 << "SnapTrimmer state<" << get_state_name() << ">: ")
15570
/* NotTrimming */
// Idle state of the snap-trimmer statechart; just logs the transition.
PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
  : my_base(ctx),
    NamedState(nullptr, "NotTrimming")
{
  context< SnapTrimmer >().log_enter(state_name);
}
15578
// Log leaving the NotTrimming state.
void PrimaryLogPG::NotTrimming::exit()
{
  context< SnapTrimmer >().log_exit(state_name, enter_time);
}
15583
15584boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15585{
15586 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15587 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15588
15589 if (!(pg->is_primary() && pg->is_active())) {
15590 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15591 return discard_event();
15592 }
15593 if (!pg->is_clean() ||
15594 pg->snap_trimq.empty()) {
15595 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15596 return discard_event();
15597 }
20effc67 15598 if (pg->is_scrub_queued_or_active()) {
7c673cae 15599 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
7c673cae
FG
15600 return transit< WaitScrub >();
15601 } else {
15602 return transit< Trimming >();
15603 }
15604}
15605
15606boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15607{
15608 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15609 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15610
15611 pending = nullptr;
15612 if (!context< SnapTrimmer >().can_trim()) {
15613 post_event(KickTrim());
15614 return transit< NotTrimming >();
15615 }
15616
15617 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15618 ldout(pg->cct, 10) << "NotTrimming: trimming "
15619 << pg->snap_trimq.range_start()
15620 << dendl;
15621 return transit< AwaitAsyncWork >();
15622}
15623
15624/* AwaitAsyncWork */
15625PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15626 : my_base(ctx),
9f95a23c 15627 NamedState(nullptr, "Trimming/AwaitAsyncWork")
7c673cae
FG
15628{
15629 auto *pg = context< SnapTrimmer >().pg;
15630 context< SnapTrimmer >().log_enter(state_name);
15631 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15632 pg->state_set(PG_STATE_SNAPTRIM);
224ce89b 15633 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
7c673cae
FG
15634 pg->publish_stats_to_osd();
15635}
15636
/*
 * DoSnapWork: perform one round of snapshot trimming for the snapid
 * chosen by WaitReservation.
 *
 * Pulls up to osd_pg_max_concurrent_snap_trims objects still mapped to
 * snap_to_trim from the snap mapper and submits a trim op for each.
 * -ENOENT from the mapper means no objects remain: the snap is removed
 * from snap_trimq and (unless it is scheduled to repeat) recorded in
 * purged_snaps, which is persisted and shared with peers.
 */
boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
{
  PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
  snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
  auto &in_flight = context<Trimming>().in_flight;
  // A new round must only start once all previously submitted trims
  // have completed.
  ceph_assert(in_flight.empty());

  ceph_assert(pg->is_primary() && pg->is_active());
  if (!context< SnapTrimmer >().can_trim()) {
    ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
    post_event(KickTrim());
    return transit< NotTrimming >();
  }

  ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;

  vector<hobject_t> to_trim;
  unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
  // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
  // the ENOENT below and erase snap_to_trim.
  ceph_assert(max > 0);
  to_trim.reserve(max);
  int r = pg->snap_mapper.get_next_objects_to_trim(
    snap_to_trim,
    max,
    &to_trim);
  if (r != 0 && r != -ENOENT) {
    // Any error other than "no more objects" is fatal.
    lderr(pg->cct) << "get_next_objects_to_trim returned "
                   << cpp_strerror(r) << dendl;
    ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
  } else if (r == -ENOENT) {
    // Done!  No objects remain for this snap.
    ldout(pg->cct, 10) << "got ENOENT" << dendl;

    pg->snap_trimq.erase(snap_to_trim);

    if (pg->snap_trimq_repeat.count(snap_to_trim)) {
      // This snap is flagged to be trimmed again; do not mark it purged.
      ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
      pg->snap_trimq_repeat.erase(snap_to_trim);
    } else {
      ldout(pg->cct, 10) << "adding snap " << snap_to_trim
                         << " to purged_snaps"
                         << dendl;
      // Persist the updated purged_snaps set.
      ObjectStore::Transaction t;
      pg->recovery_state.adjust_purged_snaps(
        [snap_to_trim](auto &purged_snaps) {
          purged_snaps.insert(snap_to_trim);
        });
      pg->write_if_dirty(t);

      ldout(pg->cct, 10) << "purged_snaps now "
                         << pg->info.purged_snaps << ", snap_trimq now "
                         << pg->snap_trimq << dendl;

      int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
      ceph_assert(tr == 0);

      // Let the peers know purged_snaps changed.
      pg->recovery_state.share_pg_info();
    }
    // Kick the machine again in case more snaps are queued.
    post_event(KickTrim());
    pg->set_snaptrim_duration();
    return transit< NotTrimming >();
  }
  // r == 0 with max > 0 must have produced at least one object.
  ceph_assert(!to_trim.empty());

  for (auto &&object: to_trim) {
    // Get next
    ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
    OpContextUPtr ctx;
    // first=true only for the first op of the round (in_flight still empty).
    int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
    if (error) {
      if (error == -ENOLCK) {
        // Object is write-locked; not an error state, just retry later.
        ldout(pg->cct, 10) << "could not get write lock on obj "
                           << object << dendl;
      } else {
        pg->state_set(PG_STATE_SNAPTRIM_ERROR);
        ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
      }
      if (!in_flight.empty()) {
        // Some trims were already submitted; wait for them to drain.
        ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
        return transit< WaitRepops >();
      }
      if (error == -ENOLCK) {
        ldout(pg->cct, 10) << "waiting for it to clear"
                           << dendl;
        return transit< WaitRWLock >();
      } else {
        return transit< NotTrimming >();
      }
    }

    in_flight.insert(object);
    // On success, drop the object from in_flight; when the last one
    // completes, drive the machine forward (or reset on a recorded error).
    // NOTE: in_flight is captured by reference — it lives in the Trimming
    // state, which outlives these repops.
    ctx->register_on_success(
      [pg, object, &in_flight]() {
        ceph_assert(in_flight.find(object) != in_flight.end());
        in_flight.erase(object);
        if (in_flight.empty()) {
          if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
            pg->snap_trimmer_machine.process_event(Reset());
          } else {
            pg->snap_trimmer_machine.process_event(RepopsComplete());
          }
        }
      });

    pg->simple_opc_submit(std::move(ctx));
  }

  return transit< WaitRepops >();
}
15747
15748void PrimaryLogPG::setattr_maybe_cache(
15749 ObjectContextRef obc,
7c673cae
FG
15750 PGTransaction *t,
15751 const string &key,
15752 bufferlist &val)
15753{
15754 t->setattr(obc->obs.oi.soid, key, val);
15755}
15756
15757void PrimaryLogPG::setattrs_maybe_cache(
15758 ObjectContextRef obc,
7c673cae 15759 PGTransaction *t,
20effc67 15760 map<string, bufferlist, less<>> &attrs)
7c673cae
FG
15761{
15762 t->setattrs(obc->obs.oi.soid, attrs);
15763}
15764
15765void PrimaryLogPG::rmattr_maybe_cache(
15766 ObjectContextRef obc,
7c673cae
FG
15767 PGTransaction *t,
15768 const string &key)
15769{
15770 t->rmattr(obc->obs.oi.soid, key);
15771}
15772
15773int PrimaryLogPG::getattr_maybe_cache(
15774 ObjectContextRef obc,
15775 const string &key,
15776 bufferlist *val)
15777{
11fdf7f2 15778 if (pool.info.is_erasure()) {
7c673cae
FG
15779 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15780 if (i != obc->attr_cache.end()) {
15781 if (val)
15782 *val = i->second;
15783 return 0;
15784 } else {
1e59de90
TL
15785 if (obc->obs.exists) {
15786 return -ENODATA;
15787 } else {
15788 return -ENOENT;
15789 }
7c673cae
FG
15790 }
15791 }
15792 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15793}
15794
15795int PrimaryLogPG::getattrs_maybe_cache(
15796 ObjectContextRef obc,
20effc67 15797 map<string, bufferlist, less<>> *out)
7c673cae
FG
15798{
15799 int r = 0;
11fdf7f2
TL
15800 ceph_assert(out);
15801 if (pool.info.is_erasure()) {
b32b8144 15802 *out = obc->attr_cache;
7c673cae
FG
15803 } else {
15804 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15805 }
20effc67
TL
15806 map<string, bufferlist, less<>> tmp;
15807 for (auto& [key, val]: *out) {
15808 if (key.size() > 1 && key[0] == '_') {
15809 tmp[key.substr(1, key.size())] = std::move(val);
15810 }
7c673cae 15811 }
b32b8144 15812 tmp.swap(*out);
7c673cae
FG
15813 return r;
15814}
15815
11fdf7f2
TL
15816bool PrimaryLogPG::check_failsafe_full() {
15817 return osd->check_failsafe_full(get_dpp());
7c673cae
FG
15818}
15819
f67539c2
TL
15820bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
15821{
15822 return m_scrubber->write_blocked_by_scrub(oid);
15823}
15824
7c673cae
FG
// boost::intrusive_ptr support for PrimaryLogPG: delegate to the PG's
// tagged reference counting.
void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }

#ifdef PG_DEBUG_REFS
// Debug-refs builds additionally track each reference by a unique id.
uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif

// boost::intrusive_ptr support for RepGather (in-flight replicated op).
void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }