// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/ceph_crypto.h"
#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)

using TOPNSPC::common::cmd_getval;

template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return pg->gen_prefix(*_dout);
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

using namespace ceph::osd::scheduler;

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

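// Illustrative sketch only (not from the original source; MyCopyCallback is
// hypothetical — CopyFromCallback below is the real in-tree implementer).
// It shows the contract documented above: read the return code from
// get<0>(), take ownership of the CopyResults pointer from get<1>(), and
// delete it when done.
//
//   class MyCopyCallback : public PrimaryLogPG::CopyCallback {
//     void finish(PrimaryLogPG::CopyCallbackResults results_) override {
//       int r = results_.get<0>();           // 0, -ECANCELED, or -errno
//       PrimaryLogPG::CopyResults *res = results_.get<1>();
//       if (r == 0 && res) {
//         // consume res->object_size, res->user_version, ...
//       }
//       delete res;                          // we own the results object
//     }
//   };
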
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

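// "Blessing" wraps a continuation so that, when it eventually fires, it is
// discarded if the PG has been reset since the epoch at which it was
// created; the locked variants additionally take the PG lock before
// completing. Usage sketch (fin is a hypothetical continuation):
//
//   Context *fin = ...;                          // some continuation
//   t->register_on_commit(bless_context(fin));   // fin completes under the
//                                                // PG lock, or is dropped if
//                                                // the PG has reset since now
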
class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
  }
  bool sync_finish(int r) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};

Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap_epoch());
}

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
 public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

7c673cae
FG
198struct OnReadComplete : public Context {
199 PrimaryLogPG *pg;
200 PrimaryLogPG::OpContext *opcontext;
201 OnReadComplete(
202 PrimaryLogPG *pg,
203 PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
204 void finish(int r) override {
7c673cae
FG
205 opcontext->finish_read(pg);
206 }
207 ~OnReadComplete() override {}
208};
209
210class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
211 PrimaryLogPGRef pg;
212 ObjectContextRef obc;
213 public:
214 C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
215 pg(p), obc(o) {}
11fdf7f2
TL
216 bool sync_finish(int r) override {
217 pg->_applied_recovered_object(obc);
218 return true;
219 }
7c673cae 220 void finish(int r) override {
9f95a23c 221 std::scoped_lock locker{*pg};
7c673cae
FG
222 pg->_applied_recovered_object(obc);
223 }
224};
225
226class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
227 PrimaryLogPGRef pg;
228 epoch_t epoch;
229 eversion_t last_complete;
230 public:
231 C_OSD_CommittedPushedObject(
232 PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
233 pg(p), epoch(epoch), last_complete(lc) {
234 }
235 void finish(int r) override {
236 pg->_committed_pushed_object(epoch, last_complete);
237 }
238};
239
240class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
241 PrimaryLogPGRef pg;
242 public:
243 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
244 pg(p) {}
11fdf7f2
TL
245 bool sync_finish(int r) override {
246 pg->_applied_recovered_object_replica();
247 return true;
248 }
7c673cae 249 void finish(int r) override {
9f95a23c 250 std::scoped_lock locker{*pg};
7c673cae
FG
251 pg->_applied_recovered_object_replica();
252 }
253};
254
// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}

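// Note on start_async_reads() above: inflightreads is set to 1 (rather than
// to the number of queued reads) because objects_read_async() issues the
// whole swapped-in list as one batched operation with a single
// OnReadComplete callback, so finish_read() runs exactly once per batch
// before execute_ctx() restarts the op.
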
class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;
  uint32_t truncate_seq;
  uint64_t truncate_size;
  bool have_truncate = false;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // Only use truncate_{seq,size} from the original object if the client
    // did not send us these parameters
    if (!have_truncate) {
      truncate_seq = results->truncate_seq;
      truncate_size = results->truncate_size;
    }

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  void set_truncate(uint32_t seq, uint64_t size) {
    truncate_seq = seq;
    truncate_size = size;
    have_truncate = true;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
      recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  recovery_state.recover_got(
    recovery_info.soid,
    recovery_info.version,
    is_delete,
    *t);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));
  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  recovery_state.object_recovered(soid, stat_diff);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->queue_recovery_context(this, c);
}

void PrimaryLogPG::replica_clear_repop_obc(
  const vector<pg_log_entry_t> &logv,
  ObjectStore::Transaction &t)
{
  for (auto &&e: logv) {
    /* Have to blast all clones, they share a snapset */
    object_contexts.clear_range(
      e.soid.get_object_boundary(), e.soid.get_head());
    ceph_assert(
      snapset_contexts.find(e.soid.get_head()) ==
      snapset_contexts.end());
  }
}

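// should_send_op (below) decides whether a repop for hoid must carry the
// real transaction to a given peer: backfill targets that have not yet
// reached hoid get an empty op (they will receive the object later via
// backfill), and async recovery targets still missing hoid are likewise
// sent an empty op, since the object will be recovered to them afterwards.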
bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(recovery_state.has_peer_info(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= recovery_state.get_peer_info(peer).last_backfill;
  if (!should_send) {
    ceph_assert(is_backfill_target(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " beyond std::max(last_backfill_started "
             << ", peer_info[peer].last_backfill "
             << recovery_state.get_peer_info(peer).last_backfill
             << ")" << dendl;
    return should_send;
  }
  if (is_async_recovery_target(peer) &&
      recovery_state.get_peer_missing(peer).is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}

// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return recovery_state.get_pg_log().get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
    } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
    return true;
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
       i != get_acting_recovery_backfill().end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
    // If an object is missing on an async_recovery_target, return false.
    // This will not block the op and the object is async recovered later.
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      if (is_async_recovery_target(peer))
        continue;
      else
        return true;
    }
    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_target(peer) &&
        recovery_state.get_peer_info(peer).last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
{
  for (auto &i: get_async_recovery_targets()) {
    auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      dout(30) << __func__ << " " << soid << dendl;
      return true;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_head(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (recovery_state.get_pg_log().get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = recovery_state.get_pg_log().get_log().head.version;
  hobject_t soid;
  if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
    min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
    soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
       it != get_acting_recovery_backfill().end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = recovery_state.get_peer_missing().find(peer);
    if (it_missing != recovery_state.get_peer_missing().end() &&
        !it_missing->second.get_rmissing().empty()) {
      const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
               << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
        min_version = min_obj->first;
        soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

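// The next three functions implement the Octopus read-lease checks: an op
// may be served only while this OSD's lease (readable_until) extends past
// the current monotonic time. Otherwise the PG goes LAGGY (lease expired)
// or WAIT (waiting out a prior interval's lease upper bound), and the op is
// parked on waiting_for_readable until recheck_readable() clears the state.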
bool PrimaryLogPG::check_laggy(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
    return true;
  }
  if (state_test(PG_STATE_WAIT)) {
    dout(10) << __func__ << " PG is WAIT state" << dendl;
  } else if (!state_test(PG_STATE_LAGGY)) {
    auto mnow = osd->get_mnow();
    auto ru = recovery_state.get_readable_until();
    if (mnow <= ru) {
      // not laggy
      return true;
    }
    dout(10) << __func__
             << " mnow " << mnow
             << " > readable_until " << ru << dendl;

    if (!is_primary()) {
      osd->reply_op_error(op, -EAGAIN);
      return false;
    }

    // go to laggy state
    state_set(PG_STATE_LAGGY);
    publish_stats_to_osd();
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_back(op);
  op->mark_delayed("waiting for readable");
  return false;
}

bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    return true;
  }
  if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
    return true; // not laggy
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_front(op);
  op->mark_delayed("waiting for readable");
  return false;
}

void PrimaryLogPG::recheck_readable()
{
  if (!is_wait() && !is_laggy()) {
    dout(20) << __func__ << " wasn't wait or laggy" << dendl;
    return;
  }
  auto mnow = osd->get_mnow();
  bool pub = false;
  if (is_wait()) {
    auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
    if (mnow < prior_readable_until_ub) {
      dout(10) << __func__ << " still wait (mnow " << mnow
               << " < prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer wait (mnow " << mnow
               << " >= prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
      state_clear(PG_STATE_WAIT);
      recovery_state.clear_prior_readable_until_ub();
      pub = true;
    }
  }
  if (is_laggy()) {
    auto ru = recovery_state.get_readable_until();
    if (ru == ceph::signedspan::zero()) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << ", readable_until zero)" << dendl;
    } else if (mnow >= ru) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << " >= readable_until " << ru << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer laggy (mnow " << mnow
               << " < readable_until " << ru << ")" << dendl;
      state_clear(PG_STATE_LAGGY);
      pub = true;
    }
  }
  if (pub) {
    publish_stats_to_osd();
  }
  if (!is_laggy() && !is_wait()) {
    requeue_ops(waiting_for_readable);
  }
}

bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter.get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter.get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter.reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter.filter(sobj, bl);
}

std::pair<int, std::unique_ptr<const PGLSFilter>>
PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
{
  string type;
  // storing non-const PGLSFilter for the sake of ::init()
  std::unique_ptr<PGLSFilter> filter;

  try {
    decode(type, iter);
  }
  catch (buffer::error& e) {
    return { -EINVAL, nullptr };
  }

  if (type.compare("plain") == 0) {
    filter = std::make_unique<PGLSPlainFilter>();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return { -EINVAL, nullptr };
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = ClassHandler::get_instance().open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return { r, nullptr };
    } else {
      ceph_assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return { -EINVAL, nullptr };
    }
    filter.reset(class_filter->fn());
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return { -EINVAL, nullptr };
    }
  }

  ceph_assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    return { -EINVAL, nullptr };
  } else {
    // Successfully constructed and initialized, return it.
    return std::make_pair(0, std::move(filter));
  }
}

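// Filter type strings are either the built-in "plain" or "<class>.<filter>"
// for a filter exported by a rados object class. Illustrative example (the
// filter name here is hypothetical): a client passing type "hello.my_filter"
// makes the OSD open object class "hello", instantiate its registered filter
// "my_filter" through ClassHandler, and then call init() on it with the rest
// of the payload.
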
// ==========================================================

void PrimaryLogPG::do_command(
  const string_view& orig_prefix,
  const cmdmap_t& cmdmap,
  const bufferlist& idata,
  std::function<void(int,const std::string&,bufferlist&)> on_finish)
{
  string format;
  cmd_getval(cmdmap, "format", format);
  std::unique_ptr<Formatter> f(Formatter::create(
                                 format, "json-pretty", "json-pretty"));
  int ret = 0;
  stringstream ss;   // stderr error message stream
  bufferlist outbl;  // if empty at end, we'll dump formatter as output

  // get final prefix:
  // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
  // - ceph tell <pgid> foo -> prefix=foo
  string prefix(orig_prefix);
  string command;
  cmd_getval(cmdmap, "cmd", command);
  if (command.size()) {
    prefix = command;
  }

  if (prefix == "query") {
    f->open_object_section("pg");
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    recovery_state.dump_peering_state(f.get());
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
  }

  else if (prefix == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.is_erasure()) {
        ss << "mode must be 'delete' for ec pool";
        ret = -EINVAL;
        goto out;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      ret = -EINVAL;
      goto out;
    }
    ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
                mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      ret = -EROFS;
      goto out;
    }

    uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      goto out;  // make command idempotent
    }

    if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      ret = -EINVAL;
      goto out;
    }

    mark_all_unfound_lost(mode, on_finish);
    return;
  }

  else if (prefix == "list_unfound") {
    hobject_t offset;
    string offset_json;
    bool show_offset = false;
    if (cmd_getval(cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        ret = -EINVAL;
        goto out;
      }
      show_offset = true;
    }
    f->open_object_section("missing");
    if (show_offset) {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    auto &needs_recovery_map = recovery_state.get_missing_loc()
      .get_needs_recovery();
    f->dump_int("num_missing", needs_recovery_map.size());
    f->dump_int("num_unfound", get_num_unfound());
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() &&
             num < cct->_conf->osd_command_max_records;
           ++p) {
        if (recovery_state.get_missing_loc().is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get()); // have, need keys
          {
            f->open_array_section("locations");
            for (auto &&r : recovery_state.get_missing_loc().get_locations(
                   p->first)) {
              f->dump_stream("shard") << r;
            }
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
  }

  else if (prefix == "scrub" ||
           prefix == "deep_scrub") {
    bool deep = (prefix == "deep_scrub");
    int64_t time;
    cmd_getval(cmdmap, "time", time, (int64_t)0);

    if (is_primary()) {
      const pg_pool_t *p = &pool.info;
      double pool_scrub_max_interval = 0;
      double scrub_max_interval;
      if (deep) {
        p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
      } else {
        p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
      }
      // Instead of marking must_scrub, force a scheduled scrub by
      // backdating the last scrub stamp.
      utime_t stamp = ceph_clock_now();
      if (time == 0)
        stamp -= scrub_max_interval;
      else
        stamp -= (float)time;
      stamp -= 100.0; // push back last scrub more for good measure
      if (deep) {
        set_last_deep_scrub_stamp(stamp);
      } else {
        set_last_scrub_stamp(stamp);
      }
      f->open_object_section("result");
      f->dump_bool("deep", deep);
      f->dump_stream("stamp") << stamp;
      f->close_section();
    } else {
      ss << "Not primary";
      ret = -EPERM;
    }
    outbl.append(ss.str());
  }

  else {
    ret = -ENOSYS;
    ss << "prefix '" << prefix << "' not implemented";
  }

 out:
  if (ret >= 0 && outbl.length() == 0) {
    f->flush(outbl);
  }
  on_finish(ret, ss.str(), outbl);
}

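// Example invocations that land here (a sketch; <pgid> is a placeholder):
//
//   ceph tell <pgid> query
//   ceph pg <pgid> mark_unfound_lost revert
//   ceph pg <pgid> list_unfound
//   ceph tell <pgid> deep_scrub
//
// Both the "ceph pg <pgid> <cmd>" and "ceph tell <pgid> <cmd>" spellings
// arrive with different prefixes; the prefix-resolution block at the top of
// do_command() folds them into a single command name.
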
// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    std::unique_ptr<const PGLSFilter> filter;
    OSDOp& osd_op = *p;
    auto bp = p->indata.cbegin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
                 << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash()
                   << std::dec << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (recovery_state.get_missing_loc().is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(*filter, candidate))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        encode(response, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (recovery_state.get_missing_loc().is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(*filter, candidate))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        encode(response, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
}

int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.cbegin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  encode(result, osd_op->outdata);
  return r;
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool,
                           const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  recovery_state.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  auto m = op->get_req<MOSDBackoff>();
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

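// Note: handle_backoff() clips the acked range to this PG's own hobject_t
// span, i.e. it releases only [max(pg_start, m->begin), min(pg_end, m->end)),
// so an ack covering a wider range than this PG (as can plausibly happen
// across PG splits) never releases ranges owned by a sibling PG.
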
1629void PrimaryLogPG::do_request(
1630 OpRequestRef& op,
1631 ThreadPool::TPHandle &handle)
1632{
1633 if (op->osd_trace) {
1634 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1635 op->pg_trace.event("do request");
1636 }
1637 // make sure we have a new enough map
1638 auto p = waiting_for_map.find(op->get_source());
1639 if (p != waiting_for_map.end()) {
1640 // preserve ordering
1641 dout(20) << __func__ << " waiting_for_map "
1642 << p->first << " not empty, queueing" << dendl;
1643 p->second.push_back(op);
1644 op->mark_delayed("waiting_for_map not empty");
1645 return;
1646 }
1647 if (!have_same_or_newer_map(op->min_epoch)) {
1648 dout(20) << __func__ << " min " << op->min_epoch
1649 << ", queue on waiting_for_map " << op->get_source() << dendl;
1650 waiting_for_map[op->get_source()].push_back(op);
1651 op->mark_delayed("op must wait for map");
181888fb 1652 osd->request_osdmap_update(op->min_epoch);
7c673cae
FG
1653 return;
1654 }
1655
1656 if (can_discard_request(op)) {
1657 return;
1658 }
1659
1660 // pg-wide backoffs
1661 const Message *m = op->get_req();
11fdf7f2 1662 int msg_type = m->get_type();
7c673cae 1663 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
9f95a23c 1664 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
7c673cae
FG
1665 if (!session)
1666 return; // drop it.
7c673cae 1667
11fdf7f2 1668 if (msg_type == CEPH_MSG_OSD_OP) {
7c673cae
FG
1669 if (session->check_backoff(cct, info.pgid,
1670 info.pgid.pgid.get_hobj_start(), m)) {
1671 return;
1672 }
1673
1674 bool backoff =
1675 is_down() ||
1676 is_incomplete() ||
1677 (!is_active() && is_peered());
11fdf7f2 1678 if (g_conf()->osd_backoff_on_peering && !backoff) {
7c673cae
FG
1679 if (is_peering()) {
1680 backoff = true;
1681 }
1682 }
1683 if (backoff) {
1684 add_pg_backoff(session);
1685 return;
1686 }
1687 }
1688 // pg backoff acks at pg-level
11fdf7f2 1689 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
7c673cae
FG
1690 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1691 if (ba->begin != ba->end) {
1692 handle_backoff(op);
1693 return;
1694 }
1695 }
1696 }
1697
7c673cae
FG
1698 if (!is_peered()) {
1699 // Delay unless PGBackend says it's ok
1700 if (pgbackend->can_handle_while_inactive(op)) {
1701 bool handled = pgbackend->handle_message(op);
11fdf7f2 1702 ceph_assert(handled);
7c673cae
FG
1703 return;
1704 } else {
1705 waiting_for_peered.push_back(op);
1706 op->mark_delayed("waiting for peered");
1707 return;
1708 }
1709 }
1710
9f95a23c
TL
1711 if (recovery_state.needs_flush()) {
1712 dout(20) << "waiting for flush on " << op << dendl;
b32b8144
FG
1713 waiting_for_flush.push_back(op);
1714 op->mark_delayed("waiting for flush");
1715 return;
1716 }
1717
9f95a23c 1718 ceph_assert(is_peered() && !recovery_state.needs_flush());
7c673cae
FG
1719 if (pgbackend->handle_message(op))
1720 return;
1721
11fdf7f2 1722 switch (msg_type) {
7c673cae
FG
1723 case CEPH_MSG_OSD_OP:
1724 case CEPH_MSG_OSD_BACKOFF:
1725 if (!is_active()) {
1726 dout(20) << " peered, not active, waiting for active on " << op << dendl;
1727 waiting_for_active.push_back(op);
1728 op->mark_delayed("waiting for active");
1729 return;
1730 }
11fdf7f2 1731 switch (msg_type) {
7c673cae
FG
1732 case CEPH_MSG_OSD_OP:
1733 // verify client features
1734 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1735 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1736 osd->reply_op_error(op, -EOPNOTSUPP);
1737 return;
1738 }
1739 do_op(op);
1740 break;
1741 case CEPH_MSG_OSD_BACKOFF:
1742 // object-level backoff acks handled in osdop context
1743 handle_backoff(op);
1744 break;
1745 }
1746 break;
1747
7c673cae
FG
1748 case MSG_OSD_PG_SCAN:
1749 do_scan(op, handle);
1750 break;
1751
1752 case MSG_OSD_PG_BACKFILL:
1753 do_backfill(op);
1754 break;
1755
1756 case MSG_OSD_PG_BACKFILL_REMOVE:
1757 do_backfill_remove(op);
1758 break;
1759
1760 case MSG_OSD_SCRUB_RESERVE:
1761 {
9f95a23c 1762 auto m = op->get_req<MOSDScrubReserve>();
7c673cae
FG
1763 switch (m->type) {
1764 case MOSDScrubReserve::REQUEST:
1765 handle_scrub_reserve_request(op);
1766 break;
1767 case MOSDScrubReserve::GRANT:
1768 handle_scrub_reserve_grant(op, m->from);
1769 break;
1770 case MOSDScrubReserve::REJECT:
1771 handle_scrub_reserve_reject(op, m->from);
1772 break;
1773 case MOSDScrubReserve::RELEASE:
1774 handle_scrub_reserve_release(op);
1775 break;
1776 }
1777 }
1778 break;
1779
1780 case MSG_OSD_REP_SCRUB:
1781 replica_scrub(op, handle);
1782 break;
1783
1784 case MSG_OSD_REP_SCRUBMAP:
1785 do_replica_scrub_map(op);
1786 break;
1787
1788 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1789 do_update_log_missing(op);
1790 break;
1791
1792 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1793 do_update_log_missing_reply(op);
1794 break;
1795
1796 default:
11fdf7f2 1797 ceph_abort_msg("bad message type in do_request");
7c673cae
FG
1798 }
1799}
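// The waiting_for_map bookkeeping above preserves per-source ordering: once
// one op from a source is parked, every later op from that source queues
// behind it even if its own min_epoch is already satisfied. A minimal model
// of that rule (hypothetical types, illustration only):
#include <deque>
#include <map>
#include <string>

struct MiniMapGate {
  int cur_epoch = 0;
  std::map<std::string, std::deque<int>> waiting;  // source -> queued min_epochs

  // returns true if the op may be processed now, false if it was queued
  bool admit(const std::string& source, int min_epoch) {
    if (auto p = waiting.find(source); p != waiting.end()) {
      p->second.push_back(min_epoch);   // preserve ordering behind earlier ops
      return false;
    }
    if (cur_epoch < min_epoch) {
      waiting[source].push_back(min_epoch);  // first delayed op opens a queue
      return false;
    }
    return true;
  }
};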
1800
1801hobject_t PrimaryLogPG::earliest_backfill() const
1802{
1803 hobject_t e = hobject_t::get_max();
9f95a23c
TL
1804 for (const pg_shard_t& bt : get_backfill_targets()) {
1805 const pg_info_t &pi = recovery_state.get_peer_info(bt);
1806 e = std::min(pi.last_backfill, e);
7c673cae
FG
1807 }
1808 return e;
1809}
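// earliest_backfill() above is a plain min-fold: start from the maximum
// object and take the smallest last_backfill watermark across all backfill
// targets; everything at or below the result exists on every target.
// Integer sketch (hobject_t reduced to int; illustration only):
#include <algorithm>
#include <limits>
#include <vector>

static int mini_earliest_backfill(const std::vector<int>& last_backfill) {
  int e = std::numeric_limits<int>::max();  // analogue of hobject_t::get_max()
  for (int lb : last_backfill)
    e = std::min(e, lb);
  return e;
}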
1810
1811/** do_op - do an op
1812 * pg lock will be held (if multithreaded)
1813 * osd_lock NOT held.
1814 */
1815void PrimaryLogPG::do_op(OpRequestRef& op)
1816{
11fdf7f2 1817 FUNCTRACE(cct);
7c673cae
FG
1818 // NOTE: take a non-const pointer here; we must be careful not to
1819 // change anything that will break other reads on m (operator<<).
1820 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
11fdf7f2 1821 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
7c673cae
FG
1822 if (m->finish_decode()) {
1823 op->reset_desc(); // for TrackedOp
1824 m->clear_payload();
1825 }
1826
1827 dout(20) << __func__ << ": op " << *m << dendl;
1828
9f95a23c 1829 const hobject_t head = m->get_hobj().get_head();
7c673cae
FG
1830
1831 if (!info.pgid.pgid.contains(
1832 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1833 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1834 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1835 << std::hex << head.get_hash() << std::dec << dendl;
1836 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1837 << " op " << *m;
11fdf7f2 1838 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
7c673cae
FG
1839 return;
1840 }
1841
1842 bool can_backoff =
1843 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
9f95a23c 1844 ceph::ref_t<Session> session;
7c673cae 1845 if (can_backoff) {
11fdf7f2 1846 session = static_cast<Session*>(m->get_connection()->get_priv().get());
7c673cae
FG
1847 if (!session.get()) {
1848 dout(10) << __func__ << " no session" << dendl;
1849 return;
1850 }
7c673cae
FG
1851
1852 if (session->check_backoff(cct, info.pgid, head, m)) {
1853 return;
1854 }
1855 }
1856
1857 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1858 // not implemented.
1859 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1860 osd->reply_op_error(op, -EINVAL);
1861 return;
1862 }
1863
9f95a23c
TL
1864 {
1865 int r = op->maybe_init_op_info(*get_osdmap());
7c673cae
FG
1866 if (r) {
1867 osd->reply_op_error(op, r);
1868 return;
1869 }
1870 }
1871
1872 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1873 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1874 op->may_read() &&
1875 !(op->may_write() || op->may_cache())) {
1876 // balanced reads; any replica will do
9f95a23c 1877 if (!(is_primary() || is_nonprimary())) {
7c673cae
FG
1878 osd->handle_misdirected_op(this, op);
1879 return;
1880 }
1881 } else {
1882 // normal case; must be primary
1883 if (!is_primary()) {
1884 osd->handle_misdirected_op(this, op);
1885 return;
1886 }
1887 }
1888
9f95a23c
TL
1889 if (!check_laggy(op)) {
1890 return;
1891 }
1892
7c673cae
FG
1893 if (!op_has_sufficient_caps(op)) {
1894 osd->reply_op_error(op, -EPERM);
1895 return;
1896 }
1897
31f18b77
FG
1898 if (op->includes_pg_op()) {
1899 return do_pg_op(op);
1900 }
1901
7c673cae
FG
1902 // object name too long?
1903 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1904 dout(4) << "do_op name is longer than "
1905 << cct->_conf->osd_max_object_name_len
1906 << " bytes" << dendl;
1907 osd->reply_op_error(op, -ENAMETOOLONG);
1908 return;
1909 }
1910 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1911 dout(4) << "do_op locator is longer than "
1912 << cct->_conf->osd_max_object_name_len
1913 << " bytes" << dendl;
1914 osd->reply_op_error(op, -ENAMETOOLONG);
1915 return;
1916 }
1917 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1918 dout(4) << "do_op namespace is longer than "
1919 << cct->_conf->osd_max_object_namespace_len
1920 << " bytes" << dendl;
1921 osd->reply_op_error(op, -ENAMETOOLONG);
1922 return;
1923 }
494da23a
TL
1924 if (m->get_hobj().oid.name.empty()) {
1925 dout(4) << "do_op empty oid name is not allowed" << dendl;
1926 osd->reply_op_error(op, -EINVAL);
1927 return;
1928 }
7c673cae
FG
1929
1930 if (int r = osd->store->validate_hobject_key(head)) {
1931 dout(4) << "do_op object " << head << " invalid for backing store: "
1932 << r << dendl;
1933 osd->reply_op_error(op, r);
1934 return;
1935 }
1936
1937 // blacklisted?
1938 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1939 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1940 osd->reply_op_error(op, -EBLACKLISTED);
1941 return;
1942 }
1943
1944 // order this op as a write?
1945 bool write_ordered = op->rwordered();
1946
1947 // discard due to cluster full transition? (we discard any op that
1948 // originates before the cluster or pool is marked full; the client
1949 // will resend after the full flag is removed or if they expect the
1950 // op to succeed despite being full). The exception is FULL_FORCE and
1951 // FULL_TRY ops, which we have no reason to discard because they
1952 // bypass all full checks anyway. If this op isn't write-ordered, we
1953 // skip the check.
1954 // FIXME: we exclude mds writes for now.
1955 if (write_ordered && !(m->get_source().is_mds() ||
1956 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1957 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1958 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1959 dout(10) << __func__ << " discarding op sent before full " << m << " "
1960 << *m << dendl;
1961 return;
1962 }
1963 // The MDS should have stopped writing before this point.
1964 // We can't allow the OSD to become non-startable even if the MDS
1965 // is still writing as part of file removals.
11fdf7f2
TL
1966 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
1967 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
1968 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
7c673cae
FG
1969 return;
1970 }
1971 int64_t poolid = get_pgid().pool();
1972 if (op->may_write()) {
1973
1974 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1975 if (!pi) {
1976 return;
1977 }
1978
1979 // invalid?
1980 if (m->get_snapid() != CEPH_NOSNAP) {
1981 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1982 osd->reply_op_error(op, -EINVAL);
1983 return;
1984 }
1985
1986 // too big?
1987 if (cct->_conf->osd_max_write_size &&
1988 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
1989 // journal can't hold commit!
1990 derr << "do_op msg data len " << m->get_data_len()
1991 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
1992 << " on " << *m << dendl;
1993 osd->reply_op_error(op, -OSD_WRITETOOBIG);
1994 return;
1995 }
1996 }
1997
1998 dout(10) << "do_op " << *m
1999 << (op->may_write() ? " may_write" : "")
2000 << (op->may_read() ? " may_read" : "")
2001 << (op->may_cache() ? " may_cache" : "")
2002 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2003 << " flags " << ceph_osd_flag_string(m->get_flags())
2004 << dendl;
2005
2006 // missing object?
2007 if (is_unreadable_object(head)) {
224ce89b
WB
2008 if (!is_primary()) {
2009 osd->reply_op_error(op, -EAGAIN);
2010 return;
2011 }
7c673cae 2012 if (can_backoff &&
11fdf7f2 2013 (g_conf()->osd_backoff_on_degraded ||
9f95a23c
TL
2014 (g_conf()->osd_backoff_on_unfound &&
2015 recovery_state.get_missing_loc().is_unfound(head)))) {
7c673cae
FG
2016 add_backoff(session, head, head);
2017 maybe_kick_recovery(head);
2018 } else {
2019 wait_for_unreadable_object(head, op);
2020 }
2021 return;
2022 }
2023
11fdf7f2
TL
2024 if (write_ordered) {
2025 // degraded object?
2026 if (is_degraded_or_backfilling_object(head)) {
2027 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2028 add_backoff(session, head, head);
2029 maybe_kick_recovery(head);
2030 } else {
2031 wait_for_degraded_object(head, op);
2032 }
2033 return;
7c673cae 2034 }
7c673cae 2035
11fdf7f2
TL
2036 if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
2037 dout(20) << __func__ << ": waiting for scrub" << dendl;
2038 waiting_for_scrub.push_back(op);
2039 op->mark_delayed("waiting for scrub");
2040 return;
2041 }
9f95a23c
TL
2042 if (!check_laggy_requeue(op)) {
2043 return;
2044 }
7c673cae 2045
11fdf7f2
TL
2046 // blocked on snap?
2047 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2048 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2049 hobject_t to_wait_on(head);
2050 to_wait_on.snap = blocked_iter->second;
2051 wait_for_degraded_object(to_wait_on, op);
2052 return;
2053 }
2054 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2055 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2056 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2057 return;
2058 }
2059 if (objects_blocked_on_cache_full.count(head)) {
2060 block_write_on_full_cache(head, op);
2061 return;
2062 }
7c673cae
FG
2063 }
2064
2065 // dup/resent?
2066 if (op->may_write() || op->may_cache()) {
2067 // warning: we will get back *a* request for this reqid, but not
2068 // necessarily the most recent. this happens with flush and
2069 // promote ops, but we can't possibly have both in our log where
2070 // the original request is still not stable on disk, so for our
2071 // purposes here it doesn't matter which one we get.
2072 eversion_t version;
2073 version_t user_version;
2074 int return_code = 0;
9f95a23c 2075 vector<pg_log_op_return_item_t> op_returns;
7c673cae 2076 bool got = check_in_progress_op(
9f95a23c 2077 m->get_reqid(), &version, &user_version, &return_code, &op_returns);
7c673cae
FG
2078 if (got) {
2079 dout(3) << __func__ << " dup " << m->get_reqid()
2080 << " version " << version << dendl;
2081 if (already_complete(version)) {
9f95a23c 2082 osd->reply_op_error(op, return_code, version, user_version, op_returns);
7c673cae
FG
2083 } else {
2084 dout(10) << " waiting for " << version << " to commit" << dendl;
2085 // always queue ondisk waiters, so that we can requeue if needed
9f95a23c
TL
2086 waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2087 op_returns);
7c673cae
FG
2088 op->mark_delayed("waiting for ondisk");
2089 }
2090 return;
2091 }
2092 }
2093
2094 ObjectContextRef obc;
11fdf7f2 2095 bool can_create = op->may_write();
7c673cae 2096 hobject_t missing_oid;
11fdf7f2
TL
2097
2098 // kludge around the fact that LIST_SNAPS uses CEPH_SNAPDIR as its snapid: look up the head object in that case
11fdf7f2 2099 const hobject_t& oid =
9f95a23c 2100 m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
11fdf7f2
TL
2101
2102 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2103 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2104 OSDOp& osd_op = *p;
2105
2106 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2107 if (m->get_snapid() != CEPH_SNAPDIR) {
2108 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2109 osd->reply_op_error(op, -EINVAL);
2110 return;
2111 }
2112 } else {
2113 if (m->get_snapid() == CEPH_SNAPDIR) {
2114 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2115 osd->reply_op_error(op, -EINVAL);
2116 return;
2117 }
2118 }
2119 }
7c673cae
FG
2120
2121 // io blocked on obc?
2122 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
11fdf7f2 2123 maybe_await_blocked_head(oid, op)) {
7c673cae
FG
2124 return;
2125 }
2126
9f95a23c
TL
2127 if (!is_primary()) {
2128 if (!recovery_state.can_serve_replica_read(oid)) {
2129 dout(20) << __func__ << ": oid " << oid
2130 << " unstable write on replica, bouncing to primary."
2131 << *m << dendl;
2132 osd->reply_op_error(op, -EAGAIN);
2133 return;
2134 } else {
2135 dout(20) << __func__ << ": serving replica read on oid " << oid
2136 << dendl;
2137 }
2138 }
2139
7c673cae
FG
2140 int r = find_object_context(
2141 oid, &obc, can_create,
2142 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2143 &missing_oid);
2144
11fdf7f2
TL
2145 // LIST_SNAPS needs the ssc too
2146 if (obc &&
2147 m->get_snapid() == CEPH_SNAPDIR &&
2148 !obc->ssc) {
2149 obc->ssc = get_snapset_context(oid, true);
2150 }
2151
7c673cae
FG
2152 if (r == -EAGAIN) {
2153 // If this OSD is not the primary, we just reply -EAGAIN. Otherwise,
2154 // we have to wait for the object.
2155 if (is_primary()) {
2156 // missing the specific snap we need; requeue and wait.
11fdf7f2 2157 ceph_assert(!op->may_write()); // only happens on a read/cache
7c673cae
FG
2158 wait_for_unreadable_object(missing_oid, op);
2159 return;
2160 }
2161 } else if (r == 0) {
2162 if (is_unreadable_object(obc->obs.oi.soid)) {
2163 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2164 << " is unreadable, waiting" << dendl;
2165 wait_for_unreadable_object(obc->obs.oi.soid, op);
2166 return;
2167 }
2168
2169 // degraded object? (the check above was for head; this could be a clone)
2170 if (write_ordered &&
2171 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2172 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2173 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2174 << " is degraded, waiting" << dendl;
2175 wait_for_degraded_object(obc->obs.oi.soid, op);
2176 return;
2177 }
2178 }
2179
2180 bool in_hit_set = false;
2181 if (hit_set) {
2182 if (obc.get()) {
2183 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2184 in_hit_set = true;
2185 } else {
2186 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2187 in_hit_set = true;
2188 }
2189 if (!op->hitset_inserted) {
2190 hit_set->insert(oid);
2191 op->hitset_inserted = true;
2192 if (hit_set->is_full() ||
2193 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2194 hit_set_persist();
2195 }
2196 }
2197 }
2198
2199 if (agent_state) {
2200 if (agent_choose_mode(false, op))
2201 return;
2202 }
2203
31f18b77
FG
2204 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2205 if (maybe_handle_manifest(op,
2206 write_ordered,
2207 obc))
2208 return;
2209 }
2210
7c673cae
FG
2211 if (maybe_handle_cache(op,
2212 write_ordered,
2213 obc,
2214 r,
2215 missing_oid,
2216 false,
2217 in_hit_set))
2218 return;
2219
2220 if (r && (r != -ENOENT || !obc)) {
2221 // copy the reqids for copy get on ENOENT
2222 if (r == -ENOENT &&
2223 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2224 fill_in_copy_get_noent(op, oid, m->ops[0]);
2225 return;
2226 }
224ce89b 2227 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
7c673cae 2228 if (op->may_write() &&
9f95a23c 2229 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
2230 record_write_error(op, oid, nullptr, r);
2231 } else {
2232 osd->reply_op_error(op, r);
2233 }
2234 return;
2235 }
2236
2237 // make sure locator is consistent
2238 object_locator_t oloc(obc->obs.oi.soid);
2239 if (m->get_object_locator() != oloc) {
2240 dout(10) << " provided locator " << m->get_object_locator()
2241 << " != object's " << obc->obs.oi.soid << dendl;
2242 osd->clog->warn() << "bad locator " << m->get_object_locator()
2243 << " on object " << oloc
2244 << " op " << *m;
2245 }
2246
2247 // io blocked on obc?
2248 if (obc->is_blocked() &&
2249 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2250 wait_for_blocked_object(obc->obs.oi.soid, op);
2251 return;
2252 }
2253
2254 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2255
c07f9fc5 2256 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
7c673cae 2257
7c673cae
FG
2258 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2259 dout(20) << __func__ << ": skipping rw locks" << dendl;
2260 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2261 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2262
2263 // verify there is in fact a flush in progress
2264 // FIXME: we could make this a stronger test.
2265 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2266 if (p == flush_ops.end()) {
2267 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2268 reply_ctx(ctx, -EINVAL);
2269 return;
2270 }
2271 } else if (!get_rw_locks(write_ordered, ctx)) {
2272 dout(20) << __func__ << " waiting for rw locks " << dendl;
2273 op->mark_delayed("waiting for rw locks");
2274 close_op_ctx(ctx);
2275 return;
2276 }
2277 dout(20) << __func__ << " obc " << *obc << dendl;
2278
2279 if (r) {
2280 dout(20) << __func__ << " returned an error: " << r << dendl;
7c673cae 2281 if (op->may_write() &&
9f95a23c
TL
2282 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2283 record_write_error(op, oid, nullptr, r,
2284 ctx->op->allows_returnvec() ? ctx : nullptr);
7c673cae
FG
2285 } else {
2286 osd->reply_op_error(op, r);
2287 }
9f95a23c 2288 close_op_ctx(ctx);
7c673cae
FG
2289 return;
2290 }
2291
2292 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2293 ctx->ignore_cache = true;
2294 }
2295
2296 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2297 // This object is lost. Reading from it returns an error.
2298 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2299 << " is lost" << dendl;
2300 reply_ctx(ctx, -ENFILE);
2301 return;
2302 }
2303 if (!op->may_write() &&
2304 !op->may_cache() &&
2305 (!obc->obs.exists ||
2306 ((m->get_snapid() != CEPH_SNAPDIR) &&
2307 obc->obs.oi.is_whiteout()))) {
2308 // copy the reqids for copy get on ENOENT
2309 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2310 fill_in_copy_get_noent(op, oid, m->ops[0]);
2311 close_op_ctx(ctx);
2312 return;
2313 }
2314 reply_ctx(ctx, -ENOENT);
2315 return;
2316 }
2317
2318 op->mark_started();
2319
2320 execute_ctx(ctx);
2321 utime_t prepare_latency = ceph_clock_now();
2322 prepare_latency -= op->get_dequeued_time();
2323 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2324 if (op->may_read() && op->may_write()) {
2325 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2326 } else if (op->may_read()) {
2327 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2328 } else if (op->may_write() || op->may_cache()) {
2329 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2330 }
2331
2332 // force recovery of the oldest missing object if there are too many log entries
2333 maybe_force_recovery();
2334}
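// A toy model (hypothetical names; illustration only) of the dup handling
// above: a resent write whose reqid is still known either gets an immediate
// reply, because its version is already complete, or parks on
// waiting_for_ondisk until that version commits.
#include <map>
#include <string>

struct MiniDupCheck {
  std::map<std::string, int> known_reqids;     // reqid -> version it maps to
  int last_complete = 0;                       // versions <= this are durable
  std::multimap<int, std::string> waiting_for_ondisk;

  // returns true if the op was consumed as a duplicate
  bool handle_dup(const std::string& reqid) {
    auto p = known_reqids.find(reqid);
    if (p == known_reqids.end())
      return false;                            // genuinely new op
    if (p->second <= last_complete)
      return true;                             // reply with recorded result
    waiting_for_ondisk.emplace(p->second, reqid);  // reply once it commits
    return true;
  }
};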
b32b8144 2335
31f18b77
FG
2336PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2337 OpRequestRef op,
2338 bool write_ordered,
2339 ObjectContextRef obc)
2340{
11fdf7f2 2341 ceph_assert(obc);
9f95a23c 2342 if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
31f18b77
FG
2343 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2344 return cache_result_t::NOOP;
2345 }
2346
31f18b77 2347 // if it is write-ordered and blocked, stop now
11fdf7f2 2348 if (obc->is_blocked() && write_ordered) {
31f18b77
FG
2349 // we're already doing something with this object
2350 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2351 return cache_result_t::NOOP;
2352 }
2353
9f95a23c 2354 vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
31f18b77
FG
2355 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2356 OSDOp& osd_op = *p;
2357 ceph_osd_op& op = osd_op.op;
11fdf7f2
TL
2358 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2359 op.op == CEPH_OSD_OP_SET_CHUNK ||
9f95a23c
TL
2360 op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
2361 op.op == CEPH_OSD_OP_TIER_FLUSH) {
2362 return cache_result_t::NOOP;
2363 } else if (op.op == CEPH_OSD_OP_TIER_PROMOTE) {
2364 bool is_dirty = false;
2365 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2366 if (p.second.is_dirty()) {
2367 is_dirty = true;
2368 }
2369 }
2370 if (is_dirty) {
2371 start_flush(OpRequestRef(), obc, true, NULL, std::nullopt);
2372 }
31f18b77
FG
2373 return cache_result_t::NOOP;
2374 }
2375 }
2376
2377 switch (obc->obs.oi.manifest.type) {
2378 case object_manifest_t::TYPE_REDIRECT:
2379 if (op->may_write() || write_ordered) {
11fdf7f2 2380 do_proxy_write(op, obc);
31f18b77 2381 } else {
11fdf7f2
TL
2382 // promoted object
2383 if (obc->obs.oi.size != 0) {
2384 return cache_result_t::NOOP;
2385 }
31f18b77
FG
2386 do_proxy_read(op, obc);
2387 }
2388 return cache_result_t::HANDLED_PROXY;
11fdf7f2
TL
2389 case object_manifest_t::TYPE_CHUNKED:
2390 {
2391 if (can_proxy_chunked_read(op, obc)) {
2392 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2393 if (p != flush_ops.end()) {
2394 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2395 return cache_result_t::HANDLED_PROXY;
2396 }
2397 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2398 return cache_result_t::HANDLED_PROXY;
2399 }
2400
2401 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2402 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2403 hobject_t head = m->get_hobj();
2404
2405 if (is_degraded_or_backfilling_object(head)) {
2406 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2407 wait_for_degraded_object(head, op);
2408 return cache_result_t::BLOCKED_RECOVERY;
2409 }
2410
2411 if (write_blocked_by_scrub(head)) {
2412 dout(20) << __func__ << ": waiting for scrub" << dendl;
2413 waiting_for_scrub.push_back(op);
2414 op->mark_delayed("waiting for scrub");
2415 return cache_result_t::BLOCKED_RECOVERY;
2416 }
9f95a23c
TL
2417 if (!check_laggy_requeue(op)) {
2418 return cache_result_t::BLOCKED_RECOVERY;
2419 }
11fdf7f2
TL
2420
2421 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2422 if (p.second.is_missing()) {
9f95a23c 2423 auto m = op->get_req<MOSDOp>();
11fdf7f2
TL
2424 const object_locator_t oloc = m->get_object_locator();
2425 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2426 return cache_result_t::BLOCKED_PROMOTE;
2427 }
2428 }
2429
2430 bool all_dirty = true;
2431 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2432 if (!p.second.is_dirty()) {
2433 all_dirty = false;
2434 }
2435 }
2436 if (all_dirty) {
9f95a23c 2437 start_flush(OpRequestRef(), obc, true, NULL, std::nullopt);
11fdf7f2
TL
2438 }
2439 return cache_result_t::NOOP;
2440 }
31f18b77 2441 default:
11fdf7f2 2442 ceph_abort_msg("unrecognized manifest type");
31f18b77
FG
2443 }
2444
2445 return cache_result_t::NOOP;
2446}
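// The two dirty scans above are reductions over the chunk_map: TIER_PROMOTE
// flushes when any chunk is dirty, and the TYPE_CHUNKED path flushes only
// when every chunk is dirty. Equivalent short-circuiting forms (a sketch;
// a bool stands in for the per-chunk dirty bit), for contrast with the
// hand-rolled loops above, which scan the whole map:
#include <algorithm>
#include <cstdint>
#include <map>

static bool any_chunk_dirty(const std::map<uint64_t, bool>& chunks) {
  return std::any_of(chunks.begin(), chunks.end(),
                     [](const auto& c) { return c.second; });
}

static bool all_chunks_dirty(const std::map<uint64_t, bool>& chunks) {
  return std::all_of(chunks.begin(), chunks.end(),
                     [](const auto& c) { return c.second; });
}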
7c673cae 2447
11fdf7f2
TL
2448struct C_ManifestFlush : public Context {
2449 PrimaryLogPGRef pg;
2450 hobject_t oid;
2451 epoch_t lpr;
2452 ceph_tid_t tid;
2453 utime_t start;
2454 uint64_t offset;
2455 uint64_t last_offset;
2456 C_ManifestFlush(PrimaryLogPG *p, hobject_t o, epoch_t e)
2457 : pg(p), oid(o), lpr(e),
2458 tid(0), start(ceph_clock_now())
2459 {}
2460 void finish(int r) override {
2461 if (r == -ECANCELED)
2462 return;
9f95a23c 2463 std::scoped_lock locker{*pg};
11fdf7f2
TL
2464 pg->handle_manifest_flush(oid, tid, r, offset, last_offset, lpr);
2465 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
11fdf7f2
TL
2466 }
2467};
2468
2469void PrimaryLogPG::handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2470 uint64_t offset, uint64_t last_offset,
2471 epoch_t lpr)
2472{
2473 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2474 if (p == flush_ops.end()) {
2475 dout(10) << __func__ << " no flush_op found" << dendl;
2476 return;
2477 }
2478 if (p->second->rval < 0) {
2479 return;
2480 }
2481 p->second->io_results[offset] = r;
2482 for (auto &ior: p->second->io_results) {
2483 if (ior.second < 0) {
2484 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2485 p->second->rval = r;
2486 return;
2487 }
2488 }
2489 if (p->second->chunks == p->second->io_results.size()) {
2490 if (lpr == get_last_peering_reset()) {
2491 ceph_assert(p->second->obc);
2492 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2493 }
2494 }
2495}
2496
2497int PrimaryLogPG::start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
9f95a23c 2498 std::optional<std::function<void()>> &&on_flush)
11fdf7f2
TL
2499{
2500 auto p = obc->obs.oi.manifest.chunk_map.begin();
2501 FlushOpRef manifest_fop(std::make_shared<FlushOp>());
2502 manifest_fop->op = op;
2503 manifest_fop->obc = obc;
2504 manifest_fop->flushed_version = obc->obs.oi.user_version;
2505 manifest_fop->blocking = blocking;
2506 manifest_fop->on_flush = std::move(on_flush);
2507 int r = do_manifest_flush(op, obc, manifest_fop, p->first, blocking);
2508 if (r < 0) {
2509 return r;
2510 }
2511
2512 flush_ops[obc->obs.oi.soid] = manifest_fop;
2513 return -EINPROGRESS;
2514}
2515
2516int PrimaryLogPG::do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
2517 uint64_t start_offset, bool block)
2518{
2519 struct object_manifest_t &manifest = obc->obs.oi.manifest;
2520 hobject_t soid = obc->obs.oi.soid;
2521 ceph_tid_t tid;
2522 SnapContext snapc;
2523 uint64_t max_copy_size = 0, last_offset = 0;
2524
2525 map<uint64_t, chunk_info_t>::iterator iter = manifest.chunk_map.find(start_offset);
2526 ceph_assert(iter != manifest.chunk_map.end());
2527 for (;iter != manifest.chunk_map.end(); ++iter) {
2528 if (iter->second.is_dirty()) {
2529 last_offset = iter->first;
2530 max_copy_size += iter->second.length;
2531 }
2532 if (get_copy_chunk_size() < max_copy_size) {
2533 break;
2534 }
2535 }
2536
2537 iter = manifest.chunk_map.find(start_offset);
2538 for (;iter != manifest.chunk_map.end(); ++iter) {
2539 if (!iter->second.is_dirty()) {
2540 continue;
2541 }
2542 uint64_t tgt_length = iter->second.length;
2543 uint64_t tgt_offset = iter->second.offset;
2544 hobject_t tgt_soid = iter->second.oid;
2545 object_locator_t oloc(tgt_soid);
2546 ObjectOperation obj_op;
2547 bufferlist chunk_data;
2548 int r = pgbackend->objects_read_sync(
2549 soid, iter->first, tgt_length, 0, &chunk_data);
2550 if (r < 0) {
2551 dout(0) << __func__ << " read fail, offset: " << tgt_offset
2552 << " len: " << tgt_length << " r: " << r << dendl;
2553 return r;
2554 }
2555 if (!chunk_data.length()) {
2556 return -ENODATA;
2557 }
2558
2559 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
2560 CEPH_OSD_FLAG_RWORDERED;
2561 tgt_length = chunk_data.length();
9f95a23c
TL
2562 if (pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
2563 iter->second.has_reference() &&
2564 fp_algo != pg_pool_t::TYPE_FINGERPRINT_NONE) {
2565 object_t fp_oid = [fp_algo, &chunk_data]() -> string {
2566 switch (fp_algo) {
11fdf7f2 2567 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
9f95a23c
TL
2568 return crypto::digest<crypto::SHA1>(chunk_data).to_str();
2569 case pg_pool_t::TYPE_FINGERPRINT_SHA256:
2570 return crypto::digest<crypto::SHA256>(chunk_data).to_str();
2571 case pg_pool_t::TYPE_FINGERPRINT_SHA512:
2572 return crypto::digest<crypto::SHA512>(chunk_data).to_str();
11fdf7f2
TL
2573 default:
2574 assert(0 == "unrecognized fingerprint type");
9f95a23c
TL
2575 return {};
2576 }
2577 }();
2578 bufferlist in;
2579 if (fp_oid != tgt_soid.oid) {
2580 // decrement old chunk's reference count
2581 ObjectOperation dec_op;
2582 cls_chunk_refcount_put_op put_call;
2583 put_call.source = soid;
2584 ::encode(put_call, in);
2585 dec_op.call("cas", "chunk_put", in);
2586 // we don't care about dec_op's completion; scrub for dedup will fix this.
2587 tid = osd->objecter->mutate(
2588 tgt_soid.oid, oloc, dec_op, snapc,
2589 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2590 flags, NULL);
2591 in.clear();
2592 }
2593 tgt_soid.oid = fp_oid;
2594 iter->second.oid = tgt_soid;
2595 // add data op
2596 ceph_osd_op osd_op;
2597 osd_op.extent.offset = 0;
2598 osd_op.extent.length = chunk_data.length();
2599 encode(osd_op, in);
2600 encode(soid, in);
2601 in.append(chunk_data);
2602 obj_op.call("cas", "cas_write_or_get", in);
11fdf7f2
TL
2603 } else {
2604 obj_op.add_data(CEPH_OSD_OP_WRITE, tgt_offset, tgt_length, chunk_data);
2605 }
2606
2607 C_ManifestFlush *fin = new C_ManifestFlush(this, soid, get_last_peering_reset());
2608 fin->offset = iter->first;
2609 fin->last_offset = last_offset;
2610 manifest_fop->chunks++;
2611
11fdf7f2
TL
2612 tid = osd->objecter->mutate(
2613 tgt_soid.oid, oloc, obj_op, snapc,
2614 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
9f95a23c 2615 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())));
11fdf7f2
TL
2616 fin->tid = tid;
2617 manifest_fop->io_tids[iter->first] = tid;
2618
2619 dout(20) << __func__ << " offset: " << tgt_offset << " len: " << tgt_length
2620 << " oid: " << tgt_soid.oid << " ori oid: " << soid.oid.name
2621 << " tid: " << tid << dendl;
2622 if (last_offset < iter->first) {
2623 break;
2624 }
2625 }
2626
2627 return 0;
2628}
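// A hedged sketch of the chunk-naming rule in do_manifest_flush() above:
// when the pool has a fingerprint algorithm, a dirty chunk's target oid is
// the digest of the chunk's bytes, so identical chunks dedup to the same
// CAS object; if the derived name differs from the manifest's current
// target, the old target's refcount is dropped. std::hash stands in for
// SHA-1/256/512 here (illustration only, not a real fingerprint).
#include <functional>
#include <string>

static std::string sketch_chunk_target(const std::string& chunk_bytes,
                                       bool fingerprinting,
                                       const std::string& current_target) {
  if (!fingerprinting)
    return current_target;                 // keep the manifest's existing oid
  return std::to_string(std::hash<std::string>{}(chunk_bytes));
}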
2629
2630void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2631 ObjectContextRef obc, uint64_t last_offset)
2632{
2633 dout(10) << __func__ << " " << oid << " tid " << tid
2634 << " " << cpp_strerror(r) << " last_offset: " << last_offset << dendl;
2635 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2636 if (p == flush_ops.end()) {
2637 dout(10) << __func__ << " no flush_op found" << dendl;
2638 return;
2639 }
2640 map<uint64_t, chunk_info_t>::iterator iter =
2641 obc->obs.oi.manifest.chunk_map.find(last_offset);
2642 ceph_assert(iter != obc->obs.oi.manifest.chunk_map.end());
2643 for (;iter != obc->obs.oi.manifest.chunk_map.end(); ++iter) {
2644 if (iter->second.is_dirty() && last_offset < iter->first) {
2645 do_manifest_flush(p->second->op, obc, p->second, iter->first, p->second->blocking);
2646 return;
2647 }
2648 }
2649 finish_flush(oid, tid, r);
2650}
2651
7c673cae 2652void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
9f95a23c
TL
2653 MOSDOpReply *orig_reply, int r,
2654 OpContext *ctx_for_op_returns)
7c673cae
FG
2655{
2656 dout(20) << __func__ << " r=" << r << dendl;
11fdf7f2 2657 ceph_assert(op->may_write());
9f95a23c 2658 const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
31f18b77 2659 mempool::osd_pglog::list<pg_log_entry_t> entries;
7c673cae
FG
2660 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2661 get_next_version(), eversion_t(), 0,
2662 reqid, utime_t(), r));
9f95a23c
TL
2663 if (ctx_for_op_returns) {
2664 entries.back().set_op_returns(*ctx_for_op_returns->ops);
2665 dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2666 }
7c673cae
FG
2667
2668 struct OnComplete {
2669 PrimaryLogPG *pg;
2670 OpRequestRef op;
2671 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2672 int r;
2673 OnComplete(
2674 PrimaryLogPG *pg,
2675 OpRequestRef op,
2676 MOSDOpReply *orig_reply,
2677 int r)
2678 : pg(pg), op(op),
2679 orig_reply(orig_reply, false /* take over ref */), r(r)
2680 {}
2681 void operator()() {
2682 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
9f95a23c 2683 auto m = op->get_req<MOSDOp>();
7c673cae 2684 MOSDOpReply *reply = orig_reply.detach();
7c673cae
FG
2685 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2686 pg->osd->send_message_osd_client(reply, m->get_connection());
2687 }
2688 };
2689
2690 ObcLockManager lock_manager;
2691 submit_log_entries(
2692 entries,
2693 std::move(lock_manager),
9f95a23c 2694 std::optional<std::function<void(void)> >(
7c673cae
FG
2695 OnComplete(this, op, orig_reply, r)),
2696 op,
2697 r);
2698}
2699
2700PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2701 OpRequestRef op,
2702 bool write_ordered,
2703 ObjectContextRef obc,
2704 int r, hobject_t missing_oid,
2705 bool must_promote,
2706 bool in_hit_set,
2707 ObjectContextRef *promote_obc)
2708{
b32b8144
FG
2709 // return quickly if caching is not enabled
2710 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2711 return cache_result_t::NOOP;
2712
7c673cae
FG
2713 if (op &&
2714 op->get_req() &&
2715 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
9f95a23c 2716 (op->get_req<MOSDOp>()->get_flags() &
7c673cae
FG
2717 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2718 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2719 return cache_result_t::NOOP;
2720 }
7c673cae
FG
2721
2722 must_promote = must_promote || op->need_promote();
2723
2724 if (obc)
2725 dout(25) << __func__ << " " << obc->obs.oi << " "
2726 << (obc->obs.exists ? "exists" : "DNE")
2727 << " missing_oid " << missing_oid
2728 << " must_promote " << (int)must_promote
2729 << " in_hit_set " << (int)in_hit_set
2730 << dendl;
2731 else
2732 dout(25) << __func__ << " (no obc)"
2733 << " missing_oid " << missing_oid
2734 << " must_promote " << (int)must_promote
2735 << " in_hit_set " << (int)in_hit_set
2736 << dendl;
2737
2738 // if it is write-ordered and blocked, stop now
2739 if (obc.get() && obc->is_blocked() && write_ordered) {
2740 // we're already doing something with this object
2741 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2742 return cache_result_t::NOOP;
2743 }
2744
2745 if (r == -ENOENT && missing_oid == hobject_t()) {
2746 // we know this object is logically absent (e.g., an undefined clone)
2747 return cache_result_t::NOOP;
2748 }
2749
2750 if (obc.get() && obc->obs.exists) {
2751 osd->logger->inc(l_osd_op_cache_hit);
2752 return cache_result_t::NOOP;
2753 }
b32b8144
FG
2754 if (!is_primary()) {
2755 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2756 osd->reply_op_error(op, -EAGAIN);
2757 return cache_result_t::REPLIED_WITH_EAGAIN;
2758 }
7c673cae
FG
2759
2760 if (missing_oid == hobject_t() && obc.get()) {
2761 missing_oid = obc->obs.oi.soid;
2762 }
2763
9f95a23c 2764 auto m = op->get_req<MOSDOp>();
7c673cae
FG
2765 const object_locator_t oloc = m->get_object_locator();
2766
2767 if (op->need_skip_handle_cache()) {
2768 return cache_result_t::NOOP;
2769 }
2770
7c673cae
FG
2771 OpRequestRef promote_op;
2772
2773 switch (pool.info.cache_mode) {
2774 case pg_pool_t::CACHEMODE_WRITEBACK:
2775 if (agent_state &&
2776 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2777 if (!op->may_write() && !op->may_cache() &&
2778 !write_ordered && !must_promote) {
2779 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2780 do_proxy_read(op);
2781 return cache_result_t::HANDLED_PROXY;
2782 }
2783 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2784 block_write_on_full_cache(missing_oid, op);
2785 return cache_result_t::BLOCKED_FULL;
2786 }
2787
2788 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2789 promote_object(obc, missing_oid, oloc, op, promote_obc);
2790 return cache_result_t::BLOCKED_PROMOTE;
2791 }
2792
2793 if (op->may_write() || op->may_cache()) {
11fdf7f2 2794 do_proxy_write(op);
7c673cae
FG
2795
2796 // Promote too?
2797 if (!op->need_skip_promote() &&
2798 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2799 pool.info.min_write_recency_for_promote,
2800 OpRequestRef(),
2801 promote_obc)) {
2802 return cache_result_t::BLOCKED_PROMOTE;
2803 }
2804 return cache_result_t::HANDLED_PROXY;
2805 } else {
2806 do_proxy_read(op);
2807
2808 // Avoid duplicate promotion
2809 if (obc.get() && obc->is_blocked()) {
2810 if (promote_obc)
2811 *promote_obc = obc;
2812 return cache_result_t::BLOCKED_PROMOTE;
2813 }
2814
2815 // Promote too?
2816 if (!op->need_skip_promote()) {
2817 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2818 pool.info.min_read_recency_for_promote,
2819 promote_op, promote_obc);
2820 }
2821
2822 return cache_result_t::HANDLED_PROXY;
2823 }
11fdf7f2 2824 ceph_abort_msg("unreachable");
7c673cae
FG
2825 return cache_result_t::NOOP;
2826
7c673cae
FG
2827 case pg_pool_t::CACHEMODE_READONLY:
2828 // TODO: clean this case up
2829 if (!obc.get() && r == -ENOENT) {
2830 // we don't have the object and op's a read
2831 promote_object(obc, missing_oid, oloc, op, promote_obc);
2832 return cache_result_t::BLOCKED_PROMOTE;
2833 }
2834 if (!r) { // it must be a write
2835 do_cache_redirect(op);
2836 return cache_result_t::HANDLED_REDIRECT;
2837 }
2838 // crap, there was a failure of some kind
2839 return cache_result_t::NOOP;
2840
9f95a23c
TL
2841 case pg_pool_t::CACHEMODE_FORWARD:
2842 // this mode is deprecated; proxy instead
7c673cae
FG
2843 case pg_pool_t::CACHEMODE_PROXY:
2844 if (!must_promote) {
2845 if (op->may_write() || op->may_cache() || write_ordered) {
11fdf7f2
TL
2846 do_proxy_write(op);
2847 return cache_result_t::HANDLED_PROXY;
7c673cae
FG
2848 } else {
2849 do_proxy_read(op);
2850 return cache_result_t::HANDLED_PROXY;
2851 }
2852 }
2853 // ugh, we're forced to promote.
2854 if (agent_state &&
2855 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2856 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2857 block_write_on_full_cache(missing_oid, op);
2858 return cache_result_t::BLOCKED_FULL;
2859 }
2860 promote_object(obc, missing_oid, oloc, op, promote_obc);
2861 return cache_result_t::BLOCKED_PROMOTE;
2862
9f95a23c
TL
2863 case pg_pool_t::CACHEMODE_READFORWARD:
2864 // this mode is deprecated; proxy instead
7c673cae
FG
2865 case pg_pool_t::CACHEMODE_READPROXY:
2866 // Do writeback to the cache tier for writes
2867 if (op->may_write() || write_ordered || must_promote) {
2868 if (agent_state &&
2869 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2870 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2871 block_write_on_full_cache(missing_oid, op);
2872 return cache_result_t::BLOCKED_FULL;
2873 }
2874 promote_object(obc, missing_oid, oloc, op, promote_obc);
2875 return cache_result_t::BLOCKED_PROMOTE;
2876 }
2877
2878 // Otherwise it is a read: proxy it
2879 do_proxy_read(op);
2880 return cache_result_t::HANDLED_PROXY;
2881
2882 default:
11fdf7f2 2883 ceph_abort_msg("unrecognized cache_mode");
7c673cae
FG
2884 }
2885 return cache_result_t::NOOP;
2886}
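// A decision slice (illustration only; names are hypothetical) of the
// CACHEMODE_WRITEBACK branch above when the tier agent reports
// EVICT_MODE_FULL: plain reads are proxied to the base tier, while anything
// write-ordered, cacheable, or forced to promote must block until space
// frees up.
enum class MiniAction { PROXY_READ, BLOCK_FULL };

static MiniAction writeback_full_action(bool may_write, bool may_cache,
                                        bool write_ordered, bool must_promote) {
  if (!may_write && !may_cache && !write_ordered && !must_promote)
    return MiniAction::PROXY_READ;
  return MiniAction::BLOCK_FULL;
}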
2887
2888bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2889 const hobject_t& missing_oid,
2890 const object_locator_t& oloc,
2891 bool in_hit_set,
2892 uint32_t recency,
2893 OpRequestRef promote_op,
2894 ObjectContextRef *promote_obc)
2895{
2896 dout(20) << __func__ << " missing_oid " << missing_oid
2897 << " in_hit_set " << in_hit_set << dendl;
2898
2899 switch (recency) {
2900 case 0:
2901 break;
2902 case 1:
2903 // Check if in the current hit set
2904 if (in_hit_set) {
2905 break;
2906 } else {
2907 // not promoting
2908 return false;
2909 }
2910 break;
2911 default:
2912 {
2913 unsigned count = (int)in_hit_set;
2914 if (count) {
2915 // Check if in other hit sets
2916 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2917 for (map<time_t,HitSetRef>::reverse_iterator itor =
2918 agent_state->hit_set_map.rbegin();
2919 itor != agent_state->hit_set_map.rend();
2920 ++itor) {
2921 if (!itor->second->contains(oid)) {
2922 break;
2923 }
2924 ++count;
2925 if (count >= recency) {
2926 break;
2927 }
2928 }
2929 }
2930 if (count >= recency) {
2931 break;
2932 }
2933 return false; // not promoting
2934 }
2935 break;
2936 }
2937
2938 if (osd->promote_throttle()) {
2939 dout(10) << __func__ << " promote throttled" << dendl;
2940 return false;
2941 }
2942 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2943 return true;
2944}
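// The recency rule above in isolation: recency 0 always allows promotion,
// recency 1 requires a hit in the current hit set, and larger values
// require hits in that many consecutive hit sets, newest first, stopping at
// the first miss (promote_throttle() is still the final gate). Sketch with
// hypothetical inputs; the vector is newest-to-oldest hit-set membership.
#include <cstdint>
#include <vector>

static bool recency_allows_promote(uint32_t recency, bool in_current,
                                   const std::vector<bool>& in_older) {
  if (recency == 0)
    return true;
  if (!in_current)
    return false;
  uint32_t count = 1;
  for (bool hit : in_older) {
    if (!hit || count >= recency)
      break;
    ++count;
  }
  return count >= recency;
}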
2945
2946void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2947{
9f95a23c 2948 auto m = op->get_req<MOSDOp>();
7c673cae 2949 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
11fdf7f2
TL
2950 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2951 flags, false);
7c673cae
FG
2952 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2953 reply->set_redirect(redir);
2954 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2955 << op << dendl;
2956 m->get_connection()->send_message(reply);
2957 return;
2958}
2959
2960struct C_ProxyRead : public Context {
2961 PrimaryLogPGRef pg;
2962 hobject_t oid;
2963 epoch_t last_peering_reset;
2964 ceph_tid_t tid;
2965 PrimaryLogPG::ProxyReadOpRef prdop;
2966 utime_t start;
2967 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2968 const PrimaryLogPG::ProxyReadOpRef& prd)
2969 : pg(p), oid(o), last_peering_reset(lpr),
2970 tid(0), prdop(prd), start(ceph_clock_now())
2971 {}
2972 void finish(int r) override {
2973 if (prdop->canceled)
2974 return;
9f95a23c 2975 std::scoped_lock locker{*pg};
7c673cae 2976 if (prdop->canceled) {
7c673cae
FG
2977 return;
2978 }
2979 if (last_peering_reset == pg->get_last_peering_reset()) {
2980 pg->finish_proxy_read(oid, tid, r);
2981 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2982 }
7c673cae
FG
2983 }
2984};
2985
11fdf7f2
TL
2986struct C_ProxyChunkRead : public Context {
2987 PrimaryLogPGRef pg;
2988 hobject_t oid;
2989 epoch_t last_peering_reset;
2990 ceph_tid_t tid;
2991 PrimaryLogPG::ProxyReadOpRef prdop;
2992 utime_t start;
2993 ObjectOperation *obj_op;
2994 int op_index = 0;
2995 uint64_t req_offset = 0;
2996 ObjectContextRef obc;
2997 uint64_t req_total_len = 0;
2998 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2999 const PrimaryLogPG::ProxyReadOpRef& prd)
3000 : pg(p), oid(o), last_peering_reset(lpr),
3001 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
3002 {}
3003 void finish(int r) override {
3004 if (prdop->canceled)
3005 return;
9f95a23c 3006 std::scoped_lock locker{*pg};
11fdf7f2 3007 if (prdop->canceled) {
11fdf7f2
TL
3008 return;
3009 }
3010 if (last_peering_reset == pg->get_last_peering_reset()) {
3011 if (r >= 0) {
3012 if (!prdop->ops[op_index].outdata.length()) {
3013 ceph_assert(req_total_len);
3014 bufferlist list;
3015 bufferptr bptr(req_total_len);
3016 list.push_back(std::move(bptr));
3017 prdop->ops[op_index].outdata.append(list);
3018 }
3019 ceph_assert(obj_op);
3020 uint64_t copy_offset;
3021 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
3022 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
3023 } else {
3024 copy_offset = 0;
3025 }
9f95a23c
TL
3026 prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
3027 obj_op->ops[0].outdata.length(),
3028 obj_op->ops[0].outdata.c_str());
11fdf7f2
TL
3029 }
3030
3031 pg->finish_proxy_read(oid, tid, r);
3032 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3033 if (obj_op) {
3034 delete obj_op;
3035 }
3036 }
11fdf7f2
TL
3037 }
3038};
3039
31f18b77 3040void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
7c673cae
FG
3041{
3042 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3043 // stash the result in the request's OSDOp vector
3044 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
31f18b77
FG
3045 object_locator_t oloc;
3046 hobject_t soid;
3047 /* extensible tier */
3048 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3049 switch (obc->obs.oi.manifest.type) {
3050 case object_manifest_t::TYPE_REDIRECT:
3051 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3052 soid = obc->obs.oi.manifest.redirect_target;
3053 break;
31f18b77 3054 default:
11fdf7f2 3055 ceph_abort_msg("unrecognized manifest type");
31f18b77
FG
3056 }
3057 } else {
3058 /* proxy */
3059 soid = m->get_hobj();
3060 oloc = object_locator_t(m->get_object_locator());
3061 oloc.pool = pool.info.tier_of;
3062 }
7c673cae
FG
3063 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3064
3065 // pass through some original flags that make sense.
3066 // - leave out redirection and balancing flags since we are
3067 // already proxying through the primary
3068 // - leave off read/write/exec flags that are derived from the op
3069 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3070 CEPH_OSD_FLAG_ORDERSNAP |
3071 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3072 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3073
3074 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3075
3076 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3077
3078 ObjectOperation obj_op;
3079 obj_op.dup(prdop->ops);
3080
3081 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3082 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3083 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3084 ceph_osd_op op = obj_op.ops[i].op;
3085 switch (op.op) {
3086 case CEPH_OSD_OP_READ:
3087 case CEPH_OSD_OP_SYNC_READ:
3088 case CEPH_OSD_OP_SPARSE_READ:
3089 case CEPH_OSD_OP_CHECKSUM:
c07f9fc5 3090 case CEPH_OSD_OP_CMPEXT:
7c673cae
FG
3091 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3092 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3093 }
3094 }
3095 }
3096
3097 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3098 prdop);
3099 ceph_tid_t tid = osd->objecter->read(
3100 soid.oid, oloc, obj_op,
3101 m->get_snapid(), NULL,
9f95a23c 3102 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
7c673cae
FG
3103 &prdop->user_version,
3104 &prdop->data_offset,
3105 m->get_features());
3106 fin->tid = tid;
3107 prdop->objecter_tid = tid;
3108 proxyread_ops[tid] = prdop;
3109 in_progress_proxy_ops[soid].push_back(op);
3110}
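// Flag hygiene for proxied reads, reduced to bit arithmetic (the constants
// below are hypothetical stand-ins, not the real CEPH_OSD_FLAG_* values,
// and only two of the whitelisted bits are modeled): a fixed whitelist of
// client flags survives the proxy hop, and in writeback mode read-class ops
// are nudged toward readahead by forcing FADVISE_SEQUENTIAL and clearing
// FADVISE_DONTNEED / FADVISE_NOCACHE.
#include <cstdint>

namespace mini {
constexpr uint32_t RWORDERED  = 1u << 0;
constexpr uint32_t ORDERSNAP  = 1u << 1;
constexpr uint32_t SEQUENTIAL = 1u << 2;
constexpr uint32_t DONTNEED   = 1u << 3;
constexpr uint32_t NOCACHE    = 1u << 4;

inline uint32_t passthrough_flags(uint32_t client_flags) {
  return client_flags & (RWORDERED | ORDERSNAP);       // whitelist only
}

inline uint32_t tune_read_fadvise(uint32_t op_flags) {
  return (op_flags | SEQUENTIAL) & ~(DONTNEED | NOCACHE);
}
}  // namespace mini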
3111
3112void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3113{
3114 dout(10) << __func__ << " " << oid << " tid " << tid
3115 << " " << cpp_strerror(r) << dendl;
3116
3117 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3118 if (p == proxyread_ops.end()) {
3119 dout(10) << __func__ << " no proxyread_op found" << dendl;
3120 return;
3121 }
3122 ProxyReadOpRef prdop = p->second;
3123 if (tid != prdop->objecter_tid) {
3124 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3125 << " tid " << prdop->objecter_tid << dendl;
3126 return;
3127 }
3128 if (oid != prdop->soid) {
3129 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3130 << " soid " << prdop->soid << dendl;
3131 return;
3132 }
3133 proxyread_ops.erase(tid);
3134
3135 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3136 if (q == in_progress_proxy_ops.end()) {
3137 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3138 return;
3139 }
11fdf7f2 3140 ceph_assert(q->second.size());
7c673cae
FG
3141 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3142 q->second.end(),
3143 prdop->op);
11fdf7f2 3144 ceph_assert(it != q->second.end());
7c673cae
FG
3145 OpRequestRef op = *it;
3146 q->second.erase(it);
3147 if (q->second.size() == 0) {
3148 in_progress_proxy_ops.erase(oid);
11fdf7f2
TL
3149 } else if (std::find(q->second.begin(),
3150 q->second.end(),
3151 prdop->op) != q->second.end()) {
3152 /* multiple read case */
3153 dout(20) << __func__ << " " << oid << " not yet complete" << dendl;
3154 return;
7c673cae
FG
3155 }
3156
3157 osd->logger->inc(l_osd_tier_proxy_read);
3158
9f95a23c 3159 auto m = op->get_req<MOSDOp>();
c07f9fc5 3160 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
11fdf7f2 3161 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
7c673cae
FG
3162 ctx->user_at_version = prdop->user_version;
3163 ctx->data_off = prdop->data_offset;
3164 ctx->ignore_log_op_stats = true;
3165 complete_read_ctx(r, ctx);
3166}
3167
3168void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3169{
3170 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3171 if (p == in_progress_proxy_ops.end())
3172 return;
3173
3174 list<OpRequestRef>& ls = p->second;
3175 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3176 requeue_ops(ls);
3177 in_progress_proxy_ops.erase(p);
3178}
3179
94b18763
FG
3180void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3181 vector<ceph_tid_t> *tids)
7c673cae
FG
3182{
3183 dout(10) << __func__ << " " << prdop->soid << dendl;
3184 prdop->canceled = true;
3185
3186 // cancel objecter op, if we can
3187 if (prdop->objecter_tid) {
94b18763 3188 tids->push_back(prdop->objecter_tid);
7c673cae
FG
3189 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3190 prdop->ops[i].outdata.clear();
3191 }
3192 proxyread_ops.erase(prdop->objecter_tid);
3193 prdop->objecter_tid = 0;
3194 }
3195}
3196
94b18763 3197void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
3198{
3199 dout(10) << __func__ << dendl;
3200
3201 // cancel proxy reads
3202 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3203 while (p != proxyread_ops.end()) {
94b18763 3204 cancel_proxy_read((p++)->second, tids);
7c673cae
FG
3205 }
3206
3207 // cancel proxy writes
3208 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3209 while (q != proxywrite_ops.end()) {
94b18763 3210 cancel_proxy_write((q++)->second, tids);
7c673cae
FG
3211 }
3212
3213 if (requeue) {
3214 map<hobject_t, list<OpRequestRef>>::iterator p =
3215 in_progress_proxy_ops.begin();
3216 while (p != in_progress_proxy_ops.end()) {
3217 list<OpRequestRef>& ls = p->second;
3218 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3219 << " requests" << dendl;
3220 requeue_ops(ls);
3221 in_progress_proxy_ops.erase(p++);
3222 }
3223 } else {
3224 in_progress_proxy_ops.clear();
3225 }
3226}
3227
3228struct C_ProxyWrite_Commit : public Context {
3229 PrimaryLogPGRef pg;
3230 hobject_t oid;
3231 epoch_t last_peering_reset;
3232 ceph_tid_t tid;
3233 PrimaryLogPG::ProxyWriteOpRef pwop;
3234 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3235 const PrimaryLogPG::ProxyWriteOpRef& pw)
3236 : pg(p), oid(o), last_peering_reset(lpr),
3237 tid(0), pwop(pw)
3238 {}
3239 void finish(int r) override {
3240 if (pwop->canceled)
3241 return;
9f95a23c 3242 std::scoped_lock locker{*pg};
7c673cae 3243 if (pwop->canceled) {
7c673cae
FG
3244 return;
3245 }
3246 if (last_peering_reset == pg->get_last_peering_reset()) {
3247 pg->finish_proxy_write(oid, tid, r);
3248 }
7c673cae
FG
3249 }
3250};
3251
11fdf7f2 3252void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
7c673cae
FG
3253{
3254 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3255 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
31f18b77 3256 object_locator_t oloc;
7c673cae 3257 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
31f18b77
FG
3258 hobject_t soid;
3259 /* extensible tier */
3260 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3261 switch (obc->obs.oi.manifest.type) {
3262 case object_manifest_t::TYPE_REDIRECT:
3263 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3264 soid = obc->obs.oi.manifest.redirect_target;
3265 break;
31f18b77 3266 default:
11fdf7f2 3267 ceph_abort_msg("unrecognized manifest type");
31f18b77
FG
3268 }
3269 } else {
3270 /* proxy */
3271 soid = m->get_hobj();
3272 oloc = object_locator_t(m->get_object_locator());
3273 oloc.pool = pool.info.tier_of;
3274 }
7c673cae 3275
7c673cae 3276 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
31f18b77
FG
3277 if (!(op->may_write() || op->may_cache())) {
3278 flags |= CEPH_OSD_FLAG_RWORDERED;
3279 }
9f95a23c
TL
3280 if (op->allows_returnvec()) {
3281 flags |= CEPH_OSD_FLAG_RETURNVEC;
3282 }
3283
7c673cae
FG
3284 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3285
3286 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
c07f9fc5 3287 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
7c673cae
FG
3288 pwop->mtime = m->get_mtime();
3289
3290 ObjectOperation obj_op;
3291 obj_op.dup(pwop->ops);
3292
3293 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3294 this, soid, get_last_peering_reset(), pwop);
3295 ceph_tid_t tid = osd->objecter->mutate(
3296 soid.oid, oloc, obj_op, snapc,
3297 ceph::real_clock::from_ceph_timespec(pwop->mtime),
9f95a23c 3298 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
7c673cae
FG
3299 &pwop->user_version, pwop->reqid);
3300 fin->tid = tid;
3301 pwop->objecter_tid = tid;
3302 proxywrite_ops[tid] = pwop;
3303 in_progress_proxy_ops[soid].push_back(op);
3304}
3305
11fdf7f2
TL
3306void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3307 ObjectContextRef obc, bool write_ordered)
3308{
3309 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3310 OSDOp *osd_op = NULL;
3311 for (unsigned int i = 0; i < m->ops.size(); i++) {
3312 osd_op = &m->ops[i];
3313 uint64_t cursor = osd_op->op.extent.offset;
3314 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3315 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3316 object_manifest_t *manifest = &obc->obs.oi.manifest;
3317 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3318
3319 while (cursor < op_length) {
3320 chunk_index = 0;
3321 chunk_length = 0;
3322 /* find the right chunk position for cursor */
3323 for (auto &p : manifest->chunk_map) {
3324 if (p.first <= cursor && p.first + p.second.length > cursor) {
3325 chunk_length = p.second.length;
3326 chunk_index = p.first;
3327 break;
3328 }
3329 }
3330 /* cursor does not land inside any chunk */
3331 if (!chunk_index && !chunk_length) {
3332 if (cursor == osd_op->op.extent.offset) {
3333 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3334 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3335 ctx->data_off = osd_op->op.extent.offset;
3336 ctx->ignore_log_op_stats = true;
3337 complete_read_ctx(0, ctx);
3338 }
3339 break;
3340 }
3341 uint64_t next_length = chunk_length;
3342 /* the size to read -> | op length | */
3343 /* | a chunk | */
3344 if (cursor + next_length > op_length) {
3345 next_length = op_length - cursor;
3346 }
3347 /* the size to read -> | op length | */
3348 /* | a chunk | */
3349 if (cursor + next_length > chunk_index + chunk_length) {
3350 next_length = chunk_index + chunk_length - cursor;
3351 }
3352
3353 chunk_read[cursor] = {{chunk_index, next_length}};
3354 cursor += next_length;
3355 }
3356
3357 req_len = cursor - osd_op->op.extent.offset;
3358 for (auto &p : chunk_read) {
3359 auto chunks = p.second.begin();
3360 dout(20) << __func__ << " chunk_index: " << chunks->first
3361 << " next_length: " << chunks->second << " cursor: "
3362 << p.first << dendl;
3363 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3364 }
3365 }
3366}
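// The cursor walk above, reduced to its arithmetic (a sketch; a plain
// offset -> length map stands in for the manifest chunk_map): split a
// logical extent [offset, offset + length) into sub-reads that never cross
// a chunk boundary.
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

struct MiniExtent { uint64_t chunk_index, cursor, len; };

static std::vector<MiniExtent>
split_by_chunks(const std::map<uint64_t, uint64_t>& chunk_map,
                uint64_t offset, uint64_t length) {
  std::vector<MiniExtent> out;
  uint64_t cursor = offset;
  const uint64_t end = offset + length;
  while (cursor < end) {
    uint64_t ci = 0, cl = 0;
    for (const auto& [start, len] : chunk_map) {   // chunk containing cursor
      if (start <= cursor && cursor < start + len) {
        ci = start;
        cl = len;
        break;
      }
    }
    if (cl == 0)
      break;                             // cursor is outside any chunk
    uint64_t next = std::min(end, ci + cl) - cursor;  // clip to op and chunk
    out.push_back({ci, cursor, next});
    cursor += next;
  }
  return out;
}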
3367
3368struct RefCountCallback : public Context {
3369public:
11fdf7f2
TL
3370 PrimaryLogPG::OpContext *ctx;
3371 OSDOp& osd_op;
9f95a23c 3372 bool requeue = false;
11fdf7f2 3373
9f95a23c
TL
3374 RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3375 : ctx(ctx), osd_op(osd_op) {}
11fdf7f2 3376 void finish(int r) override {
9f95a23c
TL
3377 // NB: caller must already have pg->lock held
3378 ctx->obc->stop_block();
3379 ctx->pg->kick_object_context_blocked(ctx->obc);
3380 if (r >= 0) {
3381 osd_op.rval = 0;
3382 ctx->pg->execute_ctx(ctx);
3383 } else {
3384 // on cancel simply toss op out,
3385 // or requeue as requested
3386 if (r != -ECANCELED) {
3387 if (ctx->op)
3388 ctx->pg->osd->reply_op_error(ctx->op, r);
3389 } else if (requeue) {
3390 if (ctx->op)
3391 ctx->pg->requeue_op(ctx->op);
3392 }
3393 ctx->pg->close_op_ctx(ctx);
3394 }
3395 }
3396 void set_requeue(bool rq) {
3397 requeue = rq;
3398 }
3399};
3400
3401struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3402 OSDOp& osd_op;
3403
3404 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3405 }
3406
3407 int execute() override {
3408 return osd_op.rval;
3409 }
3410};
3411
3412struct C_SetManifestRefCountDone : public Context {
3413 RefCountCallback* cb;
3414 hobject_t soid;
3415 C_SetManifestRefCountDone(
3416 RefCountCallback* cb, hobject_t soid) : cb(cb), soid(soid) {}
3417 void finish(int r) override {
3418 if (r == -ECANCELED)
3419 return;
3420 auto pg = cb->ctx->pg;
3421 std::scoped_lock locker{*pg};
3422 auto it = pg->manifest_ops.find(soid);
3423 if (it == pg->manifest_ops.end()) {
3424 // raced with cancel_manifest_ops
3425 return;
3426 }
3427 pg->manifest_ops.erase(it);
3428 cb->complete(r);
3429 }
3430};
3431
3432void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3433{
3434 dout(10) << __func__ << dendl;
3435 auto p = manifest_ops.begin();
3436 while (p != manifest_ops.end()) {
3437 auto mop = p->second;
3438 // cancel objecter op, if we can
3439 if (mop->objecter_tid) {
3440 tids->push_back(mop->objecter_tid);
3441 mop->objecter_tid = 0;
3442 }
3443 mop->cb->set_requeue(requeue);
3444 mop->cb->complete(-ECANCELED);
3445 manifest_ops.erase(p++);
3446 }
3447}
3448
3449void PrimaryLogPG::refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
3450 SnapContext snapc, bool get, RefCountCallback *cb, uint64_t offset)
3451{
3452 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3453 CEPH_OSD_FLAG_RWORDERED;
3454
3455 dout(10) << __func__ << " Start refcount for " << soid << dendl;
3456
3457 ObjectOperation obj_op;
3458 bufferlist in;
3459 if (get) {
3460 cls_chunk_refcount_get_op call;
3461 call.source = obc->obs.oi.soid;
3462 ::encode(call, in);
3463 obj_op.call("cas", "chunk_get", in);
3464 } else {
3465 cls_chunk_refcount_put_op call;
3466 call.source = obc->obs.oi.soid;
3467 ::encode(call, in);
3468 obj_op.call("cas", "chunk_put", in);
3469 }
3470
3471 Context *c = nullptr;
3472 if (cb) {
3473 C_SetManifestRefCountDone *fin =
3474 new C_SetManifestRefCountDone(cb, obc->obs.oi.soid);
3475 c = new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard()));
3476 }
3477
3478 auto tid = osd->objecter->mutate(
3479 soid.oid, oloc, obj_op, snapc,
3480 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
3481 flags, c);
3482 if (cb) {
3483 manifest_ops[obc->obs.oi.soid] = std::make_shared<ManifestOp>(cb, tid);
3484 obc->start_block();
3485 }
3486}
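/* The refcount round trip above is just an object-class call on the
 * backing chunk/redirect object.  A minimal client-side sketch of the
 * same call via librados (hedged: the pool/object names are arbitrary
 * examples, not anything this file defines):
 *
 *   bufferlist in, out;
 *   cls_chunk_refcount_get_op call;
 *   call.source = src_oid;            // object holding the reference
 *   encode(call, in);
 *   ioctx.exec("chunk_obj", "cas", "chunk_get", in, out);
 *
 * "chunk_put" drops the reference again; the OSD-side version simply
 * routes the same payload through the objecter with RWORDERED set. */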
3487
3488void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3489 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3490 uint64_t req_total_len, bool write_ordered)
3491{
3492 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3493 object_manifest_t *manifest = &obc->obs.oi.manifest;
3494 if (!manifest->chunk_map.count(chunk_index)) {
3495 return;
3496 }
3497 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3498 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3499 hobject_t ori_soid = m->get_hobj();
3500 object_locator_t oloc(soid);
3501 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3502 if (write_ordered) {
3503 flags |= CEPH_OSD_FLAG_RWORDERED;
3504 }
3505
3506 if (!chunk_length || soid == hobject_t()) {
3507 return;
3508 }
3509
3510 /* same as do_proxy_read() */
3511 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3512 CEPH_OSD_FLAG_ORDERSNAP |
3513 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3514 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3515
3516 dout(10) << __func__ << " Start do chunk proxy read for " << *m
3517 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3518 << " req_length: " << req_length << dendl;
3519
3520 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3521
3522 ObjectOperation *pobj_op = new ObjectOperation;
3523 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3524
3525 if (chunk_index <= req_offset) {
3526 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3527 } else {
3528 ceph_abort_msg("chunk_index > req_offset");
3529 }
3530 osd_op.op.extent.length = req_length;
3531
3532 ObjectOperation obj_op;
3533 obj_op.dup(pobj_op->ops);
3534
3535 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3536 prdop);
3537 fin->obj_op = pobj_op;
3538 fin->op_index = op_index;
3539 fin->req_offset = req_offset;
3540 fin->obc = obc;
3541 fin->req_total_len = req_total_len;
3542
3543 ceph_tid_t tid = osd->objecter->read(
3544 soid.oid, oloc, obj_op,
3545 m->get_snapid(), NULL,
3546 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3547 &prdop->user_version,
3548 &prdop->data_offset,
3549 m->get_features());
3550 fin->tid = tid;
3551 prdop->objecter_tid = tid;
3552 proxyread_ops[tid] = prdop;
3553 in_progress_proxy_ops[ori_soid].push_back(op);
3554}
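/* Offset translation used above: the extent sent to the backing pool is
 *   extent.offset = chunk.offset + (req_offset - chunk_index)
 * i.e. the position of the request inside the logical chunk, rebased to
 * where that chunk lives in the target object.  E.g. a chunk mapped
 * {chunk_index=4M -> (oid=X, offset=0, length=4M)} and req_offset=5M
 * yields a read of X at offset 1M. */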
3555
3556bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3557{
3558 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3559 OSDOp *osd_op = NULL;
3560 bool ret = true;
3561 for (unsigned int i = 0; i < m->ops.size(); i++) {
3562 osd_op = &m->ops[i];
3563 ceph_osd_op op = osd_op->op;
3564 switch (op.op) {
3565 case CEPH_OSD_OP_READ:
3566 case CEPH_OSD_OP_SYNC_READ: {
3567 uint64_t cursor = osd_op->op.extent.offset;
3568 uint64_t remain = osd_op->op.extent.length;
3569
3570 /* requested chunks exist in chunk_map ? */
3571 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3572 if (p.first <= cursor && p.first + p.second.length > cursor) {
3573 if (!p.second.is_missing()) {
3574 return false;
3575 }
3576 if (p.second.length >= remain) {
3577 remain = 0;
3578 break;
3579 } else {
3580 remain = remain - p.second.length;
3581 }
3582 cursor += p.second.length;
3583 }
3584 }
3585
3586 if (remain) {
3587 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3588 return false;
3589 }
3590 continue;
3591 }
3592 default:
3593 return false;
3594 }
3595 }
3596 return ret;
3597}
3598
3599void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3600{
3601 dout(10) << __func__ << " " << oid << " tid " << tid
3602 << " " << cpp_strerror(r) << dendl;
3603
3604 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3605 if (p == proxywrite_ops.end()) {
3606 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3607 return;
3608 }
3609 ProxyWriteOpRef pwop = p->second;
3610 ceph_assert(tid == pwop->objecter_tid);
3611 ceph_assert(oid == pwop->soid);
3612
3613 proxywrite_ops.erase(tid);
3614
3615 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3616 if (q == in_progress_proxy_ops.end()) {
3617 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3618 delete pwop->ctx;
3619 pwop->ctx = NULL;
3620 return;
3621 }
3622 list<OpRequestRef>& in_progress_op = q->second;
3623 ceph_assert(in_progress_op.size());
3624 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3625 in_progress_op.end(),
3626 pwop->op);
3627 ceph_assert(it != in_progress_op.end());
3628 in_progress_op.erase(it);
3629 if (in_progress_op.size() == 0) {
3630 in_progress_proxy_ops.erase(oid);
3631 } else if (std::find(in_progress_op.begin(),
3632 in_progress_op.end(),
3633 pwop->op) != in_progress_op.end()) {
3634 if (pwop->ctx)
3635 delete pwop->ctx;
3636 pwop->ctx = NULL;
3637 dout(20) << __func__ << " " << oid << " tid " << tid
3638 << " in_progress_op size: "
3639 << in_progress_op.size() << dendl;
3640 return;
3641 }
3642
3643 osd->logger->inc(l_osd_tier_proxy_write);
3644
3645 auto m = pwop->op->get_req<MOSDOp>();
3646 ceph_assert(m != NULL);
3647
3648 if (!pwop->sent_reply) {
3649 // send commit.
3650 ceph_assert(pwop->ctx->reply == nullptr);
3651 MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
3652 true /* we claim it below */);
3653 reply->set_reply_versions(eversion_t(), pwop->user_version);
3654 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3655 reply->claim_op_out_data(pwop->ops);
3656 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3657 osd->send_message_osd_client(reply, m->get_connection());
3658 pwop->sent_reply = true;
3659 pwop->ctx->op->mark_commit_sent();
3660 }
3661
3662 delete pwop->ctx;
3663 pwop->ctx = NULL;
3664}
3665
3666void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3667 vector<ceph_tid_t> *tids)
3668{
3669 dout(10) << __func__ << " " << pwop->soid << dendl;
3670 pwop->canceled = true;
3671
3672 // cancel objecter op, if we can
3673 if (pwop->objecter_tid) {
3674 tids->push_back(pwop->objecter_tid);
3675 delete pwop->ctx;
3676 pwop->ctx = NULL;
3677 proxywrite_ops.erase(pwop->objecter_tid);
3678 pwop->objecter_tid = 0;
3679 }
3680}
3681
3682class PromoteCallback: public PrimaryLogPG::CopyCallback {
3683 ObjectContextRef obc;
3684 PrimaryLogPG *pg;
3685 utime_t start;
3686public:
3687 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3688 : obc(obc_),
3689 pg(pg_),
3690 start(ceph_clock_now()) {}
3691
3692 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3693 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3694 int r = results.get<0>();
3695 pg->finish_promote(r, results_data, obc);
3696 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3697 }
3698};
3699
3700class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3701 ObjectContextRef obc;
3702 PrimaryLogPG *pg;
3703 utime_t start;
3704 PrimaryLogPG::OpContext *ctx;
3705 PrimaryLogPG::CopyCallbackResults promote_results;
3706public:
3707 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3708 : obc(obc_),
3709 pg(pg_),
3710 start(ceph_clock_now()), ctx(ctx) {}
3711
3712 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3713 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3714 int r = results.get<0>();
3715 if (ctx) {
3716 promote_results = results;
3717 pg->execute_ctx(ctx);
3718 } else {
3719 pg->finish_promote_manifest(r, results_data, obc);
3720 }
3721 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3722 }
3723 friend struct PromoteFinisher;
3724};
3725
3726struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3727 PromoteManifestCallback *promote_callback;
3728
3729 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3730 : promote_callback(promote_callback) {
3731 }
3732
3733 int execute() override {
3734 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3735 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3736 promote_callback->promote_results.get<1>(),
3737 promote_callback->obc);
3738 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3739 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3740 promote_callback->promote_results.get<1>(),
3741 promote_callback->obc);
3742 } else {
3743 ceph_abort_msg("unrecognized manifest type");
3744 }
3745 return 0;
3746 }
3747};
3748
3749void PrimaryLogPG::promote_object(ObjectContextRef obc,
3750 const hobject_t& missing_oid,
3751 const object_locator_t& oloc,
3752 OpRequestRef op,
3753 ObjectContextRef *promote_obc)
3754{
3755 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3756 ceph_assert(hoid != hobject_t());
3757 if (write_blocked_by_scrub(hoid)) {
3758 dout(10) << __func__ << " " << hoid
3759 << " blocked by scrub" << dendl;
3760 if (op) {
3761 waiting_for_scrub.push_back(op);
3762 op->mark_delayed("waiting for scrub");
3763 dout(10) << __func__ << " " << hoid
3764 << " placing op in waiting_for_scrub" << dendl;
3765 } else {
3766 dout(10) << __func__ << " " << hoid
3767 << " no op, dropping on the floor" << dendl;
3768 }
3769 return;
3770 }
3771 if (op && !check_laggy_requeue(op)) {
3772 return;
3773 }
3774 if (!obc) { // we need to create an ObjectContext
3775 ceph_assert(missing_oid != hobject_t());
3776 obc = get_object_context(missing_oid, true);
3777 }
3778 if (promote_obc)
3779 *promote_obc = obc;
3780
3781 /*
3782 * Before promote complete, if there are proxy-reads for the object,
3783 * for this case we don't use DONTNEED.
3784 */
3785 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3786 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3787 if (q == in_progress_proxy_ops.end()) {
3788 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3789 }
3790
3791 CopyCallback *cb;
3792 object_locator_t my_oloc;
3793 hobject_t src_hoid;
3794 if (!obc->obs.oi.has_manifest()) {
3795 my_oloc = oloc;
3796 my_oloc.pool = pool.info.tier_of;
3797 src_hoid = obc->obs.oi.soid;
3798 cb = new PromoteCallback(obc, this);
3799 } else {
3800 if (obc->obs.oi.manifest.is_chunked()) {
3801 src_hoid = obc->obs.oi.soid;
3802 cb = new PromoteManifestCallback(obc, this);
3803 } else if (obc->obs.oi.manifest.is_redirect()) {
3804 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
3805 my_oloc = src_oloc;
3806 src_hoid = obc->obs.oi.manifest.redirect_target;
3807 cb = new PromoteCallback(obc, this);
3808 } else {
3809 ceph_abort_msg("unrecognized manifest type");
3810 }
3811 }
3812
3813 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3814 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3815 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3816 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3817 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
3818 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3819 src_fadvise_flags, 0);
3820
3821 ceph_assert(obc->is_blocked());
3822
3823 if (op)
3824 wait_for_blocked_object(obc->obs.oi.soid, op);
3825
3826 recovery_state.update_stats(
3827 [](auto &history, auto &stats) {
3828 stats.stats.sum.num_promote++;
3829 return false;
3830 });
3831}
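/* Promotion in a nutshell: pick the copy source (the tier_of pool for a
 * plain cache-tier object, redirect_target for a redirect manifest, the
 * object itself for a chunked manifest), kick off start_copy() with
 * RWORDERED, and leave the obc blocked; the client op, if any, waits
 * via wait_for_blocked_object() and is re-run once the PromoteCallback
 * fires finish_promote()/finish_promote_manifest(). */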
3832
3833void PrimaryLogPG::execute_ctx(OpContext *ctx)
3834{
3835 FUNCTRACE(cct);
3836 dout(10) << __func__ << " " << ctx << dendl;
3837 ctx->reset_obs(ctx->obc);
3838 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3839 OpRequestRef op = ctx->op;
3840 auto m = op->get_req<MOSDOp>();
3841 ObjectContextRef obc = ctx->obc;
3842 const hobject_t& soid = obc->obs.oi.soid;
3843
3844 // this method must be idempotent since we may call it several times
3845 // before we finally apply the resulting transaction.
3846 ctx->op_t.reset(new PGTransaction);
3847
3848 if (op->may_write() || op->may_cache()) {
3849 // snap
3850 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3851 pool.info.is_pool_snaps_mode()) {
3852 // use pool's snapc
3853 ctx->snapc = pool.snapc;
3854 } else {
3855 // client specified snapc
3856 ctx->snapc.seq = m->get_snap_seq();
3857 ctx->snapc.snaps = m->get_snaps();
3858 filter_snapc(ctx->snapc.snaps);
3859 }
3860 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3861 ctx->snapc.seq < obc->ssc->snapset.seq) {
3862 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3863 << " < snapset seq " << obc->ssc->snapset.seq
3864 << " on " << obc->obs.oi.soid << dendl;
3865 reply_ctx(ctx, -EOLDSNAPC);
3866 return;
3867 }
3868
3869 // version
3870 ctx->at_version = get_next_version();
3871 ctx->mtime = m->get_mtime();
3872
3873 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3874 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3875 << " snapc " << ctx->snapc
3876 << " snapset " << obc->ssc->snapset
3877 << dendl;
3878 } else {
3879 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3880 << " ov " << obc->obs.oi.version
3881 << dendl;
3882 }
3883
3884 if (!ctx->user_at_version)
3885 ctx->user_at_version = obc->obs.oi.user_version;
3886 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3887
3888 {
3889#ifdef WITH_LTTNG
3890 osd_reqid_t reqid = ctx->op->get_reqid();
3891#endif
3892 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3893 reqid.name._num, reqid.tid, reqid.inc);
3894 }
3895
3896 int result = prepare_transaction(ctx);
3897
3898 {
3899#ifdef WITH_LTTNG
3900 osd_reqid_t reqid = ctx->op->get_reqid();
3901#endif
3902 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3903 reqid.name._num, reqid.tid, reqid.inc);
3904 }
3905
3906 bool pending_async_reads = !ctx->pending_async_reads.empty();
3907 if (result == -EINPROGRESS || pending_async_reads) {
3908 // come back later.
3909 if (pending_async_reads) {
3910 ceph_assert(pool.info.is_erasure());
3911 in_progress_async_reads.push_back(make_pair(op, ctx));
3912 ctx->start_async_reads(this);
3913 }
3914 return;
3915 }
3916
3917 if (result == -EAGAIN) {
3918 // clean up after the ctx
3919 close_op_ctx(ctx);
3920 return;
3921 }
3922
3923 bool ignore_out_data = false;
3924 if (!ctx->op_t->empty() &&
3925 op->may_write() &&
3926 result >= 0) {
3927 // successful update
3928 if (ctx->op->allows_returnvec()) {
3929 // enforce reasonable bound on the return buffer sizes
3930 for (auto& i : *ctx->ops) {
3931 if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
3932 dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
3933 result = -EOVERFLOW; // overall result is overflow
3934 i.rval = -EOVERFLOW;
3935 i.outdata.clear();
3936 }
3937 }
3938 } else {
3939 // legacy behavior -- zero result and return data etc.
3940 ignore_out_data = true;
3941 result = 0;
3942 }
3943 }
3944
3945 // prepare the reply
3946 ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
3947 ignore_out_data);
3948 dout(20) << __func__ << " alloc reply " << ctx->reply
3949 << " result " << result << dendl;
3950
3951 // read or error?
3952 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3953 // finish side-effects
3954 if (result >= 0)
3955 do_osd_op_effects(ctx, m->get_connection());
3956
3957 complete_read_ctx(result, ctx);
3958 return;
3959 }
3960
3961 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3962
3963 ceph_assert(op->may_write() || op->may_cache());
3964
3965 // trim log?
3966 recovery_state.update_trim_to();
3967
3968 // verify that we are doing this in order?
3969 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3970 !pool.info.is_tier() && !pool.info.has_tiers()) {
3971 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3972 ceph_tid_t t = m->get_tid();
3973 client_t n = m->get_source().num();
3974 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3975 if (p == cm.end()) {
3976 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3977 cm[n] = t;
3978 } else {
3979 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3980 if (p->second > t) {
3981 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3982 ceph_abort_msg("out of order op");
3983 }
3984 p->second = t;
3985 }
3986 }
3987
3988 if (ctx->update_log_only) {
3989 if (result >= 0)
3990 do_osd_op_effects(ctx, m->get_connection());
3991
3992 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3993 // save just what we need from ctx
3994 MOSDOpReply *reply = ctx->reply;
3995 ctx->reply = nullptr;
3996 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3997
3998 if (result == -ENOENT) {
3999 reply->set_enoent_reply_versions(info.last_update,
4000 info.last_user_version);
4001 }
4002 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4003 // append to pg log for dup detection - don't save buffers for now
4004 record_write_error(op, soid, reply, result,
4005 ctx->op->allows_returnvec() ? ctx : nullptr);
4006 close_op_ctx(ctx);
4007 return;
4008 }
4009
4010 // no need to capture PG ref, repop cancel will handle that
4011 // Can capture the ctx by pointer, it's owned by the repop
4012 ctx->register_on_commit(
4013 [m, ctx, this](){
4014 if (ctx->op)
4015 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4016
4017 if (m && !ctx->sent_reply) {
4018 MOSDOpReply *reply = ctx->reply;
4019 ctx->reply = nullptr;
4020 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4021 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4022 osd->send_message_osd_client(reply, m->get_connection());
4023 ctx->sent_reply = true;
4024 ctx->op->mark_commit_sent();
4025 }
4026 });
4027 ctx->register_on_success(
4028 [ctx, this]() {
4029 do_osd_op_effects(
4030 ctx,
4031 ctx->op ? ctx->op->get_req()->get_connection() :
4032 ConnectionRef());
4033 });
4034 ctx->register_on_finish(
4035 [ctx]() {
4036 delete ctx;
4037 });
4038
4039 // issue replica writes
4040 ceph_tid_t rep_tid = osd->get_tid();
4041
4042 RepGather *repop = new_repop(ctx, obc, rep_tid);
4043
4044 issue_repop(repop, ctx);
4045 eval_repop(repop);
4046 repop->put();
4047}
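/* The three hooks registered above split the completion work: on_commit
 * sends the client reply (ACK|ONDISK) at most once, on_success runs
 * do_osd_op_effects() (watch/notify side effects), and on_finish frees
 * the ctx.  ctx->reply is moved out (set to nullptr) before sending so
 * a later cleanup path cannot double-send or double-free it. */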
4048
4049void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4050 release_object_locks(ctx->lock_manager);
4051
4052 ctx->op_t.reset();
4053
4054 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4055 ctx->on_finish.erase(p++)) {
4056 (*p)();
4057 }
4058 delete ctx;
4059}
4060
4061void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4062{
4063 if (ctx->op)
4064 osd->reply_op_error(ctx->op, r);
4065 close_op_ctx(ctx);
4066}
4067
4068void PrimaryLogPG::log_op_stats(const OpRequest& op,
4069 const uint64_t inb,
4070 const uint64_t outb)
4071{
4072 auto m = op.get_req<MOSDOp>();
4073 const utime_t now = ceph_clock_now();
4074
4075 const utime_t latency = now - m->get_recv_stamp();
4076 const utime_t process_latency = now - op.get_dequeued_time();
4077
4078 osd->logger->inc(l_osd_op);
4079
4080 osd->logger->inc(l_osd_op_outb, outb);
4081 osd->logger->inc(l_osd_op_inb, inb);
4082 osd->logger->tinc(l_osd_op_lat, latency);
4083 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4084
4085 if (op.may_read() && op.may_write()) {
4086 osd->logger->inc(l_osd_op_rw);
4087 osd->logger->inc(l_osd_op_rw_inb, inb);
4088 osd->logger->inc(l_osd_op_rw_outb, outb);
4089 osd->logger->tinc(l_osd_op_rw_lat, latency);
4090 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4091 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4092 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4093 } else if (op.may_read()) {
4094 osd->logger->inc(l_osd_op_r);
4095 osd->logger->inc(l_osd_op_r_outb, outb);
4096 osd->logger->tinc(l_osd_op_r_lat, latency);
4097 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4098 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4099 } else if (op.may_write() || op.may_cache()) {
4100 osd->logger->inc(l_osd_op_w);
4101 osd->logger->inc(l_osd_op_w_inb, inb);
4102 osd->logger->tinc(l_osd_op_w_lat, latency);
4103 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4104 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4105 } else {
4106 ceph_abort();
4107 }
4108
4109 dout(15) << "log_op_stats " << *m
4110 << " inb " << inb
4111 << " outb " << outb
4112 << " lat " << latency << dendl;
4113
4114 if (m_dynamic_perf_stats.is_enabled()) {
4115 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4116 }
4117}
4118
4119void PrimaryLogPG::set_dynamic_perf_stats_queries(
4120 const std::list<OSDPerfMetricQuery> &queries)
4121{
4122 m_dynamic_perf_stats.set_queries(queries);
4123}
4124
4125void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4126{
4127 std::swap(m_dynamic_perf_stats, *stats);
4128}
4129
4130void PrimaryLogPG::do_scan(
4131 OpRequestRef op,
4132 ThreadPool::TPHandle &handle)
4133{
4134 auto m = op->get_req<MOSDPGScan>();
4135 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4136 dout(10) << "do_scan " << *m << dendl;
4137
4138 op->mark_started();
4139
4140 switch (m->op) {
4141 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4142 {
4143 auto dpp = get_dpp();
4144 if (osd->check_backfill_full(dpp)) {
4145 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4146 queue_peering_event(
4147 PGPeeringEventRef(
4148 std::make_shared<PGPeeringEvent>(
4149 get_osdmap_epoch(),
4150 get_osdmap_epoch(),
4151 PeeringState::BackfillTooFull())));
4152 return;
4153 }
4154
4155 BackfillInterval bi;
4156 bi.begin = m->begin;
4157 // No need to flush; there won't be any in-progress writes occurring
4158 // past m->begin
4159 scan_range(
4160 cct->_conf->osd_backfill_scan_min,
4161 cct->_conf->osd_backfill_scan_max,
4162 &bi,
4163 handle);
4164 MOSDPGScan *reply = new MOSDPGScan(
4165 MOSDPGScan::OP_SCAN_DIGEST,
4166 pg_whoami,
4167 get_osdmap_epoch(), m->query_epoch,
4168 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4169 encode(bi.objects, reply->get_data());
4170 osd->send_message_osd_cluster(reply, m->get_connection());
4171 }
4172 break;
4173
4174 case MOSDPGScan::OP_SCAN_DIGEST:
4175 {
4176 pg_shard_t from = m->from;
4177
4178 // Check that from is in backfill_targets vector
4179 ceph_assert(is_backfill_target(from));
4180
4181 BackfillInterval& bi = peer_backfill_info[from];
4182 bi.begin = m->begin;
4183 bi.end = m->end;
4184 auto p = m->get_data().cbegin();
4185
4186 // take care to preserve ordering!
4187 bi.clear_objects();
4188 ::decode_noclear(bi.objects, p);
4189
4190 if (waiting_on_backfill.erase(from)) {
4191 if (waiting_on_backfill.empty()) {
4192 ceph_assert(
4193 peer_backfill_info.size() ==
4194 get_backfill_targets().size());
4195 finish_recovery_op(hobject_t::get_max());
4196 }
4197 } else {
4198 // we canceled backfill for a while because a target was too full; this
4199 // is an extra response from a non-too-full peer
4200 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4201 }
4202 }
4203 break;
4204 }
4205}
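/* Scan handshake sketch: the primary sends OP_SCAN_GET_DIGEST; the
 * backfill target scans from m->begin with scan_range() and answers
 * with OP_SCAN_DIGEST carrying an encoded BackfillInterval.  The
 * primary records it in peer_backfill_info[from] and finishes the
 * recovery op once every backfill target has replied (i.e. when
 * waiting_on_backfill is empty). */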
4206
4207void PrimaryLogPG::do_backfill(OpRequestRef op)
4208{
4209 auto m = op->get_req<MOSDPGBackfill>();
4210 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4211 dout(10) << "do_backfill " << *m << dendl;
4212
4213 op->mark_started();
4214
4215 switch (m->op) {
4216 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4217 {
4218 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4219
4220 MOSDPGBackfill *reply = new MOSDPGBackfill(
4221 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4222 get_osdmap_epoch(),
4223 m->query_epoch,
4224 spg_t(info.pgid.pgid, get_primary().shard));
4225 reply->set_priority(get_recovery_op_priority());
4226 osd->send_message_osd_cluster(reply, m->get_connection());
4227 queue_peering_event(
4228 PGPeeringEventRef(
4229 std::make_shared<PGPeeringEvent>(
4230 get_osdmap_epoch(),
4231 get_osdmap_epoch(),
4232 RecoveryDone())));
4233 }
4234 // fall-thru
4235
4236 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4237 {
4238 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4239
4240 ObjectStore::Transaction t;
4241 recovery_state.update_backfill_progress(
4242 m->last_backfill,
4243 m->stats,
4244 m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
4245 t);
4246
4247 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4248 ceph_assert(tr == 0);
4249 }
4250 break;
4251
4252 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4253 {
4254 ceph_assert(is_primary());
4255 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4256 finish_recovery_op(hobject_t::get_max());
4257 }
4258 break;
4259 }
4260}
4261
4262void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4263{
4264 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4265 op->get_req());
4266 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4267 dout(7) << __func__ << " " << m->ls << dendl;
4268
4269 op->mark_started();
4270
4271 ObjectStore::Transaction t;
4272 for (auto& p : m->ls) {
4273 if (is_remote_backfilling()) {
4274 struct stat st;
4275 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4276 pg_whoami.shard) , &st);
4277 if (r == 0) {
4278 sub_local_num_bytes(st.st_size);
4279 int64_t usersize;
4280 if (pool.info.is_erasure()) {
4281 bufferlist bv;
4282 int r = osd->store->getattr(
4283 ch,
4284 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4285 OI_ATTR,
4286 bv);
4287 if (r >= 0) {
4288 object_info_t oi(bv);
4289 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4290 } else {
4291 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4292 << " can't get object info" << dendl;
4293 usersize = 0;
4294 }
4295 } else {
4296 usersize = st.st_size;
4297 }
4298 sub_num_bytes(usersize);
4299 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4300 << " sub actual data by " << st.st_size
4301 << " sub num_bytes by " << usersize
4302 << dendl;
4303 }
4304 }
4305 remove_snap_mapped_object(t, p.first);
4306 }
4307 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4308 ceph_assert(r == 0);
4309}
4310
4311int PrimaryLogPG::trim_object(
4312 bool first, const hobject_t &coid, snapid_t snap_to_trim,
4313 PrimaryLogPG::OpContextUPtr *ctxp)
4314{
4315 *ctxp = NULL;
4316
4317 // load clone info
4318 bufferlist bl;
4319 ObjectContextRef obc = get_object_context(coid, false, NULL);
4320 if (!obc || !obc->ssc || !obc->ssc->exists) {
4321 osd->clog->error() << __func__ << ": Can not trim " << coid
4322 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4323 return -ENOENT;
4324 }
4325
4326 hobject_t head_oid = coid.get_head();
4327 ObjectContextRef head_obc = get_object_context(head_oid, false);
4328 if (!head_obc) {
4329 osd->clog->error() << __func__ << ": Can not trim " << coid
4330 << " repair needed, no snapset obc for " << head_oid;
4331 return -ENOENT;
4332 }
4333
4334 SnapSet& snapset = obc->ssc->snapset;
4335
4336 object_info_t &coi = obc->obs.oi;
4337 auto citer = snapset.clone_snaps.find(coid.snap);
4338 if (citer == snapset.clone_snaps.end()) {
4339 osd->clog->error() << "No clone_snaps in snapset " << snapset
4340 << " for object " << coid << "\n";
4341 return -ENOENT;
4342 }
4343 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4344 if (old_snaps.empty()) {
4345 osd->clog->error() << "No object info snaps for object " << coid;
4346 return -ENOENT;
4347 }
4348
4349 dout(10) << coid << " old_snaps " << old_snaps
4350 << " old snapset " << snapset << dendl;
4351 if (snapset.seq == 0) {
c07f9fc5 4352 osd->clog->error() << "No snapset.seq for object " << coid;
224ce89b 4353 return -ENOENT;
7c673cae
FG
4354 }
4355
4356 set<snapid_t> new_snaps;
4357 const OSDMapRef& osdmap = get_osdmap();
4358 for (set<snapid_t>::iterator i = old_snaps.begin();
4359 i != old_snaps.end();
4360 ++i) {
4361 if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4362 *i != snap_to_trim) {
4363 new_snaps.insert(*i);
4364 }
4365 }
4366
4367 vector<snapid_t>::iterator p = snapset.clones.end();
4368
4369 if (new_snaps.empty()) {
4370 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4371 if (p == snapset.clones.end()) {
c07f9fc5 4372 osd->clog->error() << "Snap " << coid.snap << " not in clones";
224ce89b 4373 return -ENOENT;
7c673cae
FG
4374 }
4375 }
4376
4377 OpContextUPtr ctx = simple_opc_create(obc);
11fdf7f2 4378 ctx->head_obc = head_obc;
7c673cae
FG
4379
4380 if (!ctx->lock_manager.get_snaptrimmer_write(
4381 coid,
4382 obc,
4383 first)) {
4384 close_op_ctx(ctx.release());
4385 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
224ce89b 4386 return -ENOLCK;
7c673cae
FG
4387 }
4388
4389 if (!ctx->lock_manager.get_snaptrimmer_write(
4390 head_oid,
4391 head_obc,
4392 first)) {
4393 close_op_ctx(ctx.release());
4394 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4395 return -ENOLCK;
4396 }
4397
4398 ctx->at_version = get_next_version();
4399
4400 PGTransaction *t = ctx->op_t.get();
4401
4402 if (new_snaps.empty()) {
4403 // remove clone
4404 dout(10) << coid << " snaps " << old_snaps << " -> "
4405 << new_snaps << " ... deleting" << dendl;
4406
4407 // ...from snapset
4408 ceph_assert(p != snapset.clones.end());
4409
4410 snapid_t last = coid.snap;
4411 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4412
4413 if (p != snapset.clones.begin()) {
4414 // not the oldest... merge overlap into next older clone
4415 vector<snapid_t>::iterator n = p - 1;
4416 hobject_t prev_coid = coid;
4417 prev_coid.snap = *n;
4418 bool adjust_prev_bytes = is_present_clone(prev_coid);
4419
4420 if (adjust_prev_bytes)
4421 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4422
4423 snapset.clone_overlap[*n].intersection_of(
4424 snapset.clone_overlap[*p]);
4425
4426 if (adjust_prev_bytes)
4427 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4428 }
4429 ctx->delta_stats.num_objects--;
4430 if (coi.is_dirty())
4431 ctx->delta_stats.num_objects_dirty--;
4432 if (coi.is_omap())
4433 ctx->delta_stats.num_objects_omap--;
4434 if (coi.is_whiteout()) {
4435 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4436 ctx->delta_stats.num_whiteouts--;
4437 }
4438 ctx->delta_stats.num_object_clones--;
4439 if (coi.is_cache_pinned())
4440 ctx->delta_stats.num_objects_pinned--;
4441 if (coi.has_manifest())
4442 ctx->delta_stats.num_objects_manifest--;
4443 obc->obs.exists = false;
4444
4445 snapset.clones.erase(p);
4446 snapset.clone_overlap.erase(last);
4447 snapset.clone_size.erase(last);
4448 snapset.clone_snaps.erase(last);
4449
4450 ctx->log.push_back(
4451 pg_log_entry_t(
4452 pg_log_entry_t::DELETE,
4453 coid,
4454 ctx->at_version,
4455 ctx->obs->oi.version,
4456 0,
4457 osd_reqid_t(),
4458 ctx->mtime,
4459 0)
4460 );
4461 t->remove(coid);
4462 t->update_snaps(
4463 coid,
4464 old_snaps,
4465 new_snaps);
4466
4467 coi = object_info_t(coid);
4468
4469 ctx->at_version.version++;
4470 } else {
4471 // save adjusted snaps for this object
4472 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4473 snapset.clone_snaps[coid.snap] =
4474 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4475 // we still do a 'modify' event on this object just to trigger a
4476 // snapmapper.update ... :(
4477
4478 coi.prior_version = coi.version;
4479 coi.version = ctx->at_version;
4480 bl.clear();
4481 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4482 t->setattr(coid, OI_ATTR, bl);
4483
4484 ctx->log.push_back(
4485 pg_log_entry_t(
4486 pg_log_entry_t::MODIFY,
4487 coid,
4488 coi.version,
4489 coi.prior_version,
4490 0,
4491 osd_reqid_t(),
4492 ctx->mtime,
4493 0)
4494 );
4495 ctx->at_version.version++;
4496
4497 t->update_snaps(
4498 coid,
4499 old_snaps,
4500 new_snaps);
4501 }
4502
4503 // save head snapset
4504 dout(10) << coid << " new snapset " << snapset << " on "
4505 << head_obc->obs.oi << dendl;
4506 if (snapset.clones.empty() &&
4507 (head_obc->obs.oi.is_whiteout() &&
4508 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4509 !head_obc->obs.oi.is_cache_pinned())) {
4510 // NOTE: this arguably constitutes minor interference with the
4511 // tiering agent if this is a cache tier since a snap trim event
4512 // is effectively evicting a whiteout we might otherwise want to
4513 // keep around.
4514 dout(10) << coid << " removing " << head_oid << dendl;
4515 ctx->log.push_back(
4516 pg_log_entry_t(
4517 pg_log_entry_t::DELETE,
4518 head_oid,
4519 ctx->at_version,
4520 head_obc->obs.oi.version,
4521 0,
4522 osd_reqid_t(),
4523 ctx->mtime,
4524 0)
4525 );
4526 derr << "removing snap head" << dendl;
4527 object_info_t& oi = head_obc->obs.oi;
4528 ctx->delta_stats.num_objects--;
4529 if (oi.is_dirty()) {
4530 ctx->delta_stats.num_objects_dirty--;
4531 }
4532 if (oi.is_omap())
4533 ctx->delta_stats.num_objects_omap--;
4534 if (oi.is_whiteout()) {
4535 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4536 ctx->delta_stats.num_whiteouts--;
4537 }
4538 if (oi.is_cache_pinned()) {
4539 ctx->delta_stats.num_objects_pinned--;
4540 }
4541 if (coi.has_manifest())
4542 ctx->delta_stats.num_objects_manifest--;
4543 head_obc->obs.exists = false;
4544 head_obc->obs.oi = object_info_t(head_oid);
4545 t->remove(head_oid);
4546 } else {
4547 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4548 // filter SnapSet::snaps for the benefit of pre-octopus
4549 // peers. This is perhaps overly conservative in that I'm not
4550 // certain they need this, but let's be conservative here.
4551 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4552 snapset.filter(pool.info);
4553 } else {
4554 snapset.snaps.clear();
4555 }
4556 dout(10) << coid << " writing updated snapset on " << head_oid
4557 << ", snapset is " << snapset << dendl;
4558 ctx->log.push_back(
4559 pg_log_entry_t(
4560 pg_log_entry_t::MODIFY,
4561 head_oid,
4562 ctx->at_version,
4563 head_obc->obs.oi.version,
4564 0,
4565 osd_reqid_t(),
4566 ctx->mtime,
4567 0)
4568 );
4569
4570 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4571 head_obc->obs.oi.version = ctx->at_version;
4572
4573 map <string, bufferlist> attrs;
4574 bl.clear();
4575 encode(snapset, bl);
4576 attrs[SS_ATTR].claim(bl);
4577
4578 bl.clear();
4579 encode(head_obc->obs.oi, bl,
4580 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4581 attrs[OI_ATTR].claim(bl);
4582 t->setattrs(head_oid, attrs);
4583 }
4584
4585 *ctxp = std::move(ctx);
4586 return 0;
4587}
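/* Stat-accounting subtlety in the clone-removal branch above:
 * clone_overlap[*n] roughly describes the bytes clone *n shares with
 * the next newer clone, so get_clone_bytes() for the older clone
 * changes once the trimmed clone's overlap is intersected into it.
 * Hence num_bytes is decremented with the old overlap and re-added
 * with the new one, but only when the older clone is actually present
 * locally (is_present_clone()). */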
4588
4589void PrimaryLogPG::kick_snap_trim()
4590{
4591 ceph_assert(is_active());
4592 ceph_assert(is_primary());
4593 if (is_clean() &&
4594 !state_test(PG_STATE_PREMERGE) &&
4595 !snap_trimq.empty()) {
4596 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4597 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4598 } else {
4599 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4600 snap_trimmer_machine.process_event(KickTrim());
4601 }
4602 }
4603}
4604
4605void PrimaryLogPG::snap_trimmer_scrub_complete()
4606{
4607 if (is_primary() && is_active() && is_clean()) {
4608 ceph_assert(!snap_trimq.empty());
4609 snap_trimmer_machine.process_event(ScrubComplete());
4610 }
4611}
4612
4613void PrimaryLogPG::snap_trimmer(epoch_t queued)
4614{
4615 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
4616 return;
4617 }
4618
4619 ceph_assert(is_primary());
4620
4621 dout(10) << "snap_trimmer posting" << dendl;
4622 snap_trimmer_machine.process_event(DoSnapWork());
4623 dout(10) << "snap_trimmer complete" << dendl;
4624 return;
4625}
4626
4627int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4628{
4629 __u64 v2;
4630
4631 string v2s(xattr.c_str(), xattr.length());
4632 if (v2s.length())
4633 v2 = strtoull(v2s.c_str(), NULL, 10);
4634 else
4635 v2 = 0;
4636
4637 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4638
4639 switch (op) {
4640 case CEPH_OSD_CMPXATTR_OP_EQ:
4641 return (v1 == v2);
4642 case CEPH_OSD_CMPXATTR_OP_NE:
4643 return (v1 != v2);
4644 case CEPH_OSD_CMPXATTR_OP_GT:
4645 return (v1 > v2);
4646 case CEPH_OSD_CMPXATTR_OP_GTE:
4647 return (v1 >= v2);
4648 case CEPH_OSD_CMPXATTR_OP_LT:
4649 return (v1 < v2);
4650 case CEPH_OSD_CMPXATTR_OP_LTE:
4651 return (v1 <= v2);
4652 default:
4653 return -EINVAL;
4654 }
4655}
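/* Example of the comparison semantics above: with a stored xattr of
 * "123" and v1 = 200, CEPH_OSD_CMPXATTR_OP_GT returns 1 (200 > 123).
 * An empty or absent xattr value parses as 0, so v1 = 0 with OP_EQ also
 * returns 1.  Callers get 1/0 for match/mismatch and -EINVAL for an
 * unknown comparison op. */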
4656
4657int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4658{
4659 string v2s(xattr.c_str(), xattr.length());
4660
4661 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4662
4663 switch (op) {
4664 case CEPH_OSD_CMPXATTR_OP_EQ:
4665 return (v1s.compare(v2s) == 0);
4666 case CEPH_OSD_CMPXATTR_OP_NE:
4667 return (v1s.compare(v2s) != 0);
4668 case CEPH_OSD_CMPXATTR_OP_GT:
4669 return (v1s.compare(v2s) > 0);
4670 case CEPH_OSD_CMPXATTR_OP_GTE:
4671 return (v1s.compare(v2s) >= 0);
4672 case CEPH_OSD_CMPXATTR_OP_LT:
4673 return (v1s.compare(v2s) < 0);
4674 case CEPH_OSD_CMPXATTR_OP_LTE:
4675 return (v1s.compare(v2s) <= 0);
4676 default:
4677 return -EINVAL;
4678 }
4679}
4680
4681int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4682{
4683 ceph_osd_op& op = osd_op.op;
4684 vector<OSDOp> write_ops(1);
4685 OSDOp& write_op = write_ops[0];
4686 uint64_t write_length = op.writesame.length;
4687 int result = 0;
4688
4689 if (!write_length)
4690 return 0;
4691
4692 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4693 return -EINVAL;
4694
4695 if (op.writesame.data_length != osd_op.indata.length()) {
4696 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4697 return -EINVAL;
4698 }
4699
4700 while (write_length) {
4701 write_op.indata.append(osd_op.indata);
4702 write_length -= op.writesame.data_length;
4703 }
4704
4705 write_op.op.op = CEPH_OSD_OP_WRITE;
4706 write_op.op.extent.offset = op.writesame.offset;
4707 write_op.op.extent.length = op.writesame.length;
4708 result = do_osd_ops(ctx, write_ops);
4709 if (result < 0)
4710 derr << "do_writesame do_osd_ops failed " << result << dendl;
4711
4712 return result;
4713}
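/* Example: WRITESAME with offset=0, length=4096, data_length=512 and a
 * 512-byte pattern in indata is rewritten above into a single
 * CEPH_OSD_OP_WRITE of 4096 bytes whose payload is the pattern appended
 * eight times.  length must be a non-zero multiple of data_length and
 * indata must be exactly data_length bytes, else -EINVAL. */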
4714
4715// ========================================================================
4716// low level osd ops
4717
4718int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4719{
4720 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4721 bufferlist header, vals;
4722 int r = _get_tmap(ctx, &header, &vals);
4723 if (r < 0) {
4724 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4725 r = 0;
4726 return r;
4727 }
4728
4729 vector<OSDOp> ops(3);
4730
4731 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4732 ops[0].op.extent.offset = 0;
4733 ops[0].op.extent.length = 0;
4734
4735 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4736 ops[1].indata.claim(header);
4737
4738 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4739 ops[2].indata.claim(vals);
4740
4741 return do_osd_ops(ctx, ops);
4742}
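/* The conversion above reads the legacy tmap blob (an encoded header
 * plus a sorted key/value map) via _get_tmap() and replays it as three
 * ops: TRUNCATE the byte data to 0, OMAPSETHEADER with the old header,
 * and OMAPSETVALS with the old keys.  With CEPH_OSD_TMAP2OMAP_NULLOK a
 * missing tmap (-ENODATA) is treated as an empty object rather than an
 * error. */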
4743
4744int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4745 OSDOp& osd_op, bufferlist& bl)
4746{
4747 // decode
4748 bufferlist header;
4749 map<string, bufferlist> m;
4750 if (bl.length()) {
4751 auto p = bl.cbegin();
4752 decode(header, p);
4753 decode(m, p);
4754 ceph_assert(p.end());
4755 }
4756
4757 // do the update(s)
4758 while (!bp.end()) {
4759 __u8 op;
4760 string key;
4761 decode(op, bp);
4762
4763 switch (op) {
4764 case CEPH_OSD_TMAP_SET: // insert key
4765 {
4766 decode(key, bp);
4767 bufferlist data;
4768 decode(data, bp);
4769 m[key] = data;
4770 }
4771 break;
4772 case CEPH_OSD_TMAP_RM: // remove key
4773 decode(key, bp);
4774 if (!m.count(key)) {
4775 return -ENOENT;
4776 }
4777 m.erase(key);
4778 break;
4779 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4780 decode(key, bp);
4781 m.erase(key);
4782 break;
4783 case CEPH_OSD_TMAP_HDR: // update header
4784 {
4785 decode(header, bp);
4786 }
4787 break;
4788 default:
4789 return -EINVAL;
4790 }
4791 }
4792
4793 // reencode
4794 bufferlist obl;
4795 encode(header, obl);
4796 encode(m, obl);
4797
4798 // write it out
4799 vector<OSDOp> nops(1);
4800 OSDOp& newop = nops[0];
4801 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4802 newop.op.extent.offset = 0;
4803 newop.op.extent.length = obl.length();
4804 newop.indata = obl;
4805 do_osd_ops(ctx, nops);
4806 return 0;
4807}
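/* tmap on-disk layout assumed by the slow path above, as a sketch:
 *
 *   bufferlist tmap;
 *   encode(header, tmap);   // bufferlist header
 *   encode(kv, tmap);       // map<string,bufferlist>, sorted by key
 *
 * The update stream in bp is a sequence of (__u8 op, string key
 * [, bufferlist value]) records; the whole map is decoded, patched in
 * memory, re-encoded, and written back with a single WRITEFULL. */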
4808
4809int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
4810{
4811 bufferlist::const_iterator orig_bp = bp;
4812 int result = 0;
4813 if (bp.end()) {
4814 dout(10) << "tmapup is a no-op" << dendl;
4815 } else {
4816 // read the whole object
4817 vector<OSDOp> nops(1);
4818 OSDOp& newop = nops[0];
4819 newop.op.op = CEPH_OSD_OP_READ;
4820 newop.op.extent.offset = 0;
4821 newop.op.extent.length = 0;
4822 result = do_osd_ops(ctx, nops);
4823
4824 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4825
4826 dout(30) << " starting is \n";
4827 newop.outdata.hexdump(*_dout);
4828 *_dout << dendl;
4829
4830 auto ip = newop.outdata.cbegin();
4831 bufferlist obl;
4832
4833 dout(30) << "the update command is: \n";
4834 osd_op.indata.hexdump(*_dout);
4835 *_dout << dendl;
4836
4837 // header
4838 bufferlist header;
4839 __u32 nkeys = 0;
4840 if (newop.outdata.length()) {
4841 decode(header, ip);
4842 decode(nkeys, ip);
4843 }
4844 dout(10) << "tmapup header " << header.length() << dendl;
4845
4846 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4847 ++bp;
4848 decode(header, bp);
4849 dout(10) << "tmapup new header " << header.length() << dendl;
4850 }
4851
4852 encode(header, obl);
4853
4854 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4855
4856 // update keys
4857 bufferlist newkeydata;
4858 string nextkey, last_in_key;
4859 bufferlist nextval;
4860 bool have_next = false;
4861 if (!ip.end()) {
4862 have_next = true;
4863 decode(nextkey, ip);
4864 decode(nextval, ip);
4865 }
4866 while (!bp.end() && !result) {
4867 __u8 op;
4868 string key;
4869 try {
4870 decode(op, bp);
4871 decode(key, bp);
4872 }
4873 catch (buffer::error& e) {
4874 return -EINVAL;
4875 }
4876 if (key < last_in_key) {
4877 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4878 << "', falling back to an inefficient (unsorted) update" << dendl;
4879 bp = orig_bp;
4880 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4881 }
4882 last_in_key = key;
4883
4884 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4885
4886 // skip existing intervening keys
4887 bool key_exists = false;
4888 while (have_next && !key_exists) {
4889 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4890 if (nextkey > key)
4891 break;
4892 if (nextkey < key) {
4893 // copy untouched.
4894 encode(nextkey, newkeydata);
4895 encode(nextval, newkeydata);
4896 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4897 } else {
4898 // don't copy; discard old value. and stop.
4899 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4900 key_exists = true;
4901 nkeys--;
4902 }
4903 if (!ip.end()) {
4904 decode(nextkey, ip);
4905 decode(nextval, ip);
4906 } else {
4907 have_next = false;
4908 }
4909 }
4910
4911 if (op == CEPH_OSD_TMAP_SET) {
4912 bufferlist val;
4913 try {
4914 decode(val, bp);
4915 }
4916 catch (buffer::error& e) {
4917 return -EINVAL;
4918 }
4919 encode(key, newkeydata);
4920 encode(val, newkeydata);
4921 dout(20) << " set " << key << " " << val.length() << dendl;
4922 nkeys++;
4923 } else if (op == CEPH_OSD_TMAP_CREATE) {
4924 if (key_exists) {
4925 return -EEXIST;
4926 }
4927 bufferlist val;
4928 try {
4929 decode(val, bp);
4930 }
4931 catch (buffer::error& e) {
4932 return -EINVAL;
4933 }
4934 encode(key, newkeydata);
4935 encode(val, newkeydata);
4936 dout(20) << " create " << key << " " << val.length() << dendl;
4937 nkeys++;
4938 } else if (op == CEPH_OSD_TMAP_RM) {
4939 // do nothing.
4940 if (!key_exists) {
4941 return -ENOENT;
4942 }
4943 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4944 // do nothing
4945 } else {
4946 dout(10) << " invalid tmap op " << (int)op << dendl;
4947 return -EINVAL;
4948 }
4949 }
4950
4951 // copy remaining
4952 if (have_next) {
4953 encode(nextkey, newkeydata);
4954 encode(nextval, newkeydata);
4955 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4956 }
4957 if (!ip.end()) {
4958 bufferlist rest;
4959 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4960 dout(20) << " keep trailing " << rest.length()
4961 << " at " << newkeydata.length() << dendl;
4962 newkeydata.claim_append(rest);
4963 }
4964
4965 // encode final key count + key data
4966 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4967 encode(nkeys, obl);
4968 obl.claim_append(newkeydata);
4969
4970 if (0) {
4971 dout(30) << " final is \n";
4972 obl.hexdump(*_dout);
4973 *_dout << dendl;
4974
4975 // sanity check
4976 auto tp = obl.cbegin();
4977 bufferlist h;
4978 decode(h, tp);
4979 map<string,bufferlist> d;
4980 decode(d, tp);
4981 ceph_assert(tp.end());
4982 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4983 }
4984
4985 // write it out
4986 if (!result) {
4987 dout(20) << "tmapput write " << obl.length() << dendl;
4988 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4989 newop.op.extent.offset = 0;
4990 newop.op.extent.length = obl.length();
4991 newop.indata = obl;
4992 do_osd_ops(ctx, nops);
4993 }
4994 }
4995 return result;
4996}
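/* do_tmapup() is a sorted merge: it walks the existing keys and the
 * update records in step, copying untouched keys through, dropping or
 * replacing matched ones, and recounting nkeys as it goes.  The one
 * escape hatch is the key-order check: if the client sent unsorted
 * updates, bp is rewound to orig_bp and the whole object is rebuilt via
 * do_tmapup_slow(). */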
4997
4998static int check_offset_and_length(uint64_t offset, uint64_t length,
4999 uint64_t max, DoutPrefixProvider *dpp)
5000{
5001 if (offset >= max ||
5002 length > max ||
5003 offset + length > max) {
5004 ldpp_dout(dpp, 10) << __func__ << " "
5005 << "osd_max_object_size: " << max
5006 << "; Hard limit of object size is 4GB." << dendl;
5007 return -EFBIG;
5008 }
5009
5010 return 0;
5011}
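/* Note the order of the three tests above: once "offset >= max" and
 * "length > max" have both failed, offset + length is at most 2*max and
 * cannot wrap a uint64_t for any sane osd_max_object_size, so the final
 * range check is safe from unsigned overflow. */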
5012
5013struct FillInVerifyExtent : public Context {
5014 ceph_le64 *r;
5015 int32_t *rval;
5016 bufferlist *outdatap;
5017 std::optional<uint32_t> maybe_crc;
5018 uint64_t size;
5019 OSDService *osd;
5020 hobject_t soid;
5021 uint32_t flags;
5022 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5023 std::optional<uint32_t> mc, uint64_t size,
5024 OSDService *osd, hobject_t soid, uint32_t flags) :
5025 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5026 size(size), osd(osd), soid(soid), flags(flags) {}
5027 void finish(int len) override {
5028 *r = len;
5029 if (len < 0) {
5030 *rval = len;
5031 return;
5032 }
5033 *rval = 0;
5034
5035 // whole object? can we verify the checksum?
5036 if (maybe_crc && *r == size) {
5037 uint32_t crc = outdatap->crc32c(-1);
5038 if (maybe_crc != crc) {
5039 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5040 << " != expected 0x" << *maybe_crc
5041 << std::dec << " on " << soid;
5042 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5043 *rval = -EIO;
5044 *r = 0;
5045 }
5046 }
5047 }
5048 }
5049};
5050
5051struct ToSparseReadResult : public Context {
5052 int* result;
5053 bufferlist* data_bl;
5054 uint64_t data_offset;
5055 ceph_le64* len;
5056 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5057 ceph_le64* len)
5058 : result(result), data_bl(bl), data_offset(offset), len(len) {}
5059 void finish(int r) override {
5060 if (r < 0) {
5061 *result = r;
5062 return;
5063 }
5064 *result = 0;
5065 *len = r;
5066 bufferlist outdata;
5067 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5068 encode(extents, outdata);
5069 ::encode_destructively(*data_bl, outdata);
5070 data_bl->swap(outdata);
5071 }
5072};
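/* The sparse-read wire format produced above is an encoded extent map
 * followed by the packed data.  A minimal client-side decode sketch:
 *
 *   auto p = reply_bl.cbegin();
 *   std::map<uint64_t, uint64_t> extents;   // offset -> length
 *   bufferlist data;
 *   decode(extents, p);
 *   decode(data, p);
 *
 * Here there is exactly one extent, {data_offset, r}, since a plain
 * contiguous read is being repackaged as a sparse-read reply. */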
5073
5074template<typename V>
5075static string list_keys(const map<string, V>& m) {
5076 string s;
5077 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5078 if (!s.empty()) {
5079 s.push_back(',');
5080 }
5081 s.append(itr->first);
5082 }
5083 return s;
5084}
5085
5086template<typename T>
5087static string list_entries(const T& m) {
5088 string s;
5089 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5090 if (!s.empty()) {
5091 s.push_back(',');
5092 }
5093 s.append(*itr);
5094 }
5095 return s;
5096}
5097
5098void PrimaryLogPG::maybe_create_new_object(
5099 OpContext *ctx,
5100 bool ignore_transaction)
5101{
5102 ObjectState& obs = ctx->new_obs;
5103 if (!obs.exists) {
5104 ctx->delta_stats.num_objects++;
5105 obs.exists = true;
5106 ceph_assert(!obs.oi.is_whiteout());
5107 obs.oi.new_object();
5108 if (!ignore_transaction)
5109 ctx->op_t->create(obs.oi.soid);
5110 } else if (obs.oi.is_whiteout()) {
5111 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5112 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5113 --ctx->delta_stats.num_whiteouts;
5114 }
5115}
5116
5117struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5118 OSDOp& osd_op;
5119
5120 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5121 }
5122
5123 int execute() override {
5124 return osd_op.rval;
5125 }
5126};
5127
5128struct C_ChecksumRead : public Context {
5129 PrimaryLogPG *primary_log_pg;
5130 OSDOp &osd_op;
5131 Checksummer::CSumType csum_type;
5132 bufferlist init_value_bl;
5133 ceph_le64 read_length;
5134 bufferlist read_bl;
5135 Context *fill_extent_ctx;
5136
5137 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5138 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5139 std::optional<uint32_t> maybe_crc, uint64_t size,
5140 OSDService *osd, hobject_t soid, uint32_t flags)
5141 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5142 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5143 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5144 &read_bl, maybe_crc, size,
5145 osd, soid, flags)) {
5146 }
5147 ~C_ChecksumRead() override {
5148 delete fill_extent_ctx;
5149 }
5150
5151 void finish(int r) override {
5152 fill_extent_ctx->complete(r);
5153 fill_extent_ctx = nullptr;
5154
5155 if (osd_op.rval >= 0) {
5156 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5157 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5158 &init_value_bl_it, read_bl);
5159 }
5160 }
5161};

int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
                              bufferlist::const_iterator *bl_it)
{
  dout(20) << __func__ << dendl;

  auto& op = osd_op.op;
  if (op.checksum.chunk_size > 0) {
    if (op.checksum.length == 0) {
      dout(10) << __func__ << ": length required when chunk size provided"
               << dendl;
      return -EINVAL;
    }
    if (op.checksum.length % op.checksum.chunk_size != 0) {
      dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
      return -EINVAL;
    }
  }

  auto& oi = ctx->new_obs.oi;
  if (op.checksum.offset == 0 && op.checksum.length == 0) {
    // zeroed offset+length implies checksum whole object
    op.checksum.length = oi.size;
  } else if (op.checksum.offset >= oi.size) {
    // read size was trimmed to zero, do nothing
    // see PrimaryLogPG::do_read
    return 0;
  } else if (op.extent.offset + op.extent.length > oi.size) {
    op.extent.length = oi.size - op.extent.offset;
    if (op.checksum.chunk_size > 0 &&
        op.checksum.length % op.checksum.chunk_size != 0) {
      dout(10) << __func__ << ": length (trimmed to 0x"
               << std::hex << op.checksum.length
               << ") not aligned to chunk size 0x"
               << op.checksum.chunk_size << std::dec
               << dendl;
      return -EINVAL;
    }
  }

  Checksummer::CSumType csum_type;
  switch (op.checksum.type) {
  case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
    csum_type = Checksummer::CSUM_XXHASH32;
    break;
  case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
    csum_type = Checksummer::CSUM_XXHASH64;
    break;
  case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
    csum_type = Checksummer::CSUM_CRC32C;
    break;
  default:
    dout(10) << __func__ << ": unknown crc type ("
             << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
    return -EINVAL;
  }

  size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
  if (bl_it->get_remaining() < csum_init_value_size) {
    dout(10) << __func__ << ": init value not provided" << dendl;
    return -EINVAL;
  }

  bufferlist init_value_bl;
  init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
                          csum_init_value_size);
  *bl_it += csum_init_value_size;

  if (pool.info.is_erasure() && op.checksum.length > 0) {
    // If there is a data digest and it is possible we are reading the
    // entire object, pass the digest.
    std::optional<uint32_t> maybe_crc;
    if (oi.is_data_digest() && op.checksum.offset == 0 &&
        op.checksum.length >= oi.size) {
      maybe_crc = oi.data_digest;
    }

    // async read
    auto& soid = oi.soid;
    auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
                                           std::move(init_value_bl), maybe_crc,
                                           oi.size, osd, soid, op.flags);

    ctx->pending_async_reads.push_back({
      {op.checksum.offset, op.checksum.length, op.flags},
      {&checksum_ctx->read_bl, checksum_ctx}});

    dout(10) << __func__ << ": async_read noted for " << soid << dendl;
    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
    return -EINPROGRESS;
  }

  // sync read
  std::vector<OSDOp> read_ops(1);
  auto& read_op = read_ops[0];
  if (op.checksum.length > 0) {
    read_op.op.op = CEPH_OSD_OP_READ;
    read_op.op.flags = op.flags;
    read_op.op.extent.offset = op.checksum.offset;
    read_op.op.extent.length = op.checksum.length;
    read_op.op.extent.truncate_size = 0;
    read_op.op.extent.truncate_seq = 0;

    int r = do_osd_ops(ctx, read_ops);
    if (r < 0) {
      derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
      return r;
    }
  }

  bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
  return finish_checksum(osd_op, csum_type, &init_value_bl_it,
                         read_op.outdata);
}
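
// Illustration (hedged, not upstream code): a sketch of how a librados client
// might drive this path. The signature of ObjectReadOperation::checksum() and
// the LIBRADOS_CHECKSUM_TYPE_* constant are taken from librados.hpp as the
// editor understands it; treat the exact shapes as assumptions:
//
//   librados::ObjectReadOperation rop;
//   bufferlist init_bl, out_bl;
//   encode((uint32_t)-1, init_bl);                // CRC32C init value
//   int rval;
//   rop.checksum(LIBRADOS_CHECKSUM_TYPE_CRC32C, init_bl,
//                0, 0, 0, &out_bl, &rval);        // off=0,len=0 => whole object
//   ioctx.operate("obj", &rop, nullptr);
//   // out_bl holds a u32 count followed by `count` checksum values
//   // (see finish_checksum below).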

int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
                                  Checksummer::CSumType csum_type,
                                  bufferlist::const_iterator *init_value_bl_it,
                                  const bufferlist &read_bl) {
  dout(20) << __func__ << dendl;

  auto& op = osd_op.op;

  if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
    derr << __func__ << ": bytes read " << read_bl.length() << " != "
         << op.checksum.length << dendl;
    return -EINVAL;
  }

  size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
                              op.checksum.chunk_size : read_bl.length());
  uint32_t csum_count = (csum_chunk_size > 0 ?
                           read_bl.length() / csum_chunk_size : 0);

  bufferlist csum;
  bufferptr csum_data;
  if (csum_count > 0) {
    size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
    csum_data = buffer::create(csum_value_size * csum_count);
    csum_data.zero();
    csum.append(csum_data);

    switch (csum_type) {
    case Checksummer::CSUM_XXHASH32:
      {
        Checksummer::xxhash32::init_value_t init_value;
        decode(init_value, *init_value_bl_it);
        Checksummer::calculate<Checksummer::xxhash32>(
          init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
          &csum_data);
      }
      break;
    case Checksummer::CSUM_XXHASH64:
      {
        Checksummer::xxhash64::init_value_t init_value;
        decode(init_value, *init_value_bl_it);
        Checksummer::calculate<Checksummer::xxhash64>(
          init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
          &csum_data);
      }
      break;
    case Checksummer::CSUM_CRC32C:
      {
        Checksummer::crc32c::init_value_t init_value;
        decode(init_value, *init_value_bl_it);
        Checksummer::calculate<Checksummer::crc32c>(
          init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
          &csum_data);
      }
      break;
    default:
      break;
    }
  }

  encode(csum_count, osd_op.outdata);
  osd_op.outdata.claim_append(csum);
  return 0;
}
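
// Illustration (hedged, not upstream code): the CHECKSUM reply built above is
// a u32 count followed by `count` fixed-size checksum values. A sketch of the
// client-side decode for the CRC32C case (the value type is uint32_t; for
// xxhash64 it would be uint64_t):
//
//   auto p = osd_op.outdata.cbegin();
//   uint32_t count;
//   decode(count, p);                 // chunks covered by the request
//   std::vector<uint32_t> crcs(count);
//   for (auto& c : crcs)
//     decode(c, p);                   // one value per chunk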

struct C_ExtentCmpRead : public Context {
  PrimaryLogPG *primary_log_pg;
  OSDOp &osd_op;
  ceph_le64 read_length{};
  bufferlist read_bl;
  Context *fill_extent_ctx;

  C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
                  std::optional<uint32_t> maybe_crc, uint64_t size,
                  OSDService *osd, hobject_t soid, uint32_t flags)
    : primary_log_pg(primary_log_pg), osd_op(osd_op),
      fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
                                             &read_bl, maybe_crc, size,
                                             osd, soid, flags)) {
  }
  ~C_ExtentCmpRead() override {
    delete fill_extent_ctx;
  }

  void finish(int r) override {
    if (r == -ENOENT) {
      osd_op.rval = 0;
      read_bl.clear();
      delete fill_extent_ctx;
    } else {
      fill_extent_ctx->complete(r);
    }
    fill_extent_ctx = nullptr;

    if (osd_op.rval >= 0) {
      osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
    }
  }
};

int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
{
  dout(20) << __func__ << dendl;
  ceph_osd_op& op = osd_op.op;

  auto& oi = ctx->new_obs.oi;
  uint64_t size = oi.size;
  if ((oi.truncate_seq < op.extent.truncate_seq) &&
      (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
    size = op.extent.truncate_size;
  }

  if (op.extent.offset >= size) {
    op.extent.length = 0;
  } else if (op.extent.offset + op.extent.length > size) {
    op.extent.length = size - op.extent.offset;
  }

  if (op.extent.length == 0) {
    dout(20) << __func__ << " zero length extent" << dendl;
    return finish_extent_cmp(osd_op, bufferlist{});
  } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
    dout(20) << __func__ << " object DNE" << dendl;
    return finish_extent_cmp(osd_op, {});
  } else if (pool.info.is_erasure()) {
    // If there is a data digest and it is possible we are reading the
    // entire object, pass the digest.
    std::optional<uint32_t> maybe_crc;
    if (oi.is_data_digest() && op.checksum.offset == 0 &&
        op.checksum.length >= oi.size) {
      maybe_crc = oi.data_digest;
    }

    // async read
    auto& soid = oi.soid;
    auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
                                              osd, soid, op.flags);
    ctx->pending_async_reads.push_back({
      {op.extent.offset, op.extent.length, op.flags},
      {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});

    dout(10) << __func__ << ": async_read noted for " << soid << dendl;

    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
    return -EINPROGRESS;
  }

  // sync read
  vector<OSDOp> read_ops(1);
  OSDOp& read_op = read_ops[0];

  read_op.op.op = CEPH_OSD_OP_SYNC_READ;
  read_op.op.extent.offset = op.extent.offset;
  read_op.op.extent.length = op.extent.length;
  read_op.op.extent.truncate_seq = op.extent.truncate_seq;
  read_op.op.extent.truncate_size = op.extent.truncate_size;

  int result = do_osd_ops(ctx, read_ops);
  if (result < 0) {
    derr << __func__ << " failed " << result << dendl;
    return result;
  }
  return finish_extent_cmp(osd_op, read_op.outdata);
}

int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
{
  for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
    char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
    if (osd_op.indata[idx] != read_byte) {
      return (-MAX_ERRNO - idx);
    }
  }

  return 0;
}
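
// Illustration (hedged, not upstream code): on the first mismatching byte the
// comparison returns -MAX_ERRNO - idx, packing the mismatch offset below the
// errno range so it survives the rval plumbing. A sketch of recovering the
// offset on the receiving side:
//
//   int rval = ...;                   // per-op result from the CMPEXT op
//   if (rval <= -MAX_ERRNO) {
//     uint64_t mismatch_off = (uint64_t)(-rval) - MAX_ERRNO;
//     // first differing byte is at mismatch_off
//   }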

int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;
  __u32 seq = oi.truncate_seq;
  uint64_t size = oi.size;
  bool trimmed_read = false;

  dout(30) << __func__ << " oi.size: " << oi.size << dendl;
  dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
  dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
  dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;

  // are we beyond truncate_size?
  if ( (seq < op.extent.truncate_seq) &&
       (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
       (size > op.extent.truncate_size) )
    size = op.extent.truncate_size;

  if (op.extent.length == 0) // a length of zero means read the whole object
    op.extent.length = size;

  if (op.extent.offset >= size) {
    op.extent.length = 0;
    trimmed_read = true;
  } else if (op.extent.offset + op.extent.length > size) {
    op.extent.length = size - op.extent.offset;
    trimmed_read = true;
  }

  dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;

  // read into a buffer
  int result = 0;
  if (trimmed_read && op.extent.length == 0) {
    // The read size was trimmed to zero, so nothing should be done. A read
    // of 0 bytes does *not* mean "do nothing" (it means read the whole
    // object), which is why the trimmed_read flag is needed.
  } else if (pool.info.is_erasure()) {
    // The initialisation below is required to silence a false positive
    // -Wmaybe-uninitialized warning
    std::optional<uint32_t> maybe_crc;
    // If there is a data digest and it is possible we are reading the
    // entire object, pass the digest. FillInVerifyExtent will check
    // oi.size again.
    if (oi.is_data_digest() && op.extent.offset == 0 &&
        op.extent.length >= oi.size)
      maybe_crc = oi.data_digest;
    ctx->pending_async_reads.push_back(
      make_pair(
        boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
        make_pair(&osd_op.outdata,
                  new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
                                         &osd_op.outdata, maybe_crc, oi.size,
                                         osd, soid, op.flags))));
    dout(10) << " async_read noted for " << soid << dendl;

    ctx->op_finishers[ctx->current_osd_subop_num].reset(
      new ReadFinisher(osd_op));
  } else {
    int r = pgbackend->objects_read_sync(
      soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
    // whole object? can we verify the checksum?
    if (r >= 0 && op.extent.offset == 0 &&
        (uint64_t)r == oi.size && oi.is_data_digest()) {
      uint32_t crc = osd_op.outdata.crc32c(-1);
      if (oi.data_digest != crc) {
        osd->clog->error() << info.pgid << std::hex
                           << " full-object read crc 0x" << crc
                           << " != expected 0x" << oi.data_digest
                           << std::dec << " on " << soid;
        r = -EIO; // try repair later
      }
    }
    if (r == -EIO) {
      r = rep_repair_primary_object(soid, ctx);
    }
    if (r >= 0)
      op.extent.length = r;
    else if (r == -EAGAIN) {
      result = -EAGAIN;
    } else {
      result = r;
      op.extent.length = 0;
    }
    dout(10) << " read got " << r << " / " << op.extent.length
             << " bytes from obj " << soid << dendl;
  }
  if (result >= 0) {
    ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
    ctx->delta_stats.num_rd++;
  }
  return result;
}
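
// Illustration (hedged): a worked example of the trimming logic above,
// assuming an object with oi.size = 100:
//   read(off=0,   len=0)  -> len becomes 100 (zero means "whole object")
//   read(off=90,  len=50) -> len trimmed to 10, trimmed_read = true
//   read(off=120, len=10) -> len trimmed to 0; trimmed_read marks that the
//                            zero came from trimming, so the read is skipped
//                            instead of being re-read as "whole object"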

int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
  dout(20) << __func__ << dendl;
  auto& op = osd_op.op;
  auto& oi = ctx->new_obs.oi;
  auto& soid = oi.soid;

  if (op.extent.truncate_seq) {
    dout(0) << "sparse_read does not support truncation sequence " << dendl;
    return -EINVAL;
  }

  ++ctx->num_read;
  if (pool.info.is_erasure()) {
    // translate sparse read to a normal one if not supported
    uint64_t offset = op.extent.offset;
    uint64_t length = op.extent.length;
    if (offset > oi.size) {
      length = 0;
    } else if (offset + length > oi.size) {
      length = oi.size - offset;
    }

    if (length > 0) {
      ctx->pending_async_reads.push_back(
        make_pair(
          boost::make_tuple(offset, length, op.flags),
          make_pair(
            &osd_op.outdata,
            new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
                                   &op.extent.length))));
      dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;

      ctx->op_finishers[ctx->current_osd_subop_num].reset(
        new ReadFinisher(osd_op));
    } else {
      dout(10) << " sparse read ended up empty for " << soid << dendl;
      map<uint64_t, uint64_t> extents;
      encode(extents, osd_op.outdata);
    }
  } else {
    // read into a buffer
    map<uint64_t, uint64_t> m;
    int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
                                              info.pgid.shard),
                               op.extent.offset, op.extent.length, m);
    if (r < 0) {
      return r;
    }

    bufferlist data_bl;
    r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
    if (r == -EIO) {
      r = rep_repair_primary_object(soid, ctx);
    }
    if (r < 0) {
      return r;
    }

    // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
    // and while a fresh pool may hold few fully-written objects, more and
    // more objects become whole over time. From that point on, verifying the
    // data digest on sparse-read makes sense.
    if ((uint64_t)r == oi.size && oi.is_data_digest()) {
      uint32_t crc = data_bl.crc32c(-1);
      if (oi.data_digest != crc) {
        osd->clog->error() << info.pgid << std::hex
                           << " full-object read crc 0x" << crc
                           << " != expected 0x" << oi.data_digest
                           << std::dec << " on " << soid;
        r = rep_repair_primary_object(soid, ctx);
        if (r < 0) {
          return r;
        }
      }
    }

    op.extent.length = r;

    encode(m, osd_op.outdata); // re-encode since it might be modified
    ::encode_destructively(data_bl, osd_op.outdata);

    dout(10) << " sparse_read got " << r << " bytes from object "
             << soid << dendl;
  }

  ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
  ctx->delta_stats.num_rd++;
  return 0;
}
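
// Illustration (hedged): on a replicated pool the sparse map comes from
// ObjectStore::fiemap(), so holes reflect the on-disk allocation; on an
// erasure-coded pool the op degrades to one synthetic extent covering the
// requested range (see ToSparseReadResult above). Clients should therefore
// treat the extent map as an upper bound on where data may live, not an
// exact hole map.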

int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
{
  int result = 0;
  SnapSetContext *ssc = ctx->obc->ssc;
  ObjectState& obs = ctx->new_obs;
  object_info_t& oi = obs.oi;
  const hobject_t& soid = oi.soid;
  const bool skip_data_digest = osd->store->has_builtin_csum() &&
    osd->osd_skip_data_digest;

  PGTransaction* t = ctx->op_t.get();

  dout(10) << "do_osd_op " << soid << " " << ops << dendl;

  ctx->current_osd_subop_num = 0;
  for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
    OSDOp& osd_op = *p;
    ceph_osd_op& op = osd_op.op;

    OpFinisher* op_finisher = nullptr;
    {
      auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
      if (op_finisher_it != ctx->op_finishers.end()) {
        op_finisher = op_finisher_it->second.get();
      }
    }

    // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
    // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
    // but the code in this function seems to treat them as native-endian. What should the
    // tracepoints do?
    tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);

    dout(10) << "do_osd_op " << osd_op << dendl;

    auto bp = osd_op.indata.cbegin();

    // user-visible modification?
    switch (op.op) {
      // non user-visible modifications
    case CEPH_OSD_OP_WATCH:
    case CEPH_OSD_OP_CACHE_EVICT:
    case CEPH_OSD_OP_CACHE_FLUSH:
    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
    case CEPH_OSD_OP_UNDIRTY:
    case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
    case CEPH_OSD_OP_COPY_FROM2:
    case CEPH_OSD_OP_CACHE_PIN:
    case CEPH_OSD_OP_CACHE_UNPIN:
    case CEPH_OSD_OP_SET_REDIRECT:
    case CEPH_OSD_OP_TIER_PROMOTE:
    case CEPH_OSD_OP_TIER_FLUSH:
      break;
    default:
      if (op.op & CEPH_OSD_OP_MODE_WR)
        ctx->user_modify = true;
    }

    // munge -1 truncate to 0 truncate
    if (ceph_osd_op_uses_extent(op.op) &&
        op.extent.truncate_seq == 1 &&
        op.extent.truncate_size == (-1ULL)) {
      op.extent.truncate_size = 0;
      op.extent.truncate_seq = 0;
    }

    // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
    if (op.op == CEPH_OSD_OP_ZERO &&
        obs.exists &&
        op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
        op.extent.length >= 1 &&
        op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
        op.extent.offset + op.extent.length >= oi.size) {
      if (op.extent.offset >= oi.size) {
        // no-op
        goto fail;
      }
      dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
               << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
      op.op = CEPH_OSD_OP_TRUNCATE;
    }
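
    // Illustration (hedged): a worked example of the munge above, for an
    // object of size 100 (osd_max_object_size permitting):
    //   zero(off=50, len=60)  -> offset+length (110) >= size, so the op
    //                            becomes truncate(50)
    //   zero(off=100, len=10) -> offset >= size, treated as a no-op (jumps
    //                            to the fail label with the current result)
    //   zero(off=10, len=20)  -> the tail [30,100) survives, so no munge;
    //                            handled by the ZERO case below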

    switch (op.op) {

      // --- READS ---

    case CEPH_OSD_OP_CMPEXT:
      ++ctx->num_read;
      tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
                 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
                 op.extent.length, op.extent.truncate_size,
                 op.extent.truncate_seq);

      if (op_finisher == nullptr) {
        result = do_extent_cmp(ctx, osd_op);
      } else {
        result = op_finisher->execute();
      }
      break;

    case CEPH_OSD_OP_SYNC_READ:
      if (pool.info.is_erasure()) {
        result = -EOPNOTSUPP;
        break;
      }
      // fall through
    case CEPH_OSD_OP_READ:
      ++ctx->num_read;
      tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
                 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
                 op.extent.length, op.extent.truncate_size,
                 op.extent.truncate_seq);
      if (op_finisher == nullptr) {
        if (!ctx->data_off) {
          ctx->data_off = op.extent.offset;
        }
        result = do_read(ctx, osd_op);
      } else {
        result = op_finisher->execute();
      }
      break;

    case CEPH_OSD_OP_CHECKSUM:
      ++ctx->num_read;
      {
        tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
                   soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
                   op.checksum.offset, op.checksum.length,
                   op.checksum.chunk_size);

        if (op_finisher == nullptr) {
          result = do_checksum(ctx, osd_op, &bp);
        } else {
          result = op_finisher->execute();
        }
      }
      break;

      /* map extents */
    case CEPH_OSD_OP_MAPEXT:
      tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
      if (pool.info.is_erasure()) {
        result = -EOPNOTSUPP;
        break;
      }
      ++ctx->num_read;
      {
        // read into a buffer
        bufferlist bl;
        int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
                                                  info.pgid.shard),
                                   op.extent.offset, op.extent.length, bl);
        osd_op.outdata.claim(bl);
        if (r < 0)
          result = r;
        else
          ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
        ctx->delta_stats.num_rd++;
        dout(10) << " map_extents done on object " << soid << dendl;
      }
      break;

      /* map extents */
    case CEPH_OSD_OP_SPARSE_READ:
      tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
                 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
                 op.extent.length, op.extent.truncate_size,
                 op.extent.truncate_seq);
      if (op_finisher == nullptr) {
        result = do_sparse_read(ctx, osd_op);
      } else {
        result = op_finisher->execute();
      }
      break;

    case CEPH_OSD_OP_CALL:
      {
        string cname, mname;
        bufferlist indata;
        try {
          bp.copy(op.cls.class_len, cname);
          bp.copy(op.cls.method_len, mname);
          bp.copy(op.cls.indata_len, indata);
        } catch (buffer::error& e) {
          dout(10) << "call unable to decode class + method + indata" << dendl;
          dout(30) << "in dump: ";
          osd_op.indata.hexdump(*_dout);
          *_dout << dendl;
          result = -EINVAL;
          tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
          break;
        }
        tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());

        ClassHandler::ClassData *cls;
        result = ClassHandler::get_instance().open_class(cname, &cls);
        ceph_assert(result == 0); // init_op_flags() already verified this works.

        ClassHandler::ClassMethod *method = cls->get_method(mname);
        if (!method) {
          dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
          result = -EOPNOTSUPP;
          break;
        }

        int flags = method->get_flags();
        if (flags & CLS_METHOD_WR)
          ctx->user_modify = true;

        bufferlist outdata;
        dout(10) << "call method " << cname << "." << mname << dendl;
        int prev_rd = ctx->num_read;
        int prev_wr = ctx->num_write;
        result = method->exec((cls_method_context_t)&ctx, indata, outdata);

        if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
          derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
          result = -EIO;
          break;
        }
        if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
          derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
          result = -EIO;
          break;
        }

        dout(10) << "method called response length=" << outdata.length() << dendl;
        op.extent.length = outdata.length();
        osd_op.outdata.claim_append(outdata);
        dout(30) << "out dump: ";
        osd_op.outdata.hexdump(*_dout);
        *_dout << dendl;
      }
      break;
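
    // Illustration (hedged, not upstream code): the RD/WR enforcement above
    // relies on the flags a class method declared at registration time. A
    // sketch of the objclass side; the API shape follows objclass.h as the
    // editor understands it, so treat the details as assumptions:
    //
    //   cls_method_handle_t h;
    //   cls_register_cxx_method(handle, "get_meta",
    //                           CLS_METHOD_RD,          // read-only method
    //                           get_meta_impl, &h);     // hypothetical impl
    //
    // A method registered CLS_METHOD_RD-only that issues a write op would
    // trip the "tried to update object but is not marked WR" check (-EIO).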

    case CEPH_OSD_OP_STAT:
      // note: stat does not require RD
      {
        tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);

        if (obs.exists && !oi.is_whiteout()) {
          encode(oi.size, osd_op.outdata);
          encode(oi.mtime, osd_op.outdata);
          dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
        } else {
          result = -ENOENT;
          dout(10) << "stat oi object does not exist" << dendl;
        }

        ctx->delta_stats.num_rd++;
      }
      break;

    case CEPH_OSD_OP_ISDIRTY:
      ++ctx->num_read;
      {
        tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
        bool is_dirty = obs.oi.is_dirty();
        encode(is_dirty, osd_op.outdata);
        ctx->delta_stats.num_rd++;
        result = 0;
      }
      break;

    case CEPH_OSD_OP_UNDIRTY:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
        if (oi.is_dirty()) {
          ctx->undirty = true; // see make_writeable()
          ctx->modify = true;
          ctx->delta_stats.num_wr++;
        }
      }
      break;

    case CEPH_OSD_OP_CACHE_TRY_FLUSH:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
        if (ctx->lock_type != RWState::RWNONE) {
          dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
          result = -EINVAL;
          break;
        }
        if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
          result = -EINVAL;
          break;
        }
        if (!obs.exists) {
          result = 0;
          break;
        }
        if (oi.is_cache_pinned()) {
          dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
          result = -EPERM;
          break;
        }
        if (oi.is_dirty()) {
          result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
          if (result == -EINPROGRESS)
            result = -EAGAIN;
        } else {
          result = 0;
        }
      }
      break;

    case CEPH_OSD_OP_CACHE_FLUSH:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
        if (ctx->lock_type == RWState::RWNONE) {
          dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
          result = -EINVAL;
          break;
        }
        if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
          result = -EINVAL;
          break;
        }
        if (!obs.exists) {
          result = 0;
          break;
        }
        if (oi.is_cache_pinned()) {
          dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
          result = -EPERM;
          break;
        }
        hobject_t missing;
        if (oi.is_dirty()) {
          result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
          if (result == -EINPROGRESS)
            result = -EAGAIN;
        } else {
          result = 0;
        }
        // Check special return value which has set missing_return
        if (result == -ENOENT) {
          dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
          ceph_assert(!missing.is_min());
          wait_for_unreadable_object(missing, ctx->op);
          // Error code which is used elsewhere when wait_for_unreadable_object() is used
          result = -EAGAIN;
        }
      }
      break;

    case CEPH_OSD_OP_CACHE_EVICT:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
        if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
          result = -EINVAL;
          break;
        }
        if (!obs.exists) {
          result = 0;
          break;
        }
        if (oi.is_cache_pinned()) {
          dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
          result = -EPERM;
          break;
        }
        if (oi.is_dirty()) {
          result = -EBUSY;
          break;
        }
        if (!oi.watchers.empty()) {
          result = -EBUSY;
          break;
        }
        if (soid.snap == CEPH_NOSNAP) {
          result = _verify_no_head_clones(soid, ssc->snapset);
          if (result < 0)
            break;
        }
        result = _delete_oid(ctx, true, false);
        if (result >= 0) {
          // mark that this is a cache eviction to avoid triggering normal
          // make_writeable() clone creation in finish_ctx()
          ctx->cache_evict = true;
        }
        osd->logger->inc(l_osd_tier_evict);
      }
      break;

    case CEPH_OSD_OP_GETXATTR:
      ++ctx->num_read;
      {
        string aname;
        bp.copy(op.xattr.name_len, aname);
        tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
        string name = "_" + aname;
        int r = getattr_maybe_cache(
          ctx->obc,
          name,
          &(osd_op.outdata));
        if (r >= 0) {
          op.xattr.value_len = osd_op.outdata.length();
          result = 0;
          ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
        } else
          result = r;

        ctx->delta_stats.num_rd++;
      }
      break;

    case CEPH_OSD_OP_GETXATTRS:
      ++ctx->num_read;
      {
        tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
        map<string, bufferlist> out;
        result = getattrs_maybe_cache(
          ctx->obc,
          &out);

        bufferlist bl;
        encode(out, bl);
        ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
        ctx->delta_stats.num_rd++;
        osd_op.outdata.claim_append(bl);
      }
      break;

    case CEPH_OSD_OP_CMPXATTR:
      ++ctx->num_read;
      {
        string aname;
        bp.copy(op.xattr.name_len, aname);
        tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
        string name = "_" + aname;
        name[op.xattr.name_len + 1] = 0;

        bufferlist xattr;
        result = getattr_maybe_cache(
          ctx->obc,
          name,
          &xattr);
        if (result < 0 && result != -EEXIST && result != -ENODATA)
          break;

        ctx->delta_stats.num_rd++;
        ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);

        switch (op.xattr.cmp_mode) {
        case CEPH_OSD_CMPXATTR_MODE_STRING:
          {
            string val;
            bp.copy(op.xattr.value_len, val);
            val[op.xattr.value_len] = 0;
            dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
                     << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
            result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
          }
          break;

        case CEPH_OSD_CMPXATTR_MODE_U64:
          {
            uint64_t u64val;
            try {
              decode(u64val, bp);
            }
            catch (buffer::error& e) {
              result = -EINVAL;
              goto fail;
            }
            dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
                     << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
            result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
          }
          break;

        default:
          dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
          result = -EINVAL;
        }

        if (!result) {
          dout(10) << "comparison returned false" << dendl;
          result = -ECANCELED;
          break;
        }
        if (result < 0) {
          dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
          break;
        }

        dout(10) << "comparison returned true" << dendl;
      }
      break;

    case CEPH_OSD_OP_ASSERT_VER:
      ++ctx->num_read;
      {
        uint64_t ver = op.assert_ver.ver;
        tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
        if (!ver)
          result = -EINVAL;
        else if (ver < oi.user_version)
          result = -ERANGE;
        else if (ver > oi.user_version)
          result = -EOVERFLOW;
      }
      break;
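
    // Illustration (hedged): worked example of the assert-ver results above,
    // for an object whose current user_version is 5:
    //   assert_ver(0) -> -EINVAL    (0 is never a valid version)
    //   assert_ver(3) -> -ERANGE    (asserted version is older)
    //   assert_ver(7) -> -EOVERFLOW (asserted version is newer)
    //   assert_ver(5) -> 0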

    case CEPH_OSD_OP_LIST_WATCHERS:
      ++ctx->num_read;
      {
        tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
        obj_list_watch_response_t resp;

        map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
        for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
             ++oi_iter) {
          dout(20) << "key cookie=" << oi_iter->first.first
                   << " entity=" << oi_iter->first.second << " "
                   << oi_iter->second << dendl;
          ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
          ceph_assert(oi_iter->first.second.is_client());

          watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
                          oi_iter->second.timeout_seconds, oi_iter->second.addr);
          resp.entries.push_back(wi);
        }

        resp.encode(osd_op.outdata, ctx->get_features());
        result = 0;

        ctx->delta_stats.num_rd++;
        break;
      }

    case CEPH_OSD_OP_LIST_SNAPS:
      ++ctx->num_read;
      {
        tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
        obj_list_snap_response_t resp;

        if (!ssc) {
          ssc = ctx->obc->ssc = get_snapset_context(soid, false);
        }
        ceph_assert(ssc);
        dout(20) << " snapset " << ssc->snapset << dendl;

        int clonecount = ssc->snapset.clones.size();
        clonecount++; // for head
        resp.clones.reserve(clonecount);
        for (auto clone_iter = ssc->snapset.clones.begin();
             clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
          clone_info ci;
          ci.cloneid = *clone_iter;

          hobject_t clone_oid = soid;
          clone_oid.snap = *clone_iter;

          auto p = ssc->snapset.clone_snaps.find(*clone_iter);
          if (p == ssc->snapset.clone_snaps.end()) {
            osd->clog->error() << "osd." << osd->whoami
                               << ": inconsistent clone_snaps found for oid "
                               << soid << " clone " << *clone_iter
                               << " snapset " << ssc->snapset;
            result = -EINVAL;
            break;
          }
          for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
            ci.snaps.push_back(*q);
          }

          dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;

          map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
          coi = ssc->snapset.clone_overlap.find(ci.cloneid);
          if (coi == ssc->snapset.clone_overlap.end()) {
            osd->clog->error() << "osd." << osd->whoami
                               << ": inconsistent clone_overlap found for oid "
                               << soid << " clone " << *clone_iter;
            result = -EINVAL;
            break;
          }
          const interval_set<uint64_t> &o = coi->second;
          ci.overlap.reserve(o.num_intervals());
          for (interval_set<uint64_t>::const_iterator r = o.begin();
               r != o.end(); ++r) {
            ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
                                                         r.get_len()));
          }

          map<snapid_t, uint64_t>::const_iterator si;
          si = ssc->snapset.clone_size.find(ci.cloneid);
          if (si == ssc->snapset.clone_size.end()) {
            osd->clog->error() << "osd." << osd->whoami
                               << ": inconsistent clone_size found for oid "
                               << soid << " clone " << *clone_iter;
            result = -EINVAL;
            break;
          }
          ci.size = si->second;

          resp.clones.push_back(ci);
        }
        if (result < 0) {
          break;
        }
        if (!ctx->obc->obs.oi.is_whiteout()) {
          ceph_assert(obs.exists);
          clone_info ci;
          ci.cloneid = CEPH_NOSNAP;

          // Size for HEAD is oi.size
          ci.size = oi.size;

          resp.clones.push_back(ci);
        }
        resp.seq = ssc->snapset.seq;

        resp.encode(osd_op.outdata);
        result = 0;

        ctx->delta_stats.num_rd++;
        break;
      }

    case CEPH_OSD_OP_NOTIFY:
      ++ctx->num_read;
      {
        uint32_t timeout;
        bufferlist bl;

        try {
          uint32_t ver; // obsolete
          decode(ver, bp);
          decode(timeout, bp);
          decode(bl, bp);
        } catch (const buffer::error &e) {
          timeout = 0;
        }
        tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
        if (!timeout)
          timeout = cct->_conf->osd_default_notify_timeout;

        notify_info_t n;
        n.timeout = timeout;
        n.notify_id = osd->get_next_id(get_osdmap_epoch());
        n.cookie = op.notify.cookie;
        n.bl = bl;
        ctx->notifies.push_back(n);

        // return our unique notify id to the client
        encode(n.notify_id, osd_op.outdata);
      }
      break;
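
    // Illustration (hedged, not upstream code): the notify indata decoded
    // above is (u32 ver [obsolete], u32 timeout, bufferlist payload). A
    // sketch of how the sender builds it (normally done inside librados):
    //
    //   bufferlist in;
    //   encode((uint32_t)1, in);    // obsolete version field
    //   encode((uint32_t)30, in);   // timeout in seconds; 0 => pool default
    //   encode(payload_bl, in);     // opaque payload delivered to watchers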

    case CEPH_OSD_OP_NOTIFY_ACK:
      ++ctx->num_read;
      {
        try {
          uint64_t notify_id = 0;
          uint64_t watch_cookie = 0;
          decode(notify_id, bp);
          decode(watch_cookie, bp);
          bufferlist reply_bl;
          if (!bp.end()) {
            decode(reply_bl, bp);
          }
          tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
          OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
          ctx->notify_acks.push_back(ack);
        } catch (const buffer::error &e) {
          tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
          OpContext::NotifyAck ack(
            // op.watch.cookie is actually the notify_id for historical reasons
            op.watch.cookie
            );
          ctx->notify_acks.push_back(ack);
        }
      }
      break;

    case CEPH_OSD_OP_SETALLOCHINT:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
        maybe_create_new_object(ctx);
        oi.expected_object_size = op.alloc_hint.expected_object_size;
        oi.expected_write_size = op.alloc_hint.expected_write_size;
        oi.alloc_hint_flags = op.alloc_hint.flags;
        t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
                          op.alloc_hint.expected_write_size,
                          op.alloc_hint.flags);
      }
      break;


      // --- WRITES ---

      // -- object data --

    case CEPH_OSD_OP_WRITE:
      ++ctx->num_write;
      result = 0;
      { // write
        __u32 seq = oi.truncate_seq;
        tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
        if (op.extent.length != osd_op.indata.length()) {
          result = -EINVAL;
          break;
        }

        if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
          op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;

        if (pool.info.requires_aligned_append() &&
            (op.extent.offset % pool.info.required_alignment() != 0)) {
          result = -EOPNOTSUPP;
          break;
        }

        if (!obs.exists) {
          if (pool.info.requires_aligned_append() && op.extent.offset) {
            result = -EOPNOTSUPP;
            break;
          }
        } else if (op.extent.offset != oi.size &&
                   pool.info.requires_aligned_append()) {
          result = -EOPNOTSUPP;
          break;
        }

        if (seq && (seq > op.extent.truncate_seq) &&
            (op.extent.offset + op.extent.length > oi.size)) {
          // old write, arrived after trimtrunc
          op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
          dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
                   << ", adjusting write length to " << op.extent.length << dendl;
          bufferlist t;
          t.substr_of(osd_op.indata, 0, op.extent.length);
          osd_op.indata.swap(t);
        }
        if (op.extent.truncate_seq > seq) {
          // write arrives before trimtrunc
          if (obs.exists && !oi.is_whiteout()) {
            dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
                     << ", truncating to " << op.extent.truncate_size << dendl;
            t->truncate(soid, op.extent.truncate_size);
            oi.truncate_seq = op.extent.truncate_seq;
            oi.truncate_size = op.extent.truncate_size;
            if (oi.size > op.extent.truncate_size) {
              interval_set<uint64_t> trim;
              trim.insert(op.extent.truncate_size,
                          oi.size - op.extent.truncate_size);
              ctx->modified_ranges.union_of(trim);
              ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
            }
            if (op.extent.truncate_size != oi.size) {
              truncate_update_size_and_usage(ctx->delta_stats,
                                             oi,
                                             op.extent.truncate_size);
            }
          } else {
            dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
                     << ", but object is new" << dendl;
            oi.truncate_seq = op.extent.truncate_seq;
            oi.truncate_size = op.extent.truncate_size;
          }
        }
        result = check_offset_and_length(
          op.extent.offset, op.extent.length,
          static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
        if (result < 0)
          break;

        maybe_create_new_object(ctx);

        if (op.extent.length == 0) {
          if (op.extent.offset > oi.size) {
            t->truncate(
              soid, op.extent.offset);
            truncate_update_size_and_usage(ctx->delta_stats, oi,
                                           op.extent.offset);
          } else {
            t->nop(soid);
          }
        } else {
          t->write(
            soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
        }

        if (op.extent.offset == 0 && op.extent.length >= oi.size
            && !skip_data_digest) {
          obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
        } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
          if (skip_data_digest) {
            obs.oi.clear_data_digest();
          } else {
            obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
          }
        } else {
          obs.oi.clear_data_digest();
        }
        write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
                                    op.extent.offset, op.extent.length);
        ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
        dout(10) << "clean_regions modified " << ctx->clean_regions << dendl;
      }
      break;

    case CEPH_OSD_OP_WRITEFULL:
      ++ctx->num_write;
      result = 0;
      { // write full object
        tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);

        if (op.extent.length != osd_op.indata.length()) {
          result = -EINVAL;
          break;
        }
        result = check_offset_and_length(
          0, op.extent.length,
          static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
        if (result < 0)
          break;

        if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
          op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;

        maybe_create_new_object(ctx);
        if (pool.info.is_erasure()) {
          t->truncate(soid, 0);
        } else if (obs.exists && op.extent.length < oi.size) {
          t->truncate(soid, op.extent.length);
        }
        if (op.extent.length) {
          t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
        }
        if (!skip_data_digest) {
          obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
        } else {
          obs.oi.clear_data_digest();
        }
        ctx->clean_regions.mark_data_region_dirty(0,
          std::max((uint64_t)op.extent.length, oi.size));
        write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
                                    0, op.extent.length, true);
      }
      break;

    case CEPH_OSD_OP_WRITESAME:
      ++ctx->num_write;
      tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
      result = do_writesame(ctx, osd_op);
      break;

    case CEPH_OSD_OP_ROLLBACK:
      ++ctx->num_write;
      tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
      result = _rollback_to(ctx, op);
      break;

    case CEPH_OSD_OP_ZERO:
      tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
      if (pool.info.requires_aligned_append()) {
        result = -EOPNOTSUPP;
        break;
      }
      ++ctx->num_write;
      { // zero
        result = check_offset_and_length(
          op.extent.offset, op.extent.length,
          static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
        if (result < 0)
          break;

        ceph_assert(op.extent.length);
        if (obs.exists && !oi.is_whiteout()) {
          t->zero(soid, op.extent.offset, op.extent.length);
          interval_set<uint64_t> ch;
          ch.insert(op.extent.offset, op.extent.length);
          ctx->modified_ranges.union_of(ch);
          ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
          ctx->delta_stats.num_wr++;
          oi.clear_data_digest();
        } else {
          // no-op
        }
      }
      break;
    case CEPH_OSD_OP_CREATE:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
        if (obs.exists && !oi.is_whiteout() &&
            (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
          result = -EEXIST; /* this is an exclusive create */
        } else {
          if (osd_op.indata.length()) {
            auto p = osd_op.indata.cbegin();
            string category;
            try {
              decode(category, p);
            }
            catch (buffer::error& e) {
              result = -EINVAL;
              goto fail;
            }
            // category is no longer implemented.
          }
          maybe_create_new_object(ctx);
          t->nop(soid);
        }
      }
      break;

    case CEPH_OSD_OP_TRIMTRUNC:
      op.extent.offset = op.extent.truncate_size;
      // falling through

    case CEPH_OSD_OP_TRUNCATE:
      tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
      if (pool.info.requires_aligned_append()) {
        result = -EOPNOTSUPP;
        break;
      }
      ++ctx->num_write;
      result = 0;
      {
        // truncate
        if (!obs.exists || oi.is_whiteout()) {
          dout(10) << " object dne, truncate is a no-op" << dendl;
          break;
        }

        result = check_offset_and_length(
          op.extent.offset, op.extent.length,
          static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
        if (result < 0)
          break;

        if (op.extent.truncate_seq) {
          ceph_assert(op.extent.offset == op.extent.truncate_size);
          if (op.extent.truncate_seq <= oi.truncate_seq) {
            dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
                     << ", no-op" << dendl;
            break; // old
          }
          dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
                   << ", truncating" << dendl;
          oi.truncate_seq = op.extent.truncate_seq;
          oi.truncate_size = op.extent.truncate_size;
        }

        maybe_create_new_object(ctx);
        t->truncate(soid, op.extent.offset);
        if (oi.size > op.extent.offset) {
          interval_set<uint64_t> trim;
          trim.insert(op.extent.offset, oi.size - op.extent.offset);
          ctx->modified_ranges.union_of(trim);
          ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
        } else if (oi.size < op.extent.offset) {
          ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
        }
        if (op.extent.offset != oi.size) {
          truncate_update_size_and_usage(ctx->delta_stats,
                                         oi,
                                         op.extent.offset);
        }
        ctx->delta_stats.num_wr++;
        // do not set exists, or we will break the DELETE -> TRUNCATE munging above.

        oi.clear_data_digest();
      }
      break;

    case CEPH_OSD_OP_DELETE:
      ++ctx->num_write;
      result = 0;
      tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
      {
        if (oi.has_manifest()) {
          if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && oi.manifest.is_redirect()) {
            ctx->register_on_commit(
              [oi, ctx, this](){
                object_locator_t target_oloc(oi.manifest.redirect_target);
                refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
                                  SnapContext(), false, NULL, 0);
              });
          } else if (oi.manifest.is_chunked()) {
            ctx->register_on_commit(
              [oi, ctx, this](){
                for (auto p : oi.manifest.chunk_map) {
                  if (p.second.has_reference()) {
                    object_locator_t target_oloc(p.second.oid);
                    refcount_manifest(ctx->obc, target_oloc, p.second.oid,
                                      SnapContext(), false, NULL, p.first);
                  }
                }
              });
          }
        }
        result = _delete_oid(ctx, false, ctx->ignore_cache);
      }
      break;

    case CEPH_OSD_OP_WATCH:
      ++ctx->num_write;
      result = 0;
      {
        tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
                   op.watch.cookie, op.watch.op);
        if (!obs.exists) {
          result = -ENOENT;
          break;
        }
        result = 0;
        uint64_t cookie = op.watch.cookie;
        entity_name_t entity = ctx->reqid.name;
        ObjectContextRef obc = ctx->obc;

        dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
                 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
                 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
        dout(10) << "watch: oi.user_version=" << oi.user_version << dendl;
        dout(10) << "watch: peer_addr="
                 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;

        uint32_t timeout = cct->_conf->osd_client_watch_timeout;
        if (op.watch.timeout != 0) {
          timeout = op.watch.timeout;
        }

        watch_info_t w(cookie, timeout,
                       ctx->op->get_req()->get_connection()->get_peer_addr());
        if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
            op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
          if (oi.watchers.count(make_pair(cookie, entity))) {
            dout(10) << " found existing watch " << w << " by " << entity << dendl;
          } else {
            dout(10) << " registered new watch " << w << " by " << entity << dendl;
            oi.watchers[make_pair(cookie, entity)] = w;
            t->nop(soid); // make sure we update the object_info on disk!
          }
          bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
          ctx->watch_connects.push_back(make_pair(w, will_ping));
        } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
          if (!oi.watchers.count(make_pair(cookie, entity))) {
            result = -ENOTCONN;
            break;
          }
          dout(10) << " found existing watch " << w << " by " << entity << dendl;
          ctx->watch_connects.push_back(make_pair(w, true));
        } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
          /* Note: WATCH with PING doesn't cause may_write() to return true,
           * so if there is nothing else in the transaction, this is going
           * to run do_osd_op_effects, but not write out a log entry */
          if (!oi.watchers.count(make_pair(cookie, entity))) {
            result = -ENOTCONN;
            break;
          }
          map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
            obc->watchers.find(make_pair(cookie, entity));
          if (p == obc->watchers.end() ||
              !p->second->is_connected()) {
            // client needs to reconnect
            result = -ETIMEDOUT;
            break;
          }
          dout(10) << " found existing watch " << w << " by " << entity << dendl;
          p->second->got_ping(ceph_clock_now());
          result = 0;
        } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
          map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
            oi.watchers.find(make_pair(cookie, entity));
          if (oi_iter != oi.watchers.end()) {
            dout(10) << " removed watch " << oi_iter->second << " by "
                     << entity << dendl;
            oi.watchers.erase(oi_iter);
            t->nop(soid); // update oi on disk
            ctx->watch_disconnects.push_back(
              watch_disconnect_t(cookie, entity, false));
          } else {
            dout(10) << " can't remove: no watch by " << entity << dendl;
          }
        }
      }
      break;
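
    // Illustration (hedged, not upstream code): the four watch sub-ops above
    // form a small state machine. WATCH/LEGACY_WATCH register (or re-arm) a
    // watch; RECONNECT and PING require the (cookie, entity) pair to already
    // exist (-ENOTCONN otherwise); PING additionally demands a live
    // connection (-ETIMEDOUT tells the client to reconnect); UNWATCH removes
    // the entry. A client-side sketch (librados API shape is an assumption):
    //
    //   uint64_t cookie;
    //   ioctx.watch2("obj", &cookie, &my_watch_ctx); // CEPH_OSD_WATCH_OP_WATCH
    //   ...                                          // librados pings internally
    //   ioctx.unwatch2(cookie);                      // CEPH_OSD_WATCH_OP_UNWATCH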

    case CEPH_OSD_OP_CACHE_PIN:
      tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
      if ((!pool.info.is_tier() ||
           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
        result = -EINVAL;
        dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
        break;
      }
      ++ctx->num_write;
      result = 0;
      {
        if (!obs.exists || oi.is_whiteout()) {
          result = -ENOENT;
          break;
        }

        if (!oi.is_cache_pinned()) {
          oi.set_flag(object_info_t::FLAG_CACHE_PIN);
          ctx->modify = true;
          ctx->delta_stats.num_objects_pinned++;
          ctx->delta_stats.num_wr++;
        }
      }
      break;

    case CEPH_OSD_OP_CACHE_UNPIN:
      tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
      if ((!pool.info.is_tier() ||
           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
        result = -EINVAL;
        dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
        break;
      }
      ++ctx->num_write;
      result = 0;
      {
        if (!obs.exists || oi.is_whiteout()) {
          result = -ENOENT;
          break;
        }

        if (oi.is_cache_pinned()) {
          oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
          ctx->modify = true;
          ctx->delta_stats.num_objects_pinned--;
          ctx->delta_stats.num_wr++;
        }
      }
      break;
6786
31f18b77
FG
6787 case CEPH_OSD_OP_SET_REDIRECT:
6788 ++ctx->num_write;
9f95a23c 6789 result = 0;
31f18b77
FG
6790 {
6791 if (pool.info.is_tier()) {
6792 result = -EINVAL;
6793 break;
6794 }
6795 if (!obs.exists) {
6796 result = -ENOENT;
6797 break;
6798 }
9f95a23c 6799 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
31f18b77
FG
6800 result = -EOPNOTSUPP;
6801 break;
6802 }
6803
6804 object_t target_name;
6805 object_locator_t target_oloc;
6806 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6807 version_t target_version = op.copy_from.src_version;
6808 try {
11fdf7f2
TL
6809 decode(target_name, bp);
6810 decode(target_oloc, bp);
6811 }
6812 catch (buffer::error& e) {
6813 result = -EINVAL;
6814 goto fail;
6815 }
6816 pg_t raw_pg;
6817 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6818 hobject_t target(target_name, target_oloc.key, target_snapid,
6819 raw_pg.ps(), raw_pg.pool(),
6820 target_oloc.nspace);
6821 if (target == soid) {
6822 dout(20) << " set-redirect self is invalid" << dendl;
6823 result = -EINVAL;
6824 break;
6825 }
6826
6827 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6828 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6829 if (has_reference) {
6830 result = -EINVAL;
6831 dout(5) << " the object is already a manifest " << dendl;
6832 break;
6833 }
6834 if (op_finisher == nullptr && need_reference) {
6835 // start
6836 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6837 new SetManifestFinisher(osd_op));
9f95a23c 6838 RefCountCallback *fin = new RefCountCallback(ctx, osd_op);
6839 refcount_manifest(ctx->obc, target_oloc, target, SnapContext(),
6840 true, fin, 0);
6841 result = -EINPROGRESS;
6842 } else {
6843 // finish
6844 if (op_finisher) {
6845 result = op_finisher->execute();
6846 ceph_assert(result == 0);
6847 }
6848
6849 if (!oi.has_manifest() && !oi.manifest.is_redirect())
6850 ctx->delta_stats.num_objects_manifest++;
6851
6852 oi.set_flag(object_info_t::FLAG_MANIFEST);
6853 oi.manifest.redirect_target = target;
6854 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6855 t->truncate(soid, 0);
9f95a23c 6856 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
6857 if (oi.is_omap() && pool.info.supports_omap()) {
6858 t->omap_clear(soid);
6859 obs.oi.clear_omap_digest();
6860 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9f95a23c 6861 ctx->clean_regions.mark_omap_dirty();
11fdf7f2 6862 }
6863 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6864 0, oi.size, false);
6865 ctx->delta_stats.num_bytes -= oi.size;
6866 oi.size = 0;
6867 oi.new_object();
6868 oi.user_version = target_version;
6869 ctx->user_at_version = target_version;
6870 /* rm_attrs */
6871 map<string,bufferlist> rmattrs;
6872 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
6873 if (result < 0) {
eafe8130 6874 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
6875 return result;
6876 }
6877 map<string, bufferlist>::iterator iter;
6878 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6879 const string& name = iter->first;
6880 t->rmattr(soid, name);
6881 }
6882 if (!has_reference && need_reference) {
6883 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6884 }
6885 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6886 if (op_finisher) {
6887 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6888 }
6889 }
6890 }
6891
6892 break;
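// After a successful set-redirect the local object becomes a stub: truncated
// to zero, omap and xattrs cleared, and oi.manifest pointing at
// redirect_target. Illustrative client-side sketch (assumes
// librados::ObjectWriteOperation::set_redirect(), not part of this file):
//
//   librados::ObjectWriteOperation op;
//   op.set_redirect("tgt_obj", tgt_ioctx, tgt_version);
//   src_ioctx.operate("src_obj", &op);  // src_obj now redirects to tgt_obj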
6893
6894 case CEPH_OSD_OP_SET_CHUNK:
6895 ++ctx->num_write;
9f95a23c 6896 result = 0;
6897 {
6898 if (pool.info.is_tier()) {
6899 result = -EINVAL;
6900 break;
6901 }
6902 if (!obs.exists) {
6903 result = -ENOENT;
6904 break;
6905 }
9f95a23c 6906 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
6907 result = -EOPNOTSUPP;
6908 break;
6909 }
6910
6911 object_locator_t tgt_oloc;
6912 uint64_t src_offset, src_length, tgt_offset;
6913 object_t tgt_name;
6914 try {
6915 decode(src_offset, bp);
6916 decode(src_length, bp);
6917 decode(tgt_oloc, bp);
6918 decode(tgt_name, bp);
6919 decode(tgt_offset, bp);
6920 }
6921 catch (buffer::error& e) {
6922 result = -EINVAL;
6923 goto fail;
6924 }
6925
6926 if (!src_length) {
6927 result = -EINVAL;
6928 goto fail;
6929 }
6930
6931 for (auto &p : oi.manifest.chunk_map) {
6932 if ((p.first <= src_offset && p.first + p.second.length > src_offset) ||
6933 (p.first > src_offset && p.first <= src_offset + src_length)) {
6934 dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length
6935 << " chunk_info: " << p << dendl;
6936 result = -EOPNOTSUPP;
6937 goto fail;
6938 }
6939 }
6940
6941 if (!oi.manifest.is_chunked()) {
6942 oi.manifest.clear();
6943 }
6944
6945 pg_t raw_pg;
6946 chunk_info_t chunk_info;
6947 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
6948 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
6949 raw_pg.ps(), raw_pg.pool(),
6950 tgt_oloc.nspace);
6951 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6952 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
6953 (oi.manifest.chunk_map[src_offset].flags & chunk_info_t::FLAG_HAS_REFERENCE);
6954 if (has_reference) {
6955 result = -EINVAL;
6956 dout(5) << " the object is already a manifest " << dendl;
6957 break;
6958 }
6959 if (op_finisher == nullptr && need_reference) {
6960 // start
6961 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6962 new SetManifestFinisher(osd_op));
9f95a23c 6963 RefCountCallback *fin = new RefCountCallback(ctx, osd_op);
6964 refcount_manifest(ctx->obc, tgt_oloc, target, SnapContext(),
6965 true, fin, src_offset);
6966 result = -EINPROGRESS;
6967 } else {
6968 if (op_finisher) {
6969 result = op_finisher->execute();
6970 ceph_assert(result == 0);
6971 }
6972
6973 chunk_info_t chunk_info;
6974 chunk_info.set_flag(chunk_info_t::FLAG_MISSING);
6975 chunk_info.oid = target;
6976 chunk_info.offset = tgt_offset;
6977 chunk_info.length= src_length;
6978 oi.manifest.chunk_map[src_offset] = chunk_info;
6979 if (!oi.has_manifest() && !oi.manifest.is_chunked())
6980 ctx->delta_stats.num_objects_manifest++;
6981 oi.set_flag(object_info_t::FLAG_MANIFEST);
6982 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
6983 if (!has_reference && need_reference) {
6984 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
6985 }
6986 if (need_reference && pool.info.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE) {
6987 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
6988 }
6989 ctx->modify = true;
6990
6991 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
6992 << " chunk_info: " << chunk_info << dendl;
6993 if (op_finisher) {
6994 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6995 }
6996 }
6997 }
6998
6999 break;
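// The manifest entry built above maps a source extent onto a backing object:
//
//   chunk_map[src_offset] = { oid = target, offset = tgt_offset,
//                             length = src_length }
//
// i.e. bytes [src_offset, src_offset + src_length) of this object live at
// target@tgt_offset. A fresh chunk starts out FLAG_MISSING until its data is
// actually fetched (for example by tier-promote), and the loop above rejects
// chunks that would overlap an existing entry.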
7000
7001 case CEPH_OSD_OP_TIER_PROMOTE:
7002 ++ctx->num_write;
9f95a23c 7003 result = 0;
7004 {
7005 if (pool.info.is_tier()) {
7006 result = -EINVAL;
7007 break;
7008 }
7009 if (!obs.exists) {
7010 result = -ENOENT;
7011 break;
7012 }
9f95a23c 7013 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7014 result = -EOPNOTSUPP;
7015 break;
7016 }
7017 if (!obs.oi.has_manifest()) {
7018 result = 0;
7019 break;
7020 }
7021
7022 if (op_finisher == nullptr) {
7023 PromoteManifestCallback *cb;
7024 object_locator_t my_oloc;
7025 hobject_t src_hoid;
7026
7027 if (obs.oi.manifest.is_chunked()) {
7028 src_hoid = obs.oi.soid;
7029 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7030 } else if (obs.oi.manifest.is_redirect()) {
7031 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7032 my_oloc = src_oloc;
7033 src_hoid = obs.oi.manifest.redirect_target;
7034 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7035 } else {
7036 ceph_abort_msg("unrecognized manifest type");
7037 }
7038 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7039 new PromoteFinisher(cb));
7040 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7041 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7042 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7043 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7044 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7045 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7046 obs.oi.soid.snap == CEPH_NOSNAP,
7047 src_fadvise_flags, 0);
7048
7049 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7050 result = -EINPROGRESS;
7051 } else {
7052 result = op_finisher->execute();
7053 ceph_assert(result == 0);
7054 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7055 }
7056 }
7057
7058 break;
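// Promotion is two-phase like the other manifest ops: the first pass starts
// an asynchronous copy (sourced from redirect_target for a redirect, or
// driven by the object's own chunk_map for a chunked manifest) and parks the
// op with -EINPROGRESS; the re-queued op then completes through the
// PromoteFinisher installed above.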
7059
7060 case CEPH_OSD_OP_TIER_FLUSH:
7061 ++ctx->num_write;
7062 result = 0;
7063 {
7064 if (pool.info.is_tier()) {
7065 result = -EINVAL;
7066 break;
7067 }
7068 if (!obs.exists) {
7069 result = -ENOENT;
7070 break;
7071 }
7072 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7073 result = -EOPNOTSUPP;
7074 break;
7075 }
7076 if (!obs.oi.has_manifest()) {
7077 result = 0;
7078 break;
7079 }
7080
7081 hobject_t missing;
7082 bool is_dirty = false;
7083 for (auto& p : ctx->obc->obs.oi.manifest.chunk_map) {
7084 if (p.second.is_dirty()) {
7085 is_dirty = true;
7086 break;
7087 }
7088 }
7089
7090 if (is_dirty) {
7091 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7092 if (result == -EINPROGRESS)
7093 result = -EAGAIN;
7094 } else {
7095 result = 0;
7096 }
7097 }
7098
7099 break;
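// Flush only has work to do when at least one chunk is FLAG_DIRTY; it then
// reuses the cache-tier start_flush() machinery, translating -EINPROGRESS
// into -EAGAIN so the client keeps retrying until the writeback has
// committed.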
7100
7101 case CEPH_OSD_OP_UNSET_MANIFEST:
7102 ++ctx->num_write;
9f95a23c 7103 result = 0;
7104 {
7105 if (pool.info.is_tier()) {
7106 result = -EINVAL;
7107 break;
31f18b77 7108 }
7109 if (!obs.exists) {
7110 result = -ENOENT;
7111 break;
7112 }
7113 if (!oi.has_manifest()) {
7114 result = -EOPNOTSUPP;
7115 break;
31f18b77 7116 }
9f95a23c 7117 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7118 result = -EOPNOTSUPP;
7119 break;
31f18b77 7120 }
7121
7122 if (oi.manifest.is_redirect()) {
7123 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
7124 ctx->register_on_commit(
7125 [oi, ctx, this](){
7126 object_locator_t target_oloc(oi.manifest.redirect_target);
7127 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
7128 SnapContext(), false, NULL, 0);
7129 });
7130 }
7131 } else if (oi.manifest.is_chunked()) {
7132 ctx->register_on_commit(
7133 [oi, ctx, this](){
7134 for (auto p : oi.manifest.chunk_map) {
7135 if (p.second.flags & chunk_info_t::FLAG_HAS_REFERENCE) {
7136 object_locator_t target_oloc(p.second.oid);
7137 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
7138 SnapContext(), false, NULL, p.first);
7139 }
7140 }
7141 });
7142 } else {
7143 ceph_abort_msg("unrecognized manifest type");
31f18b77 7144 }
7145
7146 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7147 oi.manifest = object_manifest_t();
7148 ctx->delta_stats.num_objects_manifest--;
7149 ctx->delta_stats.num_wr++;
7150 ctx->modify = true;
7151 }
7152
7153 break;
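// Dropping the manifest must also drop any references taken on the backing
// objects, but that is deferred to commit time via register_on_commit() so
// an aborted transaction cannot leak a decrement; the `false` passed to
// refcount_manifest() here (versus the `true` used in the set-redirect and
// set-chunk paths) requests the put side of the refcount.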
7154
7155 // -- object attrs --
7156
7157 case CEPH_OSD_OP_SETXATTR:
7158 ++ctx->num_write;
9f95a23c 7159 result = 0;
7160 {
7161 if (cct->_conf->osd_max_attr_size > 0 &&
7162 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7163 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7164 result = -EFBIG;
7165 break;
7166 }
7167 unsigned max_name_len =
7168 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7169 cct->_conf->osd_max_attr_name_len);
7170 if (op.xattr.name_len > max_name_len) {
7171 result = -ENAMETOOLONG;
7172 break;
7173 }
7174 maybe_create_new_object(ctx);
7175 string aname;
7176 bp.copy(op.xattr.name_len, aname);
7177 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7178 string name = "_" + aname;
7179 bufferlist bl;
7180 bp.copy(op.xattr.value_len, bl);
7181 t->setattr(soid, name, bl);
7182 ctx->delta_stats.num_wr++;
7183 }
7184 break;
7185
7186 case CEPH_OSD_OP_RMXATTR:
7187 ++ctx->num_write;
9f95a23c 7188 result = 0;
7189 {
7190 string aname;
7191 bp.copy(op.xattr.name_len, aname);
7192 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7193 if (!obs.exists || oi.is_whiteout()) {
7194 result = -ENOENT;
7195 break;
7196 }
7197 string name = "_" + aname;
7198 t->rmattr(soid, name);
7199 ctx->delta_stats.num_wr++;
7200 }
7201 break;
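// User xattrs are namespaced with a leading '_' (see the "_" + aname above)
// to keep them clear of internal attrs such as the object_info and snapset.
// Illustrative client-side sketch (librados C API, not part of this file):
//
//   rados_setxattr(io, "myobj", "foo", buf, len);  // stored as "_foo"
//   rados_rmxattr(io, "myobj", "foo");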
7202
7203
7204 // -- fancy writers --
7205 case CEPH_OSD_OP_APPEND:
7206 {
7207 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7208 // just do it inline; this works because we are happy to execute
7209 // fancy ops on replicas as well.
7210 vector<OSDOp> nops(1);
7211 OSDOp& newop = nops[0];
7212 newop.op.op = CEPH_OSD_OP_WRITE;
7213 newop.op.extent.offset = oi.size;
7214 newop.op.extent.length = op.extent.length;
7215 newop.op.extent.truncate_seq = oi.truncate_seq;
7216 newop.indata = osd_op.indata;
7217 result = do_osd_ops(ctx, nops);
7218 osd_op.outdata.claim(newop.outdata);
7219 }
7220 break;
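// APPEND is thus just a WRITE at the current end of the object,
// re-dispatched through do_osd_ops(); the client-side equivalent
// (librados C API, illustrative) is simply:
//
//   rados_append(io, "myobj", buf, len);  // lands at offset oi.size here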
7221
7222 case CEPH_OSD_OP_STARTSYNC:
9f95a23c 7223 result = 0;
7224 t->nop(soid);
7225 break;
7226
7227 // -- trivial map --
7228 case CEPH_OSD_OP_TMAPGET:
7229 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7230 if (pool.info.is_erasure()) {
7231 result = -EOPNOTSUPP;
7232 break;
7233 }
7234 {
7235 vector<OSDOp> nops(1);
7236 OSDOp& newop = nops[0];
7237 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7238 newop.op.extent.offset = 0;
7239 newop.op.extent.length = 0;
9f95a23c 7240 result = do_osd_ops(ctx, nops);
7241 osd_op.outdata.claim(newop.outdata);
7242 }
7243 break;
7244
7245 case CEPH_OSD_OP_TMAPPUT:
7246 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7247 if (pool.info.is_erasure()) {
7248 result = -EOPNOTSUPP;
7249 break;
7250 }
7251 {
7252 //_dout_lock.Lock();
7253 //osd_op.data.hexdump(*_dout);
7254 //_dout_lock.Unlock();
7255
7256 // verify sort order
7257 bool unsorted = false;
7258 if (true) {
7259 bufferlist header;
11fdf7f2 7260 decode(header, bp);
7c673cae 7261 uint32_t n;
11fdf7f2 7262 decode(n, bp);
7263 string last_key;
7264 while (n--) {
7265 string key;
11fdf7f2 7266 decode(key, bp);
7267 dout(10) << "tmapput key " << key << dendl;
7268 bufferlist val;
11fdf7f2 7269 decode(val, bp);
7270 if (key < last_key) {
7271 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7272 unsorted = true;
7273 break;
7274 }
7275 last_key = key;
7276 }
7277 }
7278
7279 // write it
7280 vector<OSDOp> nops(1);
7281 OSDOp& newop = nops[0];
7282 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7283 newop.op.extent.offset = 0;
7284 newop.op.extent.length = osd_op.indata.length();
7285 newop.indata = osd_op.indata;
7286
7287 if (unsorted) {
7288 bp = osd_op.indata.begin();
7289 bufferlist header;
7290 map<string, bufferlist> m;
7291 decode(header, bp);
7292 decode(m, bp);
7293 ceph_assert(bp.end());
7c673cae 7294 bufferlist newbl;
7295 encode(header, newbl);
7296 encode(m, newbl);
7297 newop.indata = newbl;
7298 }
7299 result = do_osd_ops(ctx, nops);
11fdf7f2 7300 ceph_assert(result == 0);
7301 }
7302 break;
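// TMAP ("trivial map") wire format, as verified above:
//
//   bufferlist header;
//   uint32_t   n;                          // number of entries
//   n x { string key; bufferlist value; }  // keys in sorted order
//
// An unsorted payload is decoded into a std::map and re-encoded (the map
// iterates in key order), then stored with an ordinary WRITEFULL.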
7303
7304 case CEPH_OSD_OP_TMAPUP:
7305 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7306 if (pool.info.is_erasure()) {
7307 result = -EOPNOTSUPP;
7308 break;
7309 }
7310 ++ctx->num_write;
7311 result = do_tmapup(ctx, bp, osd_op);
7312 break;
7313
7314 case CEPH_OSD_OP_TMAP2OMAP:
7315 ++ctx->num_write;
7316 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7317 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7318 break;
7319
7320 // OMAP Read ops
7321 case CEPH_OSD_OP_OMAPGETKEYS:
7322 ++ctx->num_read;
7323 {
7324 string start_after;
7325 uint64_t max_return;
7326 try {
7327 decode(start_after, bp);
7328 decode(max_return, bp);
7329 }
7330 catch (buffer::error& e) {
7331 result = -EINVAL;
7332 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7333 goto fail;
7334 }
7335 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7336 max_return = cct->_conf->osd_max_omap_entries_per_request;
7337 }
7338 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7339
7340 bufferlist bl;
7341 uint32_t num = 0;
7342 bool truncated = false;
7343 if (oi.is_omap()) {
7344 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
11fdf7f2 7345 ch, ghobject_t(soid)
7c673cae 7346 );
11fdf7f2 7347 ceph_assert(iter);
7c673cae 7348 iter->upper_bound(start_after);
11fdf7f2 7349 for (num = 0; iter->valid(); ++num, iter->next()) {
7350 if (num >= max_return ||
7351 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7352 truncated = true;
7353 break;
7354 }
11fdf7f2 7355 encode(iter->key(), bl);
7356 }
7357 } // else return empty out_set
11fdf7f2 7358 encode(num, osd_op.outdata);
7c673cae 7359 osd_op.outdata.claim_append(bl);
7360 encode(truncated, osd_op.outdata);
7361 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7362 ctx->delta_stats.num_rd++;
7363 }
7364 break;
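// The reply encoded above is: uint32_t num, then num keys, then a bool
// `truncated`; a truncated listing is resumed by passing the last key back
// as start_after. Minimal C++ client sketch (assumes
// librados::ObjectReadOperation::omap_get_keys2(), not part of this file):
//
//   std::string after;
//   bool more = true;
//   while (more) {
//     std::set<std::string> keys;
//     librados::ObjectReadOperation op;
//     op.omap_get_keys2(after, 1000, &keys, &more, nullptr);
//     ioctx.operate("myobj", &op, nullptr);
//     if (!keys.empty())
//       after = *keys.rbegin();  // resume point for the next round
//     // ... consume keys ...
//   }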
7365
7366 case CEPH_OSD_OP_OMAPGETVALS:
7367 ++ctx->num_read;
7368 {
7369 string start_after;
7370 uint64_t max_return;
7371 string filter_prefix;
7372 try {
7373 decode(start_after, bp);
7374 decode(max_return, bp);
7375 decode(filter_prefix, bp);
7376 }
7377 catch (buffer::error& e) {
7378 result = -EINVAL;
7379 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7380 goto fail;
7381 }
7382 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7383 max_return = cct->_conf->osd_max_omap_entries_per_request;
7384 }
7385 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7386
7387 uint32_t num = 0;
7388 bool truncated = false;
7389 bufferlist bl;
7390 if (oi.is_omap()) {
7391 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
11fdf7f2 7392 ch, ghobject_t(soid)
7393 );
7394 if (!iter) {
7395 result = -ENOENT;
7396 goto fail;
7397 }
7398 iter->upper_bound(start_after);
7399 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7400 for (num = 0;
7401 iter->valid() &&
7402 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
11fdf7f2 7403 ++num, iter->next()) {
7404 dout(20) << "Found key " << iter->key() << dendl;
7405 if (num >= max_return ||
7406 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7407 truncated = true;
7408 break;
7409 }
7410 encode(iter->key(), bl);
7411 encode(iter->value(), bl);
7412 }
7413 } // else return empty out_set
11fdf7f2 7414 encode(num, osd_op.outdata);
7c673cae 7415 osd_op.outdata.claim_append(bl);
7416 encode(truncated, osd_op.outdata);
7417 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7418 ctx->delta_stats.num_rd++;
7419 }
7420 break;
7421
7422 case CEPH_OSD_OP_OMAPGETHEADER:
7423 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7424 if (!oi.is_omap()) {
7425 // return empty header
7426 break;
7427 }
7428 ++ctx->num_read;
7429 {
7430 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
11fdf7f2 7431 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7432 ctx->delta_stats.num_rd++;
7433 }
7434 break;
7435
7436 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7437 ++ctx->num_read;
7438 {
7439 set<string> keys_to_get;
7440 try {
11fdf7f2 7441 decode(keys_to_get, bp);
7442 }
7443 catch (buffer::error& e) {
7444 result = -EINVAL;
7445 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7446 goto fail;
7447 }
7448 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7449 map<string, bufferlist> out;
7450 if (oi.is_omap()) {
7451 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7452 } // else return empty omap entries
7453 encode(out, osd_op.outdata);
7454 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7455 ctx->delta_stats.num_rd++;
7456 }
7457 break;
7458
7459 case CEPH_OSD_OP_OMAP_CMP:
7460 ++ctx->num_read;
7461 {
7462 if (!obs.exists || oi.is_whiteout()) {
7463 result = -ENOENT;
7464 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7465 break;
7466 }
7467 map<string, pair<bufferlist, int> > assertions;
7468 try {
11fdf7f2 7469 decode(assertions, bp);
7470 }
7471 catch (buffer::error& e) {
7472 result = -EINVAL;
7473 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7474 goto fail;
7475 }
7476 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7477
7478 map<string, bufferlist> out;
7479
7480 if (oi.is_omap()) {
7481 set<string> to_get;
7482 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7483 i != assertions.end();
7484 ++i)
7485 to_get.insert(i->first);
7486 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7487 to_get, &out);
7488 if (r < 0) {
7489 result = r;
7490 break;
7491 }
7492 } // else leave out empty
7493
7494 // Should set num_rd_kb based on the encoded length of the map
7495 ctx->delta_stats.num_rd++;
7496
7497 int r = 0;
7498 bufferlist empty;
7499 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7500 i != assertions.end();
7501 ++i) {
7502 auto out_entry = out.find(i->first);
7503 bufferlist &bl = (out_entry != out.end()) ?
7504 out_entry->second : empty;
7505 switch (i->second.second) {
7506 case CEPH_OSD_CMPXATTR_OP_EQ:
7507 if (!(bl == i->second.first)) {
7508 r = -ECANCELED;
7509 }
7510 break;
7511 case CEPH_OSD_CMPXATTR_OP_LT:
7512 if (!(bl < i->second.first)) {
7513 r = -ECANCELED;
7514 }
7515 break;
7516 case CEPH_OSD_CMPXATTR_OP_GT:
7517 if (!(bl > i->second.first)) {
7518 r = -ECANCELED;
7519 }
7520 break;
7521 default:
7522 r = -EINVAL;
7523 break;
7524 }
7525 if (r < 0)
7526 break;
7527 }
7528 if (r < 0) {
7529 result = r;
7530 }
7531 }
7532 break;
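// OMAP_CMP is an all-or-nothing guard: every assertion must hold or the op
// fails with -ECANCELED, and a missing key compares as an empty bufferlist.
// Clients typically stack it in front of writes in the same
// ObjectWriteOperation to get compare-and-swap semantics on omap keys.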
7533
7534 // OMAP Write ops
7535 case CEPH_OSD_OP_OMAPSETVALS:
7536 if (!pool.info.supports_omap()) {
7537 result = -EOPNOTSUPP;
7538 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7539 break;
7540 }
7541 ++ctx->num_write;
9f95a23c 7542 result = 0;
7543 {
7544 maybe_create_new_object(ctx);
7545 bufferlist to_set_bl;
7546 try {
7547 decode_str_str_map_to_bl(bp, &to_set_bl);
7548 }
7549 catch (buffer::error& e) {
7550 result = -EINVAL;
7551 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7552 goto fail;
7553 }
7554 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
11fdf7f2 7555 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7556 dout(20) << "setting vals: " << dendl;
7557 map<string,bufferlist> to_set;
7558 bufferlist::const_iterator pt = to_set_bl.begin();
7559 decode(to_set, pt);
7560 for (map<string, bufferlist>::iterator i = to_set.begin();
7561 i != to_set.end();
7562 ++i) {
7563 dout(20) << "\t" << i->first << dendl;
7564 }
7565 }
7566 t->omap_setkeys(soid, to_set_bl);
9f95a23c 7567 ctx->clean_regions.mark_omap_dirty();
7c673cae 7568 ctx->delta_stats.num_wr++;
11fdf7f2 7569 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7570 }
7571 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7572 obs.oi.clear_omap_digest();
7573 break;
7574
7575 case CEPH_OSD_OP_OMAPSETHEADER:
7576 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7577 if (!pool.info.supports_omap()) {
7578 result = -EOPNOTSUPP;
7579 break;
7580 }
7581 ++ctx->num_write;
9f95a23c 7582 result = 0;
7583 {
7584 maybe_create_new_object(ctx);
7585 t->omap_setheader(soid, osd_op.indata);
9f95a23c 7586 ctx->clean_regions.mark_omap_dirty();
7587 ctx->delta_stats.num_wr++;
7588 }
7589 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7590 obs.oi.clear_omap_digest();
7591 break;
7592
7593 case CEPH_OSD_OP_OMAPCLEAR:
7594 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7595 if (!pool.info.supports_omap()) {
7596 result = -EOPNOTSUPP;
7597 break;
7598 }
7599 ++ctx->num_write;
9f95a23c 7600 result = 0;
7601 {
7602 if (!obs.exists || oi.is_whiteout()) {
7603 result = -ENOENT;
7604 break;
7605 }
7606 if (oi.is_omap()) {
7607 t->omap_clear(soid);
9f95a23c 7608 ctx->clean_regions.mark_omap_dirty();
7609 ctx->delta_stats.num_wr++;
7610 obs.oi.clear_omap_digest();
7611 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7612 }
7613 }
7614 break;
7615
7616 case CEPH_OSD_OP_OMAPRMKEYS:
7617 if (!pool.info.supports_omap()) {
7618 result = -EOPNOTSUPP;
7619 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7620 break;
7621 }
7622 ++ctx->num_write;
9f95a23c 7623 result = 0;
7624 {
7625 if (!obs.exists || oi.is_whiteout()) {
7626 result = -ENOENT;
7627 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7628 break;
7629 }
7630 bufferlist to_rm_bl;
7631 try {
7632 decode_str_set_to_bl(bp, &to_rm_bl);
7633 }
7634 catch (buffer::error& e) {
7635 result = -EINVAL;
7636 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7637 goto fail;
7638 }
7639 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7640 t->omap_rmkeys(soid, to_rm_bl);
7641 ctx->clean_regions.mark_omap_dirty();
7642 ctx->delta_stats.num_wr++;
7643 }
7644 obs.oi.clear_omap_digest();
7645 break;
7646
7647 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7648 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7649 if (!pool.info.supports_omap()) {
7650 result = -EOPNOTSUPP;
7651 break;
7652 }
7653 ++ctx->num_write;
7654 result = 0;
7655 {
7656 if (!obs.exists || oi.is_whiteout()) {
7657 result = -ENOENT;
7658 break;
7659 }
7660 std::string key_begin, key_end;
7661 try {
7662 decode(key_begin, bp);
7663 decode(key_end, bp);
7664 } catch (buffer::error& e) {
7665 result = -EINVAL;
7666 goto fail;
7667 }
7668 t->omap_rmkeyrange(soid, key_begin, key_end);
7669 ctx->delta_stats.num_wr++;
7670 }
7671 obs.oi.clear_omap_digest();
7672 break;
7673
7674 case CEPH_OSD_OP_COPY_GET:
7675 ++ctx->num_read;
7676 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7677 soid.snap.val);
7678 if (op_finisher == nullptr) {
7679 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7680 } else {
7681 result = op_finisher->execute();
7682 }
7683 break;
7684
7685 case CEPH_OSD_OP_COPY_FROM:
9f95a23c 7686 case CEPH_OSD_OP_COPY_FROM2:
7c673cae 7687 ++ctx->num_write;
9f95a23c 7688 result = 0;
7689 {
7690 object_t src_name;
7691 object_locator_t src_oloc;
7692 uint32_t truncate_seq = 0;
7693 uint64_t truncate_size = 0;
7694 bool have_truncate = false;
7695 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7696 version_t src_version = op.copy_from.src_version;
7697
7698 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7699 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7700 dout(20) << "invalid copy-from2 flags 0x"
7701 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
7702 result = -EINVAL;
7703 break;
7704 }
7c673cae 7705 try {
7706 decode(src_name, bp);
7707 decode(src_oloc, bp);
7708 // check if client sent us truncate_seq and truncate_size
7709 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7710 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
7711 decode(truncate_seq, bp);
7712 decode(truncate_size, bp);
7713 have_truncate = true;
7714 }
7715 }
7716 catch (buffer::error& e) {
7717 result = -EINVAL;
7718 tracepoint(osd,
7719 do_osd_op_pre_copy_from,
7720 soid.oid.name.c_str(),
7721 soid.snap.val,
7722 "???",
7723 0,
7724 "???",
7725 "???",
7726 0,
7727 src_snapid,
7728 src_version);
7729 goto fail;
7730 }
7731 tracepoint(osd,
7732 do_osd_op_pre_copy_from,
7733 soid.oid.name.c_str(),
7734 soid.snap.val,
7735 src_name.name.c_str(),
7736 src_oloc.pool,
7737 src_oloc.key.c_str(),
7738 src_oloc.nspace.c_str(),
7739 src_oloc.hash,
7740 src_snapid,
7741 src_version);
c07f9fc5 7742 if (op_finisher == nullptr) {
7743 // start
7744 pg_t raw_pg;
7745 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7746 hobject_t src(src_name, src_oloc.key, src_snapid,
7747 raw_pg.ps(), raw_pg.pool(),
7748 src_oloc.nspace);
7749 if (src == soid) {
7750 dout(20) << " copy from self is invalid" << dendl;
7751 result = -EINVAL;
7752 break;
7753 }
c07f9fc5 7754 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
7755 if (have_truncate)
7756 cb->set_truncate(truncate_seq, truncate_size);
7757 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7758 new CopyFromFinisher(cb));
7759 start_copy(cb, ctx->obc, src, src_oloc, src_version,
7760 op.copy_from.flags,
7761 false,
7762 op.copy_from.src_fadvise_flags,
7763 op.flags);
7764 result = -EINPROGRESS;
7765 } else {
7766 // finish
c07f9fc5 7767 result = op_finisher->execute();
11fdf7f2 7768 ceph_assert(result == 0);
7769
7770 // COPY_FROM cannot be executed multiple times -- it must restart
7771 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7772 }
7773 }
7774 break;
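// COPY_FROM runs through do_osd_ops() twice: the first pass registers a
// CopyFromFinisher, kicks off start_copy(), and parks the op with
// -EINPROGRESS; once the copy callback fires the op is re-executed and the
// finisher applies the staged result exactly once (hence the erase of the
// op_finisher above).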
7775
7776 default:
7777 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7778 dout(1) << "unrecognized osd op " << op.op
7779 << " " << ceph_osd_op_name(op.op)
7780 << dendl;
7781 result = -EOPNOTSUPP;
7782 }
7783
7784 fail:
7785 osd_op.rval = result;
7786 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
7787 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7788 result != -EAGAIN && result != -EINPROGRESS)
7789 result = 0;
7790
7791 if (result < 0)
7792 break;
7793 }
7794 if (result < 0) {
7795 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7796 }
7797 return result;
7798}
7799
7800int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
7801{
7802 if (ctx->new_obs.oi.size == 0) {
7803 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
7804 return -ENODATA;
7805 }
7806 vector<OSDOp> nops(1);
7807 OSDOp &newop = nops[0];
7808 newop.op.op = CEPH_OSD_OP_TMAPGET;
7809 do_osd_ops(ctx, nops);
7810 try {
7811 bufferlist::const_iterator i = newop.outdata.begin();
7812 decode(*header, i);
7813 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
7814 } catch (...) {
7815 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
7816 << dendl;
7817 return -EINVAL;
7818 }
7819 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
7820 << dendl;
7821 return 0;
7822}
7823
7824int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7825 const SnapSet& ss)
7826{
7827 // verify that all clones have been evicted
7828 dout(20) << __func__ << " verifying clones are absent "
7829 << ss << dendl;
7830 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7831 p != ss.clones.end();
7832 ++p) {
7833 hobject_t clone_oid = soid;
7834 clone_oid.snap = *p;
7835 if (is_missing_object(clone_oid))
7836 return -EBUSY;
7837 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
7838 if (clone_obc && clone_obc->obs.exists) {
7839 dout(10) << __func__ << " cannot evict head before clone "
7840 << clone_oid << dendl;
7841 return -EBUSY;
7842 }
7843 if (copy_ops.count(clone_oid)) {
7844 dout(10) << __func__ << " cannot evict head, pending promote on clone "
7845 << clone_oid << dendl;
7846 return -EBUSY;
7847 }
7848 }
7849 return 0;
7850}
7851
7852inline int PrimaryLogPG::_delete_oid(
7853 OpContext *ctx,
7854 bool no_whiteout, // no whiteouts, no matter what.
7855 bool try_no_whiteout) // try not to whiteout
7856{
7857 SnapSet& snapset = ctx->new_snapset;
7858 ObjectState& obs = ctx->new_obs;
7859 object_info_t& oi = obs.oi;
7860 const hobject_t& soid = oi.soid;
7861 PGTransaction* t = ctx->op_t.get();
7862
7863 // cache: set whiteout on delete?
7864 bool whiteout = false;
7865 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
7866 && !no_whiteout
7867 && !try_no_whiteout) {
7868 whiteout = true;
7869 }
7870
7871 // in luminous or later, we can't delete the head if there are
7872 // clones. we trust the caller passing no_whiteout has already
7873 // verified they don't exist.
7874 if (!snapset.clones.empty() ||
7875 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
7876 if (no_whiteout) {
7877 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
7878 << dendl;
7879 } else {
7880 dout(20) << __func__ << " has or will have clones; will whiteout"
7881 << dendl;
7882 whiteout = true;
7c673cae 7883 }
7884 }
7885 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
7886 << " no_whiteout=" << (int)no_whiteout
7887 << " try_no_whiteout=" << (int)try_no_whiteout
7888 << dendl;
7889 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
7890 return -ENOENT;
7891
7892 t->remove(soid);
7893
7894 if (oi.size > 0) {
7895 interval_set<uint64_t> ch;
7896 ch.insert(0, oi.size);
7897 ctx->modified_ranges.union_of(ch);
9f95a23c 7898 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7899 }
7900
9f95a23c 7901 ctx->clean_regions.mark_omap_dirty();
7902 ctx->delta_stats.num_wr++;
7903 if (soid.is_snap()) {
11fdf7f2 7904 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
7905 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
7906 } else {
7907 ctx->delta_stats.num_bytes -= oi.size;
7908 }
7909 oi.size = 0;
7910 oi.new_object();
7911
7912 // disconnect all watchers
7913 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
7914 oi.watchers.begin();
7915 p != oi.watchers.end();
7916 ++p) {
7917 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
7918 ctx->watch_disconnects.push_back(
7919 watch_disconnect_t(p->first.first, p->first.second, true));
7920 }
7921 oi.watchers.clear();
7922
7923 if (whiteout) {
7924 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
7925 oi.set_flag(object_info_t::FLAG_WHITEOUT);
7926 ctx->delta_stats.num_whiteouts++;
7927 t->create(soid);
7928 osd->logger->inc(l_osd_tier_whiteout);
7929 return 0;
7930 }
7931
7932 // delete the head
7933 ctx->delta_stats.num_objects--;
7934 if (soid.is_snap())
7935 ctx->delta_stats.num_object_clones--;
7936 if (oi.is_whiteout()) {
7937 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
7938 ctx->delta_stats.num_whiteouts--;
7939 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
7940 }
7941 if (oi.is_cache_pinned()) {
7942 ctx->delta_stats.num_objects_pinned--;
7943 }
7944 if (oi.has_manifest()) {
7945 ctx->delta_stats.num_objects_manifest--;
7946 }
7947 obs.exists = false;
7948 return 0;
7949}
7950
7951int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7952{
7953 SnapSet& snapset = ctx->new_snapset;
7954 ObjectState& obs = ctx->new_obs;
7955 object_info_t& oi = obs.oi;
7956 const hobject_t& soid = oi.soid;
7957 PGTransaction* t = ctx->op_t.get();
7958 snapid_t snapid = (uint64_t)op.snap.snapid;
7959 hobject_t missing_oid;
7960
7961 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7962
7963 ObjectContextRef rollback_to;
11fdf7f2 7964
7965 int ret = find_object_context(
7966 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7967 soid.get_namespace()),
7968 &rollback_to, false, false, &missing_oid);
7969 if (ret == -EAGAIN) {
7970 /* clone must be missing */
11fdf7f2 7971 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
c07f9fc5 7972 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7973 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7974 block_write_on_degraded_snap(missing_oid, ctx->op);
7975 return ret;
7976 }
7977 {
7978 ObjectContextRef promote_obc;
7979 cache_result_t tier_mode_result;
7980 if (obs.exists && obs.oi.has_manifest()) {
7981 tier_mode_result =
7982 maybe_handle_manifest_detail(
7983 ctx->op,
7984 true,
7985 rollback_to);
7986 } else {
7987 tier_mode_result =
7988 maybe_handle_cache_detail(
7989 ctx->op,
7990 true,
7991 rollback_to,
7992 ret,
7993 missing_oid,
7994 true,
7995 false,
7996 &promote_obc);
7997 }
7998 switch (tier_mode_result) {
7999 case cache_result_t::NOOP:
8000 break;
8001 case cache_result_t::BLOCKED_PROMOTE:
11fdf7f2 8002 ceph_assert(promote_obc);
8003 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8004 return -EAGAIN;
8005 case cache_result_t::BLOCKED_FULL:
8006 block_write_on_full_cache(soid, ctx->op);
8007 return -EAGAIN;
b32b8144 8008 case cache_result_t::REPLIED_WITH_EAGAIN:
11fdf7f2 8009 ceph_abort_msg("this can't happen, no rollback on replica");
7c673cae 8010 default:
11fdf7f2 8011 ceph_abort_msg("must promote was set, other values are not valid");
8012 return -EAGAIN;
8013 }
8014 }
8015
8016 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8017 // there's no snapshot here, or there's no object.
8018 // if there's no snapshot, we delete the object; otherwise, do nothing.
8019 dout(20) << "_rollback_to deleting head on " << soid.oid
8020 << " because got ENOENT|whiteout on find_object_context" << dendl;
8021 if (ctx->obc->obs.oi.watchers.size()) {
8022 // Cannot delete an object with watchers
8023 ret = -EBUSY;
8024 } else {
8025 _delete_oid(ctx, false, false);
8026 ret = 0;
8027 }
8028 } else if (ret) {
8029 // ummm....huh? It *can't* return anything else at time of writing.
11fdf7f2 8030 ceph_abort_msg("unexpected error code in _rollback_to");
8031 } else { //we got our context, let's use it to do the rollback!
8032 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8033 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8034 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8035 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8036 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8037 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8038 ret = -EAGAIN;
8039 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8040 // rolling back to the head; we just need to clone it.
8041 ctx->modify = true;
8042 } else {
8043 /* 1) Delete current head
8044 * 2) Clone correct snapshot into head
8045 * 3) Calculate clone_overlaps by following overlaps
8046 * forward from rollback snapshot */
8047 dout(10) << "_rollback_to deleting " << soid.oid
8048 << " and rolling back to old snap" << dendl;
8049
8050 if (obs.exists) {
8051 t->remove(soid);
8052 }
8053 t->clone(soid, rollback_to_sobject);
8054 t->add_obc(rollback_to);
8055
8056 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8057 snapset.clone_overlap.lower_bound(snapid);
11fdf7f2 8058 ceph_assert(iter != snapset.clone_overlap.end());
7c673cae 8059 interval_set<uint64_t> overlaps = iter->second;
8060 for ( ;
8061 iter != snapset.clone_overlap.end();
8062 ++iter)
8063 overlaps.intersection_of(iter->second);
8064
8065 if (obs.oi.size > 0) {
8066 interval_set<uint64_t> modified;
8067 modified.insert(0, obs.oi.size);
8068 overlaps.intersection_of(modified);
8069 modified.subtract(overlaps);
8070 ctx->modified_ranges.union_of(modified);
8071 }
8072
8073 // Adjust the cached objectcontext
8074 maybe_create_new_object(ctx, true);
8075 ctx->delta_stats.num_bytes -= obs.oi.size;
8076 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8077 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8078 ctx->clean_regions.mark_omap_dirty();
8079 obs.oi.size = rollback_to->obs.oi.size;
8080 if (rollback_to->obs.oi.is_data_digest())
8081 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8082 else
8083 obs.oi.clear_data_digest();
8084 if (rollback_to->obs.oi.is_omap_digest())
8085 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8086 else
8087 obs.oi.clear_omap_digest();
8088
8089 if (rollback_to->obs.oi.is_omap()) {
8090 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8091 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8092 } else {
8093 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8094 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8095 }
8096 }
8097 }
8098 return ret;
8099}
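// Worked example for the clone-based branch above: with clones at snaps
// [2, 5] and a rollback to snapid 4, find_object_context() resolves 4 to the
// clone at snap 5 (the first clone covering it); the head is removed and
// re-cloned from that object, and the clone_overlap sets from that clone
// forward are intersected to derive which byte ranges actually change
// (roughly, the clone's extent minus the common overlap).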
8100
8101void PrimaryLogPG::_make_clone(
8102 OpContext *ctx,
8103 PGTransaction* t,
8104 ObjectContextRef obc,
8105 const hobject_t& head, const hobject_t& coid,
8106 object_info_t *poi)
8107{
8108 bufferlist bv;
11fdf7f2 8109 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8110
8111 t->clone(coid, head);
8112 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8113 rmattr_maybe_cache(obc, t, SS_ATTR);
8114}
8115
8116void PrimaryLogPG::make_writeable(OpContext *ctx)
8117{
8118 const hobject_t& soid = ctx->obs->oi.soid;
8119 SnapContext& snapc = ctx->snapc;
8120
8121 // clone?
11fdf7f2 8122 ceph_assert(soid.snap == CEPH_NOSNAP);
8123 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8124 << " snapc=" << snapc << dendl;
8125
8126 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8127 if (ctx->new_obs.exists) {
8128 // we will mark the object dirty
8129 if (ctx->undirty && was_dirty) {
8130 dout(20) << " clearing DIRTY flag" << dendl;
11fdf7f2 8131 ceph_assert(ctx->new_obs.oi.is_dirty());
8132 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8133 --ctx->delta_stats.num_objects_dirty;
8134 osd->logger->inc(l_osd_tier_clean);
8135 } else if (!was_dirty && !ctx->undirty) {
8136 dout(20) << " setting DIRTY flag" << dendl;
8137 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8138 ++ctx->delta_stats.num_objects_dirty;
8139 osd->logger->inc(l_osd_tier_dirty);
8140 }
8141 } else {
8142 if (was_dirty) {
8143 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8144 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8145 --ctx->delta_stats.num_objects_dirty;
8146 }
8147 }
8148
8149 if ((ctx->new_obs.exists &&
8150 ctx->new_obs.oi.is_omap()) &&
8151 (!ctx->obc->obs.exists ||
8152 !ctx->obc->obs.oi.is_omap())) {
8153 ++ctx->delta_stats.num_objects_omap;
8154 }
8155 if ((!ctx->new_obs.exists ||
8156 !ctx->new_obs.oi.is_omap()) &&
8157 (ctx->obc->obs.exists &&
8158 ctx->obc->obs.oi.is_omap())) {
8159 --ctx->delta_stats.num_objects_omap;
8160 }
8161
7c673cae 8162 if (ctx->new_snapset.seq > snapc.seq) {
11fdf7f2 8163 dout(10) << " op snapset is old" << dendl;
8164 }
8165
8166 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8167 snapc.snaps.size() && // there are snaps
8168 !ctx->cache_evict &&
8169 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8170 // clone
8171 hobject_t coid = soid;
8172 coid.snap = snapc.seq;
8173
8174 unsigned l;
8175 for (l = 1;
8176 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8177 l++) ;
8178
8179 vector<snapid_t> snaps(l);
8180 for (unsigned i=0; i<l; i++)
8181 snaps[i] = snapc.snaps[i];
8182
8183 // prepare clone
8184 object_info_t static_snap_oi(coid);
8185 object_info_t *snap_oi;
8186 if (is_primary()) {
8187 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8188 ctx->clone_obc->destructor_callback =
8189 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8190 ctx->clone_obc->obs.oi = static_snap_oi;
8191 ctx->clone_obc->obs.exists = true;
8192 ctx->clone_obc->ssc = ctx->obc->ssc;
8193 ctx->clone_obc->ssc->ref++;
11fdf7f2 8194 if (pool.info.is_erasure())
8195 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8196 snap_oi = &ctx->clone_obc->obs.oi;
8197 bool got = ctx->lock_manager.get_write_greedy(
8198 coid,
8199 ctx->clone_obc,
8200 ctx->op);
11fdf7f2 8201 ceph_assert(got);
8202 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8203 } else {
8204 snap_oi = &static_snap_oi;
8205 }
8206 snap_oi->version = ctx->at_version;
8207 snap_oi->prior_version = ctx->obs->oi.version;
8208 snap_oi->copy_user_bits(ctx->obs->oi);
8209
8210 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8211
8212 ctx->delta_stats.num_objects++;
8213 if (snap_oi->is_dirty()) {
8214 ctx->delta_stats.num_objects_dirty++;
8215 osd->logger->inc(l_osd_tier_dirty);
8216 }
8217 if (snap_oi->is_omap())
8218 ctx->delta_stats.num_objects_omap++;
8219 if (snap_oi->is_cache_pinned())
8220 ctx->delta_stats.num_objects_pinned++;
8221 if (snap_oi->has_manifest())
8222 ctx->delta_stats.num_objects_manifest++;
8223 ctx->delta_stats.num_object_clones++;
8224 ctx->new_snapset.clones.push_back(coid.snap);
8225 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
11fdf7f2 8226 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8227
8228 // clone_overlap should contain an entry for each clone
8229 // (an empty interval_set if there is no overlap)
8230 ctx->new_snapset.clone_overlap[coid.snap];
8231 if (ctx->obs->oi.size)
8232 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8233
8234 // log clone
8235 dout(10) << " cloning v " << ctx->obs->oi.version
8236 << " to " << coid << " v " << ctx->at_version
8237 << " snaps=" << snaps
8238 << " snapset=" << ctx->new_snapset << dendl;
8239 ctx->log.push_back(pg_log_entry_t(
8240 pg_log_entry_t::CLONE, coid, ctx->at_version,
8241 ctx->obs->oi.version,
8242 ctx->obs->oi.user_version,
8243 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8244 encode(snaps, ctx->log.back().snaps);
8245
8246 ctx->at_version.version++;
8247 }
8248
8249 // update most recent clone_overlap and usage stats
8250 if (ctx->new_snapset.clones.size() > 0) {
8251 // clone_overlap records the byte ranges shared between the head and each clone.
8252 // we need to check whether the most recent clone still exists: if it has
8253 // been evicted it is not included in the stats, but its clone_overlap
8254 // entry still exists in the snapset, so we must update the overlap here
8255 // to keep the accounting consistent.
8256 hobject_t last_clone_oid = soid;
8257 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8258 interval_set<uint64_t> &newest_overlap =
8259 ctx->new_snapset.clone_overlap.rbegin()->second;
8260 ctx->modified_ranges.intersection_of(newest_overlap);
7c673cae 8261 if (is_present_clone(last_clone_oid)) {
7c673cae 8262 // modified_ranges is still in use by the clone
11fdf7f2 8263 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
7c673cae 8264 }
11fdf7f2 8265 newest_overlap.subtract(ctx->modified_ranges);
8266 }
8267
8268 if (snapc.seq > ctx->new_snapset.seq) {
8269 // update snapset with latest snap context
8270 ctx->new_snapset.seq = snapc.seq;
8271 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8272 ctx->new_snapset.snaps = snapc.snaps;
8273 } else {
8274 ctx->new_snapset.snaps.clear();
8275 }
8276 }
8277 dout(20) << "make_writeable " << soid
8278 << " done, snapset=" << ctx->new_snapset << dendl;
8279}
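// Example of the clone decision above: snapset.seq is the newest snap
// already accounted for. If the head exists and a write arrives with
// snapc = { seq: 8, snaps: [8, 5, 2] } while snapset.seq == 5, then
// snaps[0] (8) > 5 triggers a clone at coid.snap = 8 carrying snaps [8],
// and the snapset's seq advances to 8 at the end of this function.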
8280
8281
8282void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8283 interval_set<uint64_t>& modified, uint64_t offset,
8284 uint64_t length, bool write_full)
8285{
8286 interval_set<uint64_t> ch;
8287 if (write_full) {
8288 if (oi.size)
8289 ch.insert(0, oi.size);
8290 } else if (length)
8291 ch.insert(offset, length);
8292 modified.union_of(ch);
8293 if (write_full ||
8294 (offset + length > oi.size && length)) {
8295 uint64_t new_size = offset + length;
8296 delta_stats.num_bytes -= oi.size;
8297 delta_stats.num_bytes += new_size;
8298 oi.size = new_size;
8299 }
8300
8301 if (oi.has_manifest() && oi.manifest.is_chunked()) {
8302 for (auto &p : oi.manifest.chunk_map) {
8303 if ((p.first <= offset && p.first + p.second.length > offset) ||
9f95a23c 8304 (p.first > offset && p.first < offset + length)) {
8305 p.second.clear_flag(chunk_info_t::FLAG_MISSING);
8306 p.second.set_flag(chunk_info_t::FLAG_DIRTY);
8307 }
8308 }
8309 }
7c673cae 8310 delta_stats.num_wr++;
11fdf7f2 8311 delta_stats.num_wr_kb += shift_round_up(length, 10);
8312}
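// For instance, a 4096-byte write at offset == oi.size extends the object:
// the byte count moves by (offset + length) - oi.size, and num_wr_kb grows
// by shift_round_up(4096, 10) == 4. A write fully inside the existing extent
// leaves the size/byte stats alone and only widens modified_ranges (and
// dirties any manifest chunks it touches).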
8313
8314void PrimaryLogPG::truncate_update_size_and_usage(
8315 object_stat_sum_t& delta_stats,
8316 object_info_t& oi,
8317 uint64_t truncate_size)
7c673cae 8318{
8319 if (oi.size != truncate_size) {
8320 delta_stats.num_bytes -= oi.size;
8321 delta_stats.num_bytes += truncate_size;
8322 oi.size = truncate_size;
8323 }
8324}
8325
8326void PrimaryLogPG::complete_disconnect_watches(
8327 ObjectContextRef obc,
8328 const list<watch_disconnect_t> &to_disconnect)
8329{
8330 for (list<watch_disconnect_t>::const_iterator i =
8331 to_disconnect.begin();
8332 i != to_disconnect.end();
8333 ++i) {
8334 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8335 auto watchers_entry = obc->watchers.find(watcher);
8336 if (watchers_entry != obc->watchers.end()) {
8337 WatchRef watch = watchers_entry->second;
8338 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8339 obc->watchers.erase(watcher);
8340 watch->remove(i->send_disconnect);
8341 } else {
8342 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8343 << watcher << dendl;
8344 }
8345 }
8346}
8347
8348void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8349{
8350 entity_name_t entity = ctx->reqid.name;
8351 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8352
8353 // disconnects first
8354 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8355
11fdf7f2 8356 ceph_assert(conn);
7c673cae 8357
8358 auto session = conn->get_priv();
8359 if (!session)
7c673cae 8360 return;
8361
8362 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8363 i != ctx->watch_connects.end();
8364 ++i) {
8365 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8366 dout(15) << "do_osd_op_effects applying watch connect on session "
8367 << session.get() << " watcher " << watcher << dendl;
8368 WatchRef watch;
8369 if (ctx->obc->watchers.count(watcher)) {
8370 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8371 << dendl;
8372 watch = ctx->obc->watchers[watcher];
8373 } else {
8374 dout(15) << "do_osd_op_effects new watcher " << watcher
8375 << dendl;
8376 watch = Watch::makeWatchRef(
8377 this, osd, ctx->obc, i->first.timeout_seconds,
8378 i->first.cookie, entity, conn->get_peer_addr());
8379 ctx->obc->watchers.insert(
8380 make_pair(
8381 watcher,
8382 watch));
8383 }
8384 watch->connect(conn, i->second);
8385 }
8386
8387 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8388 p != ctx->notifies.end();
8389 ++p) {
8390 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8391 ConnectionRef conn(ctx->op->get_req()->get_connection());
8392 NotifyRef notif(
8393 Notify::makeNotifyRef(
8394 conn,
8395 ctx->reqid.name.num(),
8396 p->bl,
8397 p->timeout,
8398 p->cookie,
8399 p->notify_id,
8400 ctx->obc->obs.oi.user_version,
8401 osd));
8402 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8403 ctx->obc->watchers.begin();
8404 i != ctx->obc->watchers.end();
8405 ++i) {
8406 dout(10) << "starting notify on watch " << i->first << dendl;
8407 i->second->start_notify(notif);
8408 }
8409 notif->init();
8410 }
8411
8412 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8413 p != ctx->notify_acks.end();
8414 ++p) {
8415 if (p->watch_cookie)
9f95a23c 8416 dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
8417 else
8418 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8419 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8420 ctx->obc->watchers.begin();
8421 i != ctx->obc->watchers.end();
8422 ++i) {
8423 if (i->first.second != entity) continue;
8424 if (p->watch_cookie &&
9f95a23c 8425 *(p->watch_cookie) != i->first.first) continue;
8426 dout(10) << "acking notify on watch " << i->first << dendl;
8427 i->second->notify_ack(p->notify_id, p->reply_bl);
8428 }
8429 }
8430}
8431
8432hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8433{
8434 ostringstream ss;
8435 ss << "temp_" << info.pgid << "_" << get_role()
8436 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8437 hobject_t hoid = target.make_temp_hobject(ss.str());
8438 dout(20) << __func__ << " " << hoid << dendl;
8439 return hoid;
8440}
8441
8442hobject_t PrimaryLogPG::get_temp_recovery_object(
8443 const hobject_t& target,
8444 eversion_t version)
8445{
8446 ostringstream ss;
8447 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8448 << "_" << version
8449 << "_" << info.history.same_interval_since
8450 << "_" << target.snap;
8451 // pgid + version + interval + snapid is unique, and short
8452 hobject_t hoid = target.make_temp_hobject(ss.str());
8453 dout(20) << __func__ << " " << hoid << dendl;
8454 return hoid;
8455}
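// Both helpers above yield names in the PG's temp namespace, for example
//   temp_recovering_<pgid>_<version>_<interval>_<snap>
// the (pgid, version, interval, snapid) tuple keeps concurrent recovery
// targets distinct, and temp objects are cleaned up when the PG restarts
// rather than being logged and replicated like normal writes.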
8456
8457int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8458{
11fdf7f2 8459 ceph_assert(!ctx->ops->empty());
8460
8461 // valid snap context?
8462 if (!ctx->snapc.is_valid()) {
8463 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8464 return -EINVAL;
8465 }
8466
8467 // prepare the actual mutation
c07f9fc5 8468 int result = do_osd_ops(ctx, *ctx->ops);
8469 if (result < 0) {
8470 if (ctx->op->may_write() &&
9f95a23c 8471 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8472 // need to save the error code in the pg log, to detect dup ops,
8473 // but do nothing else
8474 ctx->update_log_only = true;
8475 }
8476 return result;
8477 }
8478
8479 // read-op? write-op noop? done?
8480 if (ctx->op_t->empty() && !ctx->modify) {
8481 if (ctx->pending_async_reads.empty())
8482 unstable_stats.add(ctx->delta_stats);
7c673cae 8483 if (ctx->op->may_write() &&
9f95a23c 8484 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8485 ctx->update_log_only = true;
8486 }
8487 return result;
8488 }
8489
8490 // check for full
8491 if ((ctx->delta_stats.num_bytes > 0 ||
8492 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8493 pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8494 auto m = ctx->op->get_req<MOSDOp>();
8495 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8496 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8497 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8498 << dendl;
8499 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8500 // they tried, they failed.
8501 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
11fdf7f2 8502 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8503 } else {
8504 // drop request
8505 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8506 return -EAGAIN;
8507 }
8508 }
8509
11fdf7f2 8510 const hobject_t& soid = ctx->obs->oi.soid;
8511 // clone, if necessary
8512 if (soid.snap == CEPH_NOSNAP)
8513 make_writeable(ctx);
8514
8515 finish_ctx(ctx,
8516 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8517 pg_log_entry_t::DELETE,
8518 result);
8519
8520 return result;
8521}
8522
9f95a23c 8523void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
8524{
8525 const hobject_t& soid = ctx->obs->oi.soid;
8526 dout(20) << __func__ << " " << soid << " " << ctx
8527 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8528 << dendl;
8529 utime_t now = ceph_clock_now();
8530
8531 // finish and log the op.
8532 if (ctx->user_modify) {
8533 // update the user_version for any modify ops, except for the watch op
11fdf7f2 8534 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8535 /* In order for new clients and old clients to interoperate properly
8536 * when exchanging versions, we need to lower bound the user_version
8537 * (which our new clients pay proper attention to)
8538 * by the at_version (which is all the old clients can ever see). */
8539 if (ctx->at_version.version > ctx->user_at_version)
8540 ctx->user_at_version = ctx->at_version.version;
8541 ctx->new_obs.oi.user_version = ctx->user_at_version;
8542 }
8543 ctx->bytes_written = ctx->op_t->get_bytes_written();
8544
8545 if (ctx->new_obs.exists) {
8546 ctx->new_obs.oi.version = ctx->at_version;
8547 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8548 ctx->new_obs.oi.last_reqid = ctx->reqid;
8549 if (ctx->mtime != utime_t()) {
8550 ctx->new_obs.oi.mtime = ctx->mtime;
8551 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8552 ctx->new_obs.oi.local_mtime = now;
8553 } else {
8554 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8555 }
8556
11fdf7f2 8557 // object_info_t
7c673cae
FG
8558 map <string, bufferlist> attrs;
8559 bufferlist bv(sizeof(ctx->new_obs.oi));
11fdf7f2 8560 encode(ctx->new_obs.oi, bv,
7c673cae
FG
8561 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8562 attrs[OI_ATTR].claim(bv);
8563
11fdf7f2 8564 // snapset
7c673cae
FG
8565 if (soid.snap == CEPH_NOSNAP) {
8566 dout(10) << " final snapset " << ctx->new_snapset
8567 << " in " << soid << dendl;
11fdf7f2
TL
8568 bufferlist bss;
8569 encode(ctx->new_snapset, bss);
7c673cae
FG
8570 attrs[SS_ATTR].claim(bss);
8571 } else {
8572 dout(10) << " no snapset (this is a clone)" << dendl;
8573 }
8574 ctx->op_t->setattrs(soid, attrs);
8575 } else {
11fdf7f2 8576 // reset cached oi
7c673cae
FG
8577 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8578 }
8579
7c673cae 8580 // append to log
9f95a23c
TL
8581 ctx->log.push_back(
8582 pg_log_entry_t(log_op_type, soid, ctx->at_version,
8583 ctx->obs->oi.version,
8584 ctx->user_at_version, ctx->reqid,
8585 ctx->mtime,
8586 (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
8587 if (ctx->op && ctx->op->allows_returnvec()) {
8588 // also the per-op values
8589 ctx->log.back().set_op_returns(*ctx->ops);
8590 dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
8591 << dendl;
8592 }
8593
8594 ctx->log.back().clean_regions = ctx->clean_regions;
8595 dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
8596
7c673cae
FG
8597 if (soid.snap < CEPH_NOSNAP) {
8598 switch (log_op_type) {
8599 case pg_log_entry_t::MODIFY:
8600 case pg_log_entry_t::PROMOTE:
8601 case pg_log_entry_t::CLEAN:
11fdf7f2
TL
8602 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8603 << dendl;
8604 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7c673cae
FG
8605 break;
8606 default:
8607 break;
8608 }
8609 }
8610
8611 if (!ctx->extra_reqids.empty()) {
11fdf7f2
TL
8612 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
8613 << ctx->extra_reqid_return_codes << dendl;
7c673cae 8614 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
11fdf7f2 8615 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
7c673cae
FG
8616 }
8617
8618 // apply new object state.
8619 ctx->obc->obs = ctx->new_obs;
8620
11fdf7f2 8621 if (soid.is_head() && !ctx->obc->obs.exists) {
7c673cae
FG
8622 ctx->obc->ssc->exists = false;
8623 ctx->obc->ssc->snapset = SnapSet();
8624 } else {
8625 ctx->obc->ssc->exists = true;
8626 ctx->obc->ssc->snapset = ctx->new_snapset;
8627 }
8628}
8629
8630void PrimaryLogPG::apply_stats(
8631 const hobject_t &soid,
8632 const object_stat_sum_t &delta_stats) {
8633
9f95a23c
TL
8634 recovery_state.apply_op_stats(soid, delta_stats);
8635 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
8636 i != get_backfill_targets().end();
7c673cae
FG
8637 ++i) {
8638 pg_shard_t bt = *i;
9f95a23c
TL
8639 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
8640 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
7c673cae 8641 pending_backfill_updates[soid].stats.add(delta_stats);
9f95a23c 8642 }
7c673cae
FG
8643 }
8644
8645 if (is_primary() && scrubber.active) {
8646 if (soid < scrubber.start) {
8647 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
8648 << "," << scrubber.end << ")" << dendl;
8649 scrub_cstat.add(delta_stats);
8650 } else {
8651 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
8652 << "," << scrubber.end << ")" << dendl;
8653 }
8654 }
8655}
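// Illustration of the scrub bookkeeping above: while scrub holds the range
// [scrubber.start, scrubber.end), a write landing below scrubber.start
// modifies objects that were already scrubbed, so its delta must be folded
// into scrub_cstat; objects at or beyond scrubber.start will still be
// visited by the scrub and need no correction.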
8656
8657void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
8658{
9f95a23c 8659 auto m = ctx->op->get_req<MOSDOp>();
11fdf7f2 8660 ceph_assert(ctx->async_reads_complete());
7c673cae 8661
c07f9fc5
FG
8662 for (vector<OSDOp>::iterator p = ctx->ops->begin();
8663 p != ctx->ops->end() && result >= 0; ++p) {
7c673cae
FG
8664 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
8665 result = p->rval;
8666 break;
8667 }
8668 ctx->bytes_read += p->outdata.length();
8669 }
c07f9fc5 8670 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7c673cae
FG
8671
8672 MOSDOpReply *reply = ctx->reply;
8673 ctx->reply = nullptr;
8674
8675 if (result >= 0) {
8676 if (!ctx->ignore_log_op_stats) {
11fdf7f2
TL
8677 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
8678
7c673cae
FG
8679 publish_stats_to_osd();
8680 }
8681
8682 // on read, return the current object version
8683 if (ctx->obs) {
8684 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
8685 } else {
8686 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
8687 }
8688 } else if (result == -ENOENT) {
8689 // on ENOENT, set a floor for what the next user version will be.
8690 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
8691 }
8692
8693 reply->set_result(result);
8694 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8695 osd->send_message_osd_client(reply, m->get_connection());
8696 close_op_ctx(ctx);
8697}
8698
8699// ========================================================================
8700// copyfrom
8701
8702struct C_Copyfrom : public Context {
8703 PrimaryLogPGRef pg;
8704 hobject_t oid;
8705 epoch_t last_peering_reset;
8706 ceph_tid_t tid;
11fdf7f2 8707 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
7c673cae
FG
8708 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8709 const PrimaryLogPG::CopyOpRef& c)
8710 : pg(p), oid(o), last_peering_reset(lpr),
8711 tid(0), cop(c)
8712 {}
8713 void finish(int r) override {
8714 if (r == -ECANCELED)
8715 return;
9f95a23c 8716 std::scoped_lock l{*pg};
7c673cae
FG
8717 if (last_peering_reset == pg->get_last_peering_reset()) {
8718 pg->process_copy_chunk(oid, tid, r);
11fdf7f2 8719 cop.reset();
7c673cae 8720 }
7c673cae
FG
8721 }
8722};
8723
8724struct C_CopyFrom_AsyncReadCb : public Context {
8725 OSDOp *osd_op;
8726 object_copy_data_t reply_obj;
8727 uint64_t features;
8728 size_t len;
8729 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
8730 osd_op(osd_op), features(features), len(0) {}
8731 void finish(int r) override {
c07f9fc5
FG
8732 osd_op->rval = r;
8733 if (r < 0) {
8734 return;
8735 }
8736
11fdf7f2
TL
8737 ceph_assert(len > 0);
8738 ceph_assert(len <= reply_obj.data.length());
7c673cae
FG
8739 bufferlist bl;
8740 bl.substr_of(reply_obj.data, 0, len);
8741 reply_obj.data.swap(bl);
11fdf7f2
TL
8742 encode(reply_obj, osd_op->outdata, features);
8743 }
8744};
8745
8746struct C_CopyChunk : public Context {
8747 PrimaryLogPGRef pg;
8748 hobject_t oid;
8749 epoch_t last_peering_reset;
8750 ceph_tid_t tid;
8751 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8752 uint64_t offset = 0;
8753 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8754 const PrimaryLogPG::CopyOpRef& c)
8755 : pg(p), oid(o), last_peering_reset(lpr),
8756 tid(0), cop(c)
8757 {}
8758 void finish(int r) override {
8759 if (r == -ECANCELED)
8760 return;
9f95a23c 8761 std::scoped_lock l{*pg};
11fdf7f2
TL
8762 if (last_peering_reset == pg->get_last_peering_reset()) {
8763 pg->process_copy_chunk_manifest(oid, tid, r, offset);
8764 cop.reset();
8765 }
7c673cae
FG
8766 }
8767};
8768
11fdf7f2 8769int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
c07f9fc5 8770 OSDOp& osd_op, ObjectContextRef &obc)
7c673cae
FG
8771{
8772 object_info_t& oi = obc->obs.oi;
8773 hobject_t& soid = oi.soid;
8774 int result = 0;
8775 object_copy_cursor_t cursor;
8776 uint64_t out_max;
8777 try {
11fdf7f2
TL
8778 decode(cursor, bp);
8779 decode(out_max, bp);
7c673cae
FG
8780 }
8781 catch (buffer::error& e) {
8782 result = -EINVAL;
8783 return result;
8784 }
8785
8786 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
8787 uint64_t features = op->get_features();
8788
8789 bool async_read_started = false;
8790 object_copy_data_t _reply_obj;
11fdf7f2
TL
8791 C_CopyFrom_AsyncReadCb *cb = nullptr;
8792 if (pool.info.is_erasure()) {
7c673cae
FG
8793 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
8794 }
8795 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
8796 // size, mtime
8797 reply_obj.size = oi.size;
8798 reply_obj.mtime = oi.mtime;
11fdf7f2 8799 ceph_assert(obc->ssc);
7c673cae 8800 if (soid.snap < CEPH_NOSNAP) {
11fdf7f2
TL
8801 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
8802 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
8803 reply_obj.snaps = p->second;
7c673cae
FG
8804 } else {
8805 reply_obj.snap_seq = obc->ssc->snapset.seq;
8806 }
11fdf7f2 8807 if (oi.is_data_digest()) {
7c673cae
FG
8808 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
8809 reply_obj.data_digest = oi.data_digest;
8810 }
8811 if (oi.is_omap_digest()) {
8812 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
8813 reply_obj.omap_digest = oi.omap_digest;
8814 }
8815 reply_obj.truncate_seq = oi.truncate_seq;
8816 reply_obj.truncate_size = oi.truncate_size;
8817
8818 // attrs
8819 map<string,bufferlist>& out_attrs = reply_obj.attrs;
8820 if (!cursor.attr_complete) {
8821 result = getattrs_maybe_cache(
8822 ctx->obc,
b32b8144 8823 &out_attrs);
7c673cae
FG
8824 if (result < 0) {
8825 if (cb) {
8826 delete cb;
8827 }
8828 return result;
8829 }
8830 cursor.attr_complete = true;
8831 dout(20) << " got attrs" << dendl;
8832 }
8833
8834 int64_t left = out_max - osd_op.outdata.length();
8835
8836 // data
8837 bufferlist& bl = reply_obj.data;
8838 if (left > 0 && !cursor.data_complete) {
8839 if (cursor.data_offset < oi.size) {
11fdf7f2 8840 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
7c673cae
FG
8841 if (cb) {
8842 async_read_started = true;
8843 ctx->pending_async_reads.push_back(
8844 make_pair(
8845 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
8846 make_pair(&bl, cb)));
c07f9fc5
FG
8847 cb->len = max_read;
8848
8849 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8850 new ReadFinisher(osd_op));
8851 result = -EINPROGRESS;
8852
8853 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7c673cae
FG
8854 } else {
8855 result = pgbackend->objects_read_sync(
c07f9fc5 8856 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7c673cae
FG
8857 if (result < 0)
8858 return result;
8859 }
c07f9fc5
FG
8860 left -= max_read;
8861 cursor.data_offset += max_read;
7c673cae
FG
8862 }
8863 if (cursor.data_offset == oi.size) {
8864 cursor.data_complete = true;
8865 dout(20) << " got data" << dendl;
8866 }
11fdf7f2 8867 ceph_assert(cursor.data_offset <= oi.size);
7c673cae
FG
8868 }
8869
8870 // omap
8871 uint32_t omap_keys = 0;
8872 if (!pool.info.supports_omap() || !oi.is_omap()) {
8873 cursor.omap_complete = true;
8874 } else {
8875 if (left > 0 && !cursor.omap_complete) {
11fdf7f2 8876 ceph_assert(cursor.data_complete);
7c673cae
FG
8877 if (cursor.omap_offset.empty()) {
8878 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
8879 &reply_obj.omap_header);
8880 }
8881 bufferlist omap_data;
8882 ObjectMap::ObjectMapIterator iter =
11fdf7f2
TL
8883 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
8884 ceph_assert(iter);
7c673cae 8885 iter->upper_bound(cursor.omap_offset);
11fdf7f2 8886 for (; iter->valid(); iter->next()) {
7c673cae 8887 ++omap_keys;
11fdf7f2
TL
8888 encode(iter->key(), omap_data);
8889 encode(iter->value(), omap_data);
7c673cae
FG
8890 left -= iter->key().length() + 4 + iter->value().length() + 4;
8891 if (left <= 0)
8892 break;
8893 }
8894 if (omap_keys) {
11fdf7f2 8895 encode(omap_keys, reply_obj.omap_data);
7c673cae
FG
8896 reply_obj.omap_data.claim_append(omap_data);
8897 }
8898 if (iter->valid()) {
8899 cursor.omap_offset = iter->key();
8900 } else {
8901 cursor.omap_complete = true;
8902 dout(20) << " got omap" << dendl;
8903 }
8904 }
8905 }
8906
8907 if (cursor.is_complete()) {
8908 // include reqids only in the final step. this is a bit fragile
8909 // but it works...
9f95a23c 8910 recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
11fdf7f2
TL
8911 &reply_obj.reqids,
8912 &reply_obj.reqid_return_codes);
7c673cae
FG
8913 dout(20) << " got reqids" << dendl;
8914 }
8915
8916 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8917 << " " << out_attrs.size() << " attrs"
8918 << " " << bl.length() << " bytes"
8919 << " " << reply_obj.omap_header.length() << " omap header bytes"
8920 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8921 << omap_keys << " keys"
8922 << " " << reply_obj.reqids.size() << " reqids"
8923 << dendl;
8924 reply_obj.cursor = cursor;
8925 if (!async_read_started) {
11fdf7f2 8926 encode(reply_obj, osd_op.outdata, features);
7c673cae
FG
8927 }
8928 if (cb && !async_read_started) {
8929 delete cb;
8930 }
c07f9fc5
FG
8931
8932 if (result > 0) {
8933 result = 0;
8934 }
7c673cae
FG
8935 return result;
8936}
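// A sketch of the cursor protocol implemented above (simplified; the field
// names below are from object_copy_cursor_t as used in this function): each
// COPY_GET round trip fills at most out_max bytes and advances through the
// phases in order --
//
//   attr_complete                 // phase 1: all xattrs at once
//   data_offset / data_complete   // phase 2: object data, chunked
//   omap_offset / omap_complete   // phase 3: omap, resumable by key
//
// the copy machinery (_copy_some, below) simply re-sends the returned
// cursor until cursor.is_complete(), at which point the source reqids are
// included for dup-op detection on the destination.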
8937
8938void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8939 OSDOp& osd_op)
8940{
9f95a23c 8941 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
7c673cae
FG
8942 uint64_t features = m->get_features();
8943 object_copy_data_t reply_obj;
8944
9f95a23c 8945 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
11fdf7f2 8946 &reply_obj.reqid_return_codes);
7c673cae 8947 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
11fdf7f2 8948 encode(reply_obj, osd_op.outdata, features);
7c673cae 8949 osd_op.rval = -ENOENT;
11fdf7f2 8950 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
7c673cae
FG
8951 reply->set_result(-ENOENT);
8952 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8953 osd->send_message_osd_client(reply, m->get_connection());
8954}
8955
8956void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8957 hobject_t src, object_locator_t oloc,
8958 version_t version, unsigned flags,
8959 bool mirror_snapset,
8960 unsigned src_obj_fadvise_flags,
8961 unsigned dest_obj_fadvise_flags)
8962{
8963 const hobject_t& dest = obc->obs.oi.soid;
8964 dout(10) << __func__ << " " << dest
8965 << " from " << src << " " << oloc << " v" << version
8966 << " flags " << flags
8967 << (mirror_snapset ? " mirror_snapset" : "")
8968 << dendl;
8969
11fdf7f2 8970 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
7c673cae
FG
8971
8972 // cancel a previous in-progress copy?
8973 if (copy_ops.count(dest)) {
8974 // FIXME: if the src etc match, we could avoid restarting from the
8975 // beginning.
8976 CopyOpRef cop = copy_ops[dest];
94b18763
FG
8977 vector<ceph_tid_t> tids;
8978 cancel_copy(cop, false, &tids);
8979 osd->objecter->op_cancel(tids, -ECANCELED);
7c673cae
FG
8980 }
8981
8982 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8983 mirror_snapset, src_obj_fadvise_flags,
8984 dest_obj_fadvise_flags));
8985 copy_ops[dest] = cop;
8986 obc->start_block();
8987
11fdf7f2
TL
8988 if (!obc->obs.oi.has_manifest()) {
8989 _copy_some(obc, cop);
8990 } else {
8991 if (obc->obs.oi.manifest.is_redirect()) {
8992 _copy_some(obc, cop);
8993 } else if (obc->obs.oi.manifest.is_chunked()) {
8994 auto p = obc->obs.oi.manifest.chunk_map.begin();
8995 _copy_some_manifest(obc, cop, p->first);
8996 } else {
8997 ceph_abort_msg("unrecognized manifest type");
8998 }
8999 }
7c673cae
FG
9000}
9001
9002void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9003{
91327a77 9004 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
7c673cae
FG
9005
9006 unsigned flags = 0;
9007 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9008 flags |= CEPH_OSD_FLAG_FLUSH;
9009 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9010 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9011 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9012 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9013 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9014 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9015 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9016 flags |= CEPH_OSD_FLAG_RWORDERED;
9017
9018 C_GatherBuilder gather(cct);
9019
9020 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9021 // list snaps too.
11fdf7f2 9022 ceph_assert(cop->src.snap == CEPH_NOSNAP);
7c673cae
FG
9023 ObjectOperation op;
9024 op.list_snaps(&cop->results.snapset, NULL);
9025 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9026 CEPH_SNAPDIR, NULL,
9027 flags, gather.new_sub(), NULL);
9028 cop->objecter_tid2 = tid;
9029 }
9030
9031 ObjectOperation op;
9032 if (cop->results.user_version) {
9033 op.assert_version(cop->results.user_version);
9034 } else {
9035 // we should learn the version after the first chunk, if we didn't know
9036 // it already!
11fdf7f2 9037 ceph_assert(cop->cursor.is_initial());
7c673cae
FG
9038 }
9039 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9040 &cop->results.object_size, &cop->results.mtime,
9041 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9042 &cop->results.snaps, &cop->results.snap_seq,
9043 &cop->results.flags,
9044 &cop->results.source_data_digest,
9045 &cop->results.source_omap_digest,
9046 &cop->results.reqids,
11fdf7f2 9047 &cop->results.reqid_return_codes,
7c673cae
FG
9048 &cop->results.truncate_seq,
9049 &cop->results.truncate_size,
9050 &cop->rval);
9051 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9052
9053 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9054 get_last_peering_reset(), cop);
9055 gather.set_finisher(new C_OnFinisher(fin,
9f95a23c 9056 osd->get_objecter_finisher(get_pg_shard())));
7c673cae
FG
9057
9058 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9059 cop->src.snap, NULL,
9060 flags,
9061 gather.new_sub(),
9062 // discover the object version if we don't know it yet
9063 cop->results.user_version ? NULL : &cop->results.user_version);
9064 fin->tid = tid;
9065 cop->objecter_tid = tid;
9066 gather.activate();
9067}
9068
11fdf7f2
TL
9069void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9070{
9071 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9072
9073 unsigned flags = 0;
9074 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9075 flags |= CEPH_OSD_FLAG_FLUSH;
9076 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9077 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9078 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9079 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9080 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9081 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9082 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9083 flags |= CEPH_OSD_FLAG_RWORDERED;
9084
9085 int num_chunks = 0;
9086 uint64_t last_offset = 0, chunks_size = 0;
9087 object_manifest_t *manifest = &obc->obs.oi.manifest;
9088 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9089 for (;iter != manifest->chunk_map.end(); ++iter) {
9090 num_chunks++;
9091 chunks_size += iter->second.length;
9092 last_offset = iter->first;
9093 if (get_copy_chunk_size() < chunks_size) {
9094 break;
9095 }
9096 }
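// e.g. (illustrative numbers): with 4 MiB chunks at offsets 0, 4M, 8M, ...
// and get_copy_chunk_size() == 8 MiB, this batch takes the chunks at 0 and
// 4M, then the chunk at 8M pushes chunks_size to 12 MiB and breaks the
// loop, leaving num_chunks == 3 and last_offset == 8M.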
9097
9098 cop->num_chunk = num_chunks;
9099 cop->start_offset = start_offset;
9100 cop->last_offset = last_offset;
9101 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9102 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9103 << " last_offset: " << last_offset << dendl;
9104
9105 iter = manifest->chunk_map.find(start_offset);
9106 for (;iter != manifest->chunk_map.end(); ++iter) {
9107 uint64_t obj_offset = iter->first;
9108 uint64_t length = manifest->chunk_map[iter->first].length;
9109 hobject_t soid = manifest->chunk_map[iter->first].oid;
9110 object_locator_t oloc(soid);
9111 CopyCallback * cb = NULL;
9112 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9113 cop->results.user_version, cop->flags, cop->mirror_snapset,
9114 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9115 sub_cop->cursor.data_offset = obj_offset;
9116 cop->chunk_cops[obj_offset] = sub_cop;
9117
9118 int s = sub_cop->chunk_ops.size();
9119 sub_cop->chunk_ops.resize(s+1);
9120 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9121 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9122 sub_cop->chunk_ops[s].op.extent.length = length;
9123
9124 ObjectOperation op;
9125 op.dup(sub_cop->chunk_ops);
9126
9127 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9128 << manifest->chunk_map[iter->first].offset
9129 << " length: " << length << " pool id: " << oloc.pool << dendl;
9130
9131 if (cop->results.user_version) {
9132 op.assert_version(cop->results.user_version);
9133 } else {
9134 // we should learn the version after the first chunk, if we didn't know
9135 // it already!
9136 ceph_assert(cop->cursor.is_initial());
9137 }
9138 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9139
9140 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9141 get_last_peering_reset(), cop);
9142 fin->offset = obj_offset;
9f95a23c
TL
9143
9144 ceph_tid_t tid = osd->objecter->read(
9145 soid.oid, oloc, op,
9146 sub_cop->src.snap, NULL,
9147 flags,
9148 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9149 // discover the object version if we don't know it yet
9150 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
11fdf7f2
TL
9151 fin->tid = tid;
9152 sub_cop->objecter_tid = tid;
9153 if (last_offset < iter->first) {
9154 break;
9155 }
9156 }
9157}
9158
7c673cae
FG
9159void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9160{
9161 dout(10) << __func__ << " " << oid << " tid " << tid
9162 << " " << cpp_strerror(r) << dendl;
9163 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9164 if (p == copy_ops.end()) {
9165 dout(10) << __func__ << " no copy_op found" << dendl;
9166 return;
9167 }
9168 CopyOpRef cop = p->second;
9169 if (tid != cop->objecter_tid) {
9170 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9171 << " tid " << cop->objecter_tid << dendl;
9172 return;
9173 }
9174
9175 if (cop->omap_data.length() || cop->omap_header.length())
9176 cop->results.has_omap = true;
9177
9178 if (r >= 0 && !pool.info.supports_omap() &&
9179 (cop->omap_data.length() || cop->omap_header.length())) {
9180 r = -EOPNOTSUPP;
9181 }
9182 cop->objecter_tid = 0;
9183 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9184 ObjectContextRef& cobc = cop->obc;
9185
9186 if (r < 0)
9187 goto out;
9188
11fdf7f2 9189 ceph_assert(cop->rval >= 0);
7c673cae
FG
9190
9191 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9192 // verify snap hasn't been deleted
9193 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9194 while (p != cop->results.snaps.end()) {
9f95a23c
TL
9195 // make best effort to sanitize snaps/clones.
9196 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
7c673cae
FG
9197 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9198 << dendl;
9199 for (vector<snapid_t>::iterator q = p + 1;
9200 q != cop->results.snaps.end();
9201 ++q)
9202 *(q - 1) = *q;
9203 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9204 } else {
9205 ++p;
9206 }
9207 }
9208 if (cop->results.snaps.empty()) {
9209 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9210 r = -ENOENT;
9211 goto out;
9212 }
9213 }
9214
11fdf7f2 9215 ceph_assert(cop->rval >= 0);
7c673cae
FG
9216
9217 if (!cop->temp_cursor.data_complete) {
9218 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9219 }
9220 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9221 if (cop->omap_header.length()) {
9222 cop->results.omap_digest =
9223 cop->omap_header.crc32c(cop->results.omap_digest);
9224 }
9225 if (cop->omap_data.length()) {
9226 bufferlist keys;
9227 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9228 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9229 }
9230 }
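// (On the running digests: bufferlist::crc32c(seed) chains across chunks,
// i.e. crc(A+B, seed) == B.crc32c(A.crc32c(seed)), which is why each chunk
// folds into results.data_digest/omap_digest; for omap, substr_of skips
// the leading 4-byte key count so only the key/value payload is summed.)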
9231
9232 if (!cop->temp_cursor.attr_complete) {
9233 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9234 p != cop->attrs.end();
9235 ++p) {
9236 cop->results.attrs[string("_") + p->first] = p->second;
9237 }
9238 cop->attrs.clear();
9239 }
9240
9241 if (!cop->cursor.is_complete()) {
9242 // write out what we have so far
9243 if (cop->temp_cursor.is_initial()) {
11fdf7f2 9244 ceph_assert(!cop->results.started_temp_obj);
7c673cae
FG
9245 cop->results.started_temp_obj = true;
9246 cop->results.temp_oid = generate_temp_object(oid);
9247 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9248 }
9249 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9250 OpContextUPtr ctx = simple_opc_create(tempobc);
9251 if (cop->temp_cursor.is_initial()) {
9252 ctx->new_temp_oid = cop->results.temp_oid;
9253 }
9254 _write_copy_chunk(cop, ctx->op_t.get());
9255 simple_opc_submit(std::move(ctx));
9256 dout(10) << __func__ << " fetching more" << dendl;
9257 _copy_some(cobc, cop);
9258 return;
9259 }
9260
9261 // verify digests?
9262 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9263 dout(20) << __func__ << std::hex
9264 << " got digest: rx data 0x" << cop->results.data_digest
9265 << " omap 0x" << cop->results.omap_digest
9266 << ", source: data 0x" << cop->results.source_data_digest
9267 << " omap 0x" << cop->results.source_omap_digest
9268 << std::dec
9269 << " flags " << cop->results.flags
9270 << dendl;
9271 }
9272 if (cop->results.is_data_digest() &&
9273 cop->results.data_digest != cop->results.source_data_digest) {
9274 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9275 << " != source 0x" << cop->results.source_data_digest << std::dec
9276 << dendl;
9277 osd->clog->error() << info.pgid << " copy from " << cop->src
9278 << " to " << cop->obc->obs.oi.soid << std::hex
9279 << " data digest 0x" << cop->results.data_digest
9280 << " != source 0x" << cop->results.source_data_digest
9281 << std::dec;
9282 r = -EIO;
9283 goto out;
9284 }
9285 if (cop->results.is_omap_digest() &&
9286 cop->results.omap_digest != cop->results.source_omap_digest) {
9287 derr << __func__ << std::hex
9288 << " omap digest 0x" << cop->results.omap_digest
9289 << " != source 0x" << cop->results.source_omap_digest
9290 << std::dec << dendl;
9291 osd->clog->error() << info.pgid << " copy from " << cop->src
9292 << " to " << cop->obc->obs.oi.soid << std::hex
9293 << " omap digest 0x" << cop->results.omap_digest
9294 << " != source 0x" << cop->results.source_omap_digest
9295 << std::dec;
9296 r = -EIO;
9297 goto out;
9298 }
9299 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9300 derr << __func__ << " injecting copyfrom failure" << dendl;
9301 r = -EIO;
9302 goto out;
9303 }
9304
9305 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9306 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9307 ObjectState& obs = cop->obc->obs;
9308 if (cop->temp_cursor.is_initial()) {
9309 dout(20) << "fill_in_final_tx: writing "
9310 << "directly to final object" << dendl;
9311 // write directly to final object
9312 cop->results.temp_oid = obs.oi.soid;
9313 _write_copy_chunk(cop, t);
9314 } else {
9315 // finish writing to temp object, then move into place
9316 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9317 _write_copy_chunk(cop, t);
9318 t->rename(obs.oi.soid, cop->results.temp_oid);
9319 }
9320 t->setattrs(obs.oi.soid, cop->results.attrs);
9321 });
9322
9323 dout(20) << __func__ << " success; committing" << dendl;
9324
9325 out:
9326 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9327 CopyCallbackResults results(r, &cop->results);
9328 cop->cb->complete(results);
9329
9330 copy_ops.erase(cobc->obs.oi.soid);
9331 cobc->stop_block();
9332
9333 if (r < 0 && cop->results.started_temp_obj) {
9334 dout(10) << __func__ << " deleting partial temp object "
9335 << cop->results.temp_oid << dendl;
9336 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9337 OpContextUPtr ctx = simple_opc_create(tempobc);
9338 ctx->op_t->remove(cop->results.temp_oid);
9339 ctx->discard_temp_oid = cop->results.temp_oid;
9340 simple_opc_submit(std::move(ctx));
9341 }
9342
9343 // cancel and requeue proxy ops on this object
9344 if (!r) {
11fdf7f2
TL
9345 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9346 }
9347
9348 kick_object_context_blocked(cobc);
9349}
9350
9351void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9352{
9353 dout(10) << __func__ << " " << oid << " tid " << tid
9354 << " " << cpp_strerror(r) << dendl;
9355 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9356 if (p == copy_ops.end()) {
9357 dout(10) << __func__ << " no copy_op found" << dendl;
9358 return;
9359 }
9360 CopyOpRef obj_cop = p->second;
9361 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9362
9363 if (tid != chunk_cop->objecter_tid) {
9364 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9365 << " tid " << chunk_cop->objecter_tid << dendl;
9366 return;
9367 }
9368
9369 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9370 r = -EOPNOTSUPP;
9371 }
9372
9373 chunk_cop->objecter_tid = 0;
9374 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9375 ObjectContextRef& cobc = obj_cop->obc;
9376 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9377
9378 if (r < 0) {
9379 obj_cop->failed = true;
9380 goto out;
9381 }
9382
9383 if (obj_cop->failed) {
9384 return;
9385 }
9386 if (!chunk_data.outdata.length()) {
9387 r = -EIO;
9388 obj_cop->failed = true;
9389 goto out;
9390 }
9391
9392 obj_cop->num_chunk--;
9393
9394 /* check all of the copyop are completed */
9395 if (obj_cop->num_chunk) {
9396 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9397 return;
9398 }
9399
9400 {
9401 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9402 if (!ctx->lock_manager.take_write_lock(
9403 obj_cop->obc->obs.oi.soid,
9404 obj_cop->obc)) {
9405 // recovery op can take read lock.
9406 // so need to wait for recovery completion
9407 r = -EAGAIN;
9408 obj_cop->failed = true;
9409 close_op_ctx(ctx.release());
9410 goto out;
7c673cae 9411 }
11fdf7f2
TL
9412 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9413
9414 PGTransaction *t = ctx->op_t.get();
9415 ObjectState& obs = ctx->new_obs;
9416 for (auto p : obj_cop->chunk_cops) {
9417 OSDOp &sub_chunk = p.second->chunk_ops[0];
9418 t->write(cobc->obs.oi.soid,
9419 p.second->cursor.data_offset,
9420 sub_chunk.outdata.length(),
9421 sub_chunk.outdata,
9422 p.second->dest_obj_fadvise_flags);
9423 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9424 << " length: " << sub_chunk.outdata.length() << dendl;
9425 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9426 p.second->cursor.data_offset, sub_chunk.outdata.length());
9427 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_DIRTY);
9428 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9f95a23c 9429 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
11fdf7f2
TL
9430 sub_chunk.outdata.clear();
9431 }
9432 obs.oi.clear_data_digest();
9433 ctx->at_version = get_next_version();
9434 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9435 simple_opc_submit(std::move(ctx));
9436
9437 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9438 /* check remaining work */
9439 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9440 if (obj_cop->last_offset >= p->first + p->second.length) {
9441 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9442 if (obj_cop->last_offset < en.first) {
9443 _copy_some_manifest(cobc, obj_cop, en.first);
9444 return;
9445 }
9446 }
7c673cae
FG
9447 }
9448 }
11fdf7f2
TL
9449 }
9450
9451 out:
9452 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9453 CopyCallbackResults results(r, &obj_cop->results);
9454 obj_cop->cb->complete(results);
9455
9456 copy_ops.erase(cobc->obs.oi.soid);
9457 cobc->stop_block();
9458
9459 // cancel and requeue proxy ops on this object
9460 if (!r) {
9461 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
7c673cae
FG
9462 }
9463
9464 kick_object_context_blocked(cobc);
9465}
9466
94b18763
FG
9467void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9468 vector<ceph_tid_t> tids;
9469 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9470 it != proxyread_ops.end();) {
9471 if (it->second->soid == oid) {
9472 cancel_proxy_read((it++)->second, &tids);
9473 } else {
9474 ++it;
9475 }
9476 }
9477 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9478 it != proxywrite_ops.end();) {
9479 if (it->second->soid == oid) {
9480 cancel_proxy_write((it++)->second, &tids);
9481 } else {
9482 ++it;
9483 }
9484 }
9485 osd->objecter->op_cancel(tids, -ECANCELED);
9486 kick_proxy_ops_blocked(oid);
9487}
9488
7c673cae
FG
9489void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9490{
9491 dout(20) << __func__ << " " << cop
9492 << " " << cop->attrs.size() << " attrs"
9493 << " " << cop->data.length() << " bytes"
9494 << " " << cop->omap_header.length() << " omap header bytes"
9495 << " " << cop->omap_data.length() << " omap data bytes"
9496 << dendl;
9497 if (!cop->temp_cursor.attr_complete) {
9498 t->create(cop->results.temp_oid);
9499 }
9500 if (!cop->temp_cursor.data_complete) {
11fdf7f2 9501 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
7c673cae 9502 cop->cursor.data_offset);
11fdf7f2 9503 if (pool.info.required_alignment() &&
7c673cae
FG
9504 !cop->cursor.data_complete) {
9505 /**
9506 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9507 * to pick it up on the next pass.
9508 */
11fdf7f2 9509 ceph_assert(cop->temp_cursor.data_offset %
7c673cae
FG
9510 pool.info.required_alignment() == 0);
9511 if (cop->data.length() % pool.info.required_alignment() != 0) {
9512 uint64_t to_trim =
9513 cop->data.length() % pool.info.required_alignment();
9514 bufferlist bl;
9515 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9516 cop->data.swap(bl);
9517 cop->cursor.data_offset -= to_trim;
11fdf7f2 9518 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
7c673cae
FG
9519 cop->cursor.data_offset);
9520 }
9521 }
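// e.g. (illustrative numbers): with required_alignment() == 4096 and
// cop->data.length() == 10000, to_trim = 10000 % 4096 = 1808; the first
// 8192 bytes are kept and cursor.data_offset is rewound so the 1808-byte
// tail is re-fetched, aligned, on the next pass.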
9522 if (cop->data.length()) {
9523 t->write(
9524 cop->results.temp_oid,
9525 cop->temp_cursor.data_offset,
9526 cop->data.length(),
9527 cop->data,
9528 cop->dest_obj_fadvise_flags);
9529 }
9530 cop->data.clear();
9531 }
9532 if (pool.info.supports_omap()) {
9533 if (!cop->temp_cursor.omap_complete) {
9534 if (cop->omap_header.length()) {
9535 t->omap_setheader(
9536 cop->results.temp_oid,
9537 cop->omap_header);
9538 cop->omap_header.clear();
9539 }
9540 if (cop->omap_data.length()) {
9541 map<string,bufferlist> omap;
11fdf7f2
TL
9542 bufferlist::const_iterator p = cop->omap_data.begin();
9543 decode(omap, p);
7c673cae
FG
9544 t->omap_setkeys(cop->results.temp_oid, omap);
9545 cop->omap_data.clear();
9546 }
9547 }
9548 } else {
11fdf7f2
TL
9549 ceph_assert(cop->omap_header.length() == 0);
9550 ceph_assert(cop->omap_data.length() == 0);
7c673cae
FG
9551 }
9552 cop->temp_cursor = cop->cursor;
9553}
9554
c07f9fc5 9555void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
7c673cae 9556{
c07f9fc5 9557 OpContext *ctx = cb->ctx;
7c673cae 9558 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
7c673cae 9559
c07f9fc5 9560 ObjectState& obs = ctx->new_obs;
7c673cae
FG
9561 if (obs.exists) {
9562 dout(20) << __func__ << ": exists, removing" << dendl;
9563 ctx->op_t->remove(obs.oi.soid);
9564 } else {
9565 ctx->delta_stats.num_objects++;
9566 obs.exists = true;
9567 }
9568 if (cb->is_temp_obj_used()) {
9569 ctx->discard_temp_oid = cb->results->temp_oid;
9570 }
9571 cb->results->fill_in_final_tx(ctx->op_t.get());
9572
9573 // CopyFromCallback fills this in for us
9574 obs.oi.user_version = ctx->user_at_version;
9575
28e407b8
AA
9576 if (cb->results->is_data_digest()) {
9577 obs.oi.set_data_digest(cb->results->data_digest);
9578 } else {
9579 obs.oi.clear_data_digest();
9580 }
9581 if (cb->results->is_omap_digest()) {
9582 obs.oi.set_omap_digest(cb->results->omap_digest);
9583 } else {
9584 obs.oi.clear_omap_digest();
9585 }
7c673cae 9586
9f95a23c
TL
9587 obs.oi.truncate_seq = cb->truncate_seq;
9588 obs.oi.truncate_size = cb->truncate_size;
9589
9590 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9591 ctx->mtime = utime_t();
7c673cae
FG
9592
9593 ctx->extra_reqids = cb->results->reqids;
11fdf7f2 9594 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
7c673cae
FG
9595
9596 // cache: clear whiteout?
9597 if (obs.oi.is_whiteout()) {
9598 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9599 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9600 --ctx->delta_stats.num_whiteouts;
9601 }
9602
9603 if (cb->results->has_omap) {
9604 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9605 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9f95a23c 9606 ctx->clean_regions.mark_omap_dirty();
7c673cae
FG
9607 } else {
9608 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9609 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9610 }
9611
9612 interval_set<uint64_t> ch;
9613 if (obs.oi.size > 0)
9614 ch.insert(0, obs.oi.size);
9615 ctx->modified_ranges.union_of(ch);
9f95a23c 9616 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
7c673cae
FG
9617
9618 if (cb->get_data_size() != obs.oi.size) {
9619 ctx->delta_stats.num_bytes -= obs.oi.size;
9620 obs.oi.size = cb->get_data_size();
9621 ctx->delta_stats.num_bytes += obs.oi.size;
9622 }
9623 ctx->delta_stats.num_wr++;
11fdf7f2 9624 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
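// (shift_round_up(x, 10) is ceil(x / 1024), i.e. the size charged to
// num_wr_kb in whole KiB: 1 -> 1, 1024 -> 1, 1025 -> 2.)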
7c673cae
FG
9625
9626 osd->logger->inc(l_osd_copyfrom);
9627}
9628
9629void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9630 ObjectContextRef obc)
9631{
9632 const hobject_t& soid = obc->obs.oi.soid;
9633 dout(10) << __func__ << " " << soid << " r=" << r
9634 << " uv" << results->user_version << dendl;
9635
9636 if (r == -ECANCELED) {
9637 return;
9638 }
9639
9640 if (r != -ENOENT && soid.is_snap()) {
9641 if (results->snaps.empty()) {
9f95a23c
TL
9642 // we must have read "snap" content from the head object in the
9643 // base pool. use snap_seq to construct what snaps should be
9644 // for this clone (what it was before we evicted the clean clone
9645 // from this pool, and what it will be when we flush and the
9646 // clone eventually happens in the base pool). we want to use
9647 // snaps in (results->snap_seq,soid.snap]
7c673cae 9648 SnapSet& snapset = obc->ssc->snapset;
9f95a23c
TL
9649 for (auto p = snapset.clone_snaps.rbegin();
9650 p != snapset.clone_snaps.rend();
9651 ++p) {
9652 for (auto snap : p->second) {
9653 if (snap > soid.snap) {
9654 continue;
9655 }
9656 if (snap <= results->snap_seq) {
9657 break;
9658 }
9659 results->snaps.push_back(snap);
9660 }
7c673cae
FG
9661 }
9662 }
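// e.g. (illustrative numbers): promoting clone 4 with clone_snaps
// {4: [4,3,2], 10: [10,9]} and results->snap_seq == 2, the reverse walk
// skips 10 and 9 (newer than soid.snap), takes 4 and 3, and stops at 2
// (<= snap_seq), reconstructing snaps = [4,3].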
9663
9664 dout(20) << __func__ << " snaps " << results->snaps << dendl;
9665 filter_snapc(results->snaps);
9666
9667 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9668 if (results->snaps.empty()) {
9669 dout(20) << __func__
9670 << " snaps are empty, clone is invalid,"
9671 << " setting r to ENOENT" << dendl;
9672 r = -ENOENT;
9673 }
9674 }
9675
9676 if (r < 0 && results->started_temp_obj) {
9677 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9678 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
11fdf7f2 9679 ceph_assert(tempobc);
7c673cae
FG
9680 OpContextUPtr ctx = simple_opc_create(tempobc);
9681 ctx->op_t->remove(results->temp_oid);
9682 simple_opc_submit(std::move(ctx));
9683 results->started_temp_obj = false;
9684 }
9685
9686 if (r == -ENOENT && soid.is_snap()) {
9687 dout(10) << __func__
9688 << ": enoent while trying to promote clone, " << soid
9689 << " must have been trimmed, removing from snapset"
9690 << dendl;
9691 hobject_t head(soid.get_head());
9692 ObjectContextRef obc = get_object_context(head, false);
11fdf7f2 9693 ceph_assert(obc);
7c673cae
FG
9694
9695 OpContextUPtr tctx = simple_opc_create(obc);
9696 tctx->at_version = get_next_version();
9f95a23c
TL
9697 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
9698 filter_snapc(tctx->new_snapset.snaps);
9699 } else {
9700 tctx->new_snapset.snaps.clear();
9701 }
7c673cae
FG
9702 vector<snapid_t> new_clones;
9703 map<snapid_t, vector<snapid_t>> new_clone_snaps;
9704 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9705 i != tctx->new_snapset.clones.end();
9706 ++i) {
9707 if (*i != soid.snap) {
9708 new_clones.push_back(*i);
9709 auto p = tctx->new_snapset.clone_snaps.find(*i);
9710 if (p != tctx->new_snapset.clone_snaps.end()) {
9711 new_clone_snaps[*i] = p->second;
9712 }
9713 }
9714 }
9715 tctx->new_snapset.clones.swap(new_clones);
9716 tctx->new_snapset.clone_overlap.erase(soid.snap);
9717 tctx->new_snapset.clone_size.erase(soid.snap);
9718 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9719
9720 // take RWWRITE lock for duration of our local write. ignore starvation.
9721 if (!tctx->lock_manager.take_write_lock(
9722 head,
9723 obc)) {
11fdf7f2 9724 ceph_abort_msg("problem!");
7c673cae
FG
9725 }
9726 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9727
9728 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9729
9730 simple_opc_submit(std::move(tctx));
9731 return;
9732 }
9733
9734 bool whiteout = false;
9735 if (r == -ENOENT) {
11fdf7f2 9736 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
7c673cae
FG
9737 dout(10) << __func__ << " whiteout " << soid << dendl;
9738 whiteout = true;
9739 }
9740
9741 if (r < 0 && !whiteout) {
9742 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9743 // pass error to everyone blocked on this object
9744 // FIXME: this is pretty sloppy, but at this point we got
9745 // something unexpected and don't have many other options.
9746 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9747 waiting_for_blocked_object.find(soid);
9748 if (blocked_iter != waiting_for_blocked_object.end()) {
9749 while (!blocked_iter->second.empty()) {
9750 osd->reply_op_error(blocked_iter->second.front(), r);
9751 blocked_iter->second.pop_front();
9752 }
9753 waiting_for_blocked_object.erase(blocked_iter);
9754 }
9755 return;
9756 }
9757
9758 osd->promote_finish(results->object_size);
9759
9760 OpContextUPtr tctx = simple_opc_create(obc);
9761 tctx->at_version = get_next_version();
9762
11fdf7f2
TL
9763 if (!obc->obs.oi.has_manifest()) {
9764 ++tctx->delta_stats.num_objects;
9765 }
7c673cae
FG
9766 if (soid.snap < CEPH_NOSNAP)
9767 ++tctx->delta_stats.num_object_clones;
9768 tctx->new_obs.exists = true;
9769
9770 tctx->extra_reqids = results->reqids;
11fdf7f2 9771 tctx->extra_reqid_return_codes = results->reqid_return_codes;
7c673cae
FG
9772
9773 if (whiteout) {
9774 // create a whiteout
9775 tctx->op_t->create(soid);
9776 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9777 ++tctx->delta_stats.num_whiteouts;
9778 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9779 osd->logger->inc(l_osd_tier_whiteout);
9780 } else {
9781 if (results->has_omap) {
9782 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9783 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9784 ++tctx->delta_stats.num_objects_omap;
9785 }
9786
9787 results->fill_in_final_tx(tctx->op_t.get());
9788 if (results->started_temp_obj) {
9789 tctx->discard_temp_oid = results->temp_oid;
9790 }
9791 tctx->new_obs.oi.size = results->object_size;
9792 tctx->new_obs.oi.user_version = results->user_version;
9f95a23c
TL
9793 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
9794 tctx->mtime = utime_t();
28e407b8 9795 if (results->is_data_digest()) {
7c673cae 9796 tctx->new_obs.oi.set_data_digest(results->data_digest);
28e407b8
AA
9797 } else {
9798 tctx->new_obs.oi.clear_data_digest();
9799 }
9f95a23c
TL
9800 if (results->object_size)
9801 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
28e407b8 9802 if (results->is_omap_digest()) {
7c673cae 9803 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
28e407b8
AA
9804 } else {
9805 tctx->new_obs.oi.clear_omap_digest();
9806 }
9f95a23c
TL
9807 if (results->has_omap)
9808 tctx->clean_regions.mark_omap_dirty();
7c673cae
FG
9809 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
9810 tctx->new_obs.oi.truncate_size = results->truncate_size;
9811
9812 if (soid.snap != CEPH_NOSNAP) {
11fdf7f2
TL
9813 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
9814 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
9815 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
7c673cae 9816 results->object_size);
11fdf7f2 9817 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
7c673cae
FG
9818
9819 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
9820 } else {
9821 tctx->delta_stats.num_bytes += results->object_size;
9822 }
9823 }
9824
9825 if (results->mirror_snapset) {
11fdf7f2 9826 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
7c673cae
FG
9827 tctx->new_snapset.from_snap_set(
9828 results->snapset,
9f95a23c 9829 get_osdmap()->require_osd_release < ceph_release_t::luminous);
7c673cae 9830 }
7c673cae
FG
9831 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
9832
9833 // take RWWRITE lock for duration of our local write. ignore starvation.
9834 if (!tctx->lock_manager.take_write_lock(
9835 obc->obs.oi.soid,
9836 obc)) {
11fdf7f2 9837 ceph_abort_msg("problem!");
7c673cae
FG
9838 }
9839 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9840
9841 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9842
9843 simple_opc_submit(std::move(tctx));
9844
9845 osd->logger->inc(l_osd_tier_promote);
9846
9847 if (agent_state &&
9848 agent_state->is_idle())
9849 agent_choose_mode();
9850}
9851
11fdf7f2
TL
9852void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
9853 ObjectContextRef obc)
9854{
9855 const hobject_t& soid = obc->obs.oi.soid;
9856 dout(10) << __func__ << " " << soid << " r=" << r
9857 << " uv" << results->user_version << dendl;
9858
9859 if (r == -ECANCELED || r == -EAGAIN) {
9860 return;
9861 }
9862
9863 if (r < 0) {
9864 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9865 // pass error to everyone blocked on this object
9866 // FIXME: this is pretty sloppy, but at this point we got
9867 // something unexpected and don't have many other options.
9868 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9869 waiting_for_blocked_object.find(soid);
9870 if (blocked_iter != waiting_for_blocked_object.end()) {
9871 while (!blocked_iter->second.empty()) {
9872 osd->reply_op_error(blocked_iter->second.front(), r);
9873 blocked_iter->second.pop_front();
9874 }
9875 waiting_for_blocked_object.erase(blocked_iter);
9876 }
9877 return;
9878 }
9879
9880 osd->promote_finish(results->object_size);
9881 osd->logger->inc(l_osd_tier_promote);
9882
9883 if (agent_state &&
9884 agent_state->is_idle())
9885 agent_choose_mode();
9886}
9887
94b18763
FG
9888void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
9889 vector<ceph_tid_t> *tids)
7c673cae
FG
9890{
9891 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
9892 << " from " << cop->src << " " << cop->oloc
9893 << " v" << cop->results.user_version << dendl;
9894
9895 // cancel objecter op, if we can
9896 if (cop->objecter_tid) {
94b18763 9897 tids->push_back(cop->objecter_tid);
7c673cae
FG
9898 cop->objecter_tid = 0;
9899 if (cop->objecter_tid2) {
94b18763 9900 tids->push_back(cop->objecter_tid2);
7c673cae
FG
9901 cop->objecter_tid2 = 0;
9902 }
9903 }
9904
9905 copy_ops.erase(cop->obc->obs.oi.soid);
9906 cop->obc->stop_block();
9907
9908 kick_object_context_blocked(cop->obc);
9909 cop->results.should_requeue = requeue;
9910 CopyCallbackResults result(-ECANCELED, &cop->results);
9911 cop->cb->complete(result);
9912
9913 // There may still be an objecter callback referencing this copy op.
9914 // That callback will not need the obc since it's been canceled, and
9915 // we need the obc reference to go away prior to flush.
9916 cop->obc = ObjectContextRef();
9917}
9918
94b18763 9919void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
7c673cae
FG
9920{
9921 dout(10) << __func__ << dendl;
9922 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
9923 while (p != copy_ops.end()) {
9924 // requeue this op? can I queue up all of them?
94b18763 9925 cancel_copy((p++)->second, requeue, tids);
7c673cae
FG
9926 }
9927}
9928
9929
9930// ========================================================================
9931// flush
9932//
9933// Flush a dirty object in the cache tier by writing it back to the
9934// base tier. The sequence looks like:
9935//
9936// * send a copy-from operation to the base tier to copy the current
9937// version of the object
9938// * base tier will pull the object via (perhaps multiple) copy-get(s)
9939// * on completion, we check if the object has been modified. if so,
9940// just reply with -EAGAIN.
9941// * try to take a write lock so we can clear the dirty flag. if this
9942// fails, wait and retry
9943// * start a repop that clears the bit.
9944//
9945// If we have to wait, we will retry by coming back through the
9946// start_flush method. We check if a flush is already in progress
9947// and, if so, try to finish it by rechecking the version and trying
9948// to clear the dirty bit.
9949//
9950// In order for the cache-flush (a write op) to not block the copy-get
9951// from reading the object, the client *must* set the SKIPRWLOCKS
9952// flag.
9953//
9954// NOTE: normally writes are strictly ordered for the client, but
9955// flushes are special in that they can be reordered with respect to
9956// other writes. In particular, we can't have a flush request block
9957// an update to the cache pool object!
9958
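// A hedged sketch of the client side of this dance via the librados C++
// API (this is what the rados CLI's cache-try-flush is assumed to issue;
// OPERATION_SKIPRWLOCKS is the public spelling of the SKIPRWLOCKS
// requirement described above):
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   int r = ioctx.aio_operate("obj", completion, &op,
//                             librados::OPERATION_IGNORE_CACHE |
//                             librados::OPERATION_IGNORE_OVERLAY |
//                             librados::OPERATION_SKIPRWLOCKS, nullptr);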
9959struct C_Flush : public Context {
9960 PrimaryLogPGRef pg;
9961 hobject_t oid;
9962 epoch_t last_peering_reset;
9963 ceph_tid_t tid;
9964 utime_t start;
9965 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
9966 : pg(p), oid(o), last_peering_reset(lpr),
9967 tid(0), start(ceph_clock_now())
9968 {}
9969 void finish(int r) override {
9970 if (r == -ECANCELED)
9971 return;
9f95a23c 9972 std::scoped_lock locker{*pg};
7c673cae
FG
9973 if (last_peering_reset == pg->get_last_peering_reset()) {
9974 pg->finish_flush(oid, tid, r);
9975 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
9976 }
7c673cae
FG
9977 }
9978};
9979
9980int PrimaryLogPG::start_flush(
9981 OpRequestRef op, ObjectContextRef obc,
9982 bool blocking, hobject_t *pmissing,
9f95a23c 9983 std::optional<std::function<void()>> &&on_flush)
7c673cae
FG
9984{
9985 const object_info_t& oi = obc->obs.oi;
9986 const hobject_t& soid = oi.soid;
9987 dout(10) << __func__ << " " << soid
9988 << " v" << oi.version
9989 << " uv" << oi.user_version
9990 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
9991 << dendl;
9992
9f95a23c
TL
9993 bool preoctopus_compat =
9994 get_osdmap()->require_osd_release < ceph_release_t::octopus;
9995 SnapSet snapset;
9996 if (preoctopus_compat) {
9997 // for pre-octopus compatibility, filter SnapSet::snaps. not
9998 // certain we need this, but let's be conservative.
9999 snapset = obc->ssc->snapset.get_filtered(pool.info);
10000 } else {
10001 // NOTE: change this to a const ref when we remove this compat code
10002 snapset = obc->ssc->snapset;
10003 }
7c673cae
FG
10004
10005 // verify there are no older dirty clones
10006 {
10007 dout(20) << " snapset " << snapset << dendl;
10008 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
10009 while (p != snapset.clones.rend() && *p >= soid.snap)
10010 ++p;
10011 if (p != snapset.clones.rend()) {
10012 hobject_t next = soid;
10013 next.snap = *p;
11fdf7f2 10014 ceph_assert(next.snap < soid.snap);
9f95a23c 10015 if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
7c673cae
FG
10016 dout(10) << __func__ << " missing clone is " << next << dendl;
10017 if (pmissing)
10018 *pmissing = next;
10019 return -ENOENT;
10020 }
10021 ObjectContextRef older_obc = get_object_context(next, false);
10022 if (older_obc) {
10023 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
10024 << dendl;
10025 if (older_obc->obs.oi.is_dirty()) {
10026 dout(10) << __func__ << " next oldest clone is dirty: "
10027 << older_obc->obs.oi << dendl;
10028 return -EBUSY;
10029 }
10030 } else {
10031 dout(20) << __func__ << " next oldest clone " << next
10032 << " is not present; implicitly clean" << dendl;
10033 }
10034 } else {
10035 dout(20) << __func__ << " no older clones" << dendl;
10036 }
10037 }
10038
10039 if (blocking)
10040 obc->start_block();
10041
10042 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10043 if (p != flush_ops.end()) {
10044 FlushOpRef fop = p->second;
10045 if (fop->op == op) {
10046 // we couldn't take the write lock on a cache-try-flush before;
10047 // now we are trying again for the lock.
10048 return try_flush_mark_clean(fop);
10049 }
10050 if (fop->flushed_version == obc->obs.oi.user_version &&
10051 (fop->blocking || !blocking)) {
10052 // nonblocking can join anything
10053 // blocking can only join a blocking flush
10054 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10055 if (op)
10056 fop->dup_ops.push_back(op);
10057 return -EAGAIN; // clean up this ctx; op will retry later
10058 }
10059
10060 // cancel current flush since it will fail anyway, or because we
10061 // are blocking and the existing flush is nonblocking.
10062 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10063 if (fop->op)
10064 osd->reply_op_error(fop->op, -EBUSY);
10065 while (!fop->dup_ops.empty()) {
10066 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10067 fop->dup_ops.pop_front();
10068 }
94b18763
FG
10069 vector<ceph_tid_t> tids;
10070 cancel_flush(fop, false, &tids);
10071 osd->objecter->op_cancel(tids, -ECANCELED);
7c673cae
FG
10072 }
10073
11fdf7f2
TL
10074 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10075 int r = start_manifest_flush(op, obc, blocking, std::move(on_flush));
10076 if (r != -EINPROGRESS) {
10077 if (blocking)
10078 obc->stop_block();
10079 }
10080 return r;
10081 }
10082
7c673cae
FG
10083 /**
10084 * In general, we need to send a delete and a copyfrom.
10085 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10086 * where 4 is marked as clean. To flush 10, we have to:
10087 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10088 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10089 *
10090 * There is a complicating case. Supposed there had been a clone 7
10091 * for snaps [7, 6] which has been trimmed since they no longer exist.
10092 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10093 * the delete, the snap will be promoted to 5, and the head will become
11fdf7f2 10094 * a whiteout. When the copy-from goes through, we'll end up with
7c673cae
FG
10095 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10096 *
10097 * Another complication is the case where there is an interval change
10098 * after doing the delete and the flush but before marking the object
10099 * clean. We'll happily delete head and then recreate it at the same
10100 * sequence number, which works out ok.
10101 */
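// Tracing the comment's example through the code below: flushing clone 10
// of snapset 10:[10,9,8,4,3,2]:[10(10,9), 4(4,3,2)], the oldest snap
// included in clone 10 is 9, so snapc = get_ssc_as_of(8) = 8:[8,4,3,2];
// the next older clone is 4, so dsnapc = get_ssc_as_of(4) = 4:[4,3,2].
// Since dsnapc.seq < snapc.seq, the delete goes out with dsnapc, relying
// on objecter ordering, before the copy-from goes out with snapc.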

  SnapContext snapc, dsnapc;
  if (snapset.seq != 0) {
    if (soid.snap == CEPH_NOSNAP) {
      snapc = snapset.get_ssc_as_of(snapset.seq);
    } else {
      snapid_t min_included_snap;
      auto p = snapset.clone_snaps.find(soid.snap);
      ceph_assert(p != snapset.clone_snaps.end());
      min_included_snap = p->second.back();
      snapc = snapset.get_ssc_as_of(min_included_snap - 1);
    }

    snapid_t prev_snapc = 0;
    for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
         citer != snapset.clones.rend();
         ++citer) {
      if (*citer < soid.snap) {
        prev_snapc = *citer;
        break;
      }
    }

    dsnapc = snapset.get_ssc_as_of(prev_snapc);
  }

  object_locator_t base_oloc(soid);
  base_oloc.pool = pool.info.tier_of;

  if (dsnapc.seq < snapc.seq) {
    ObjectOperation o;
    o.remove();
    osd->objecter->mutate(
      soid.oid,
      base_oloc,
      o,
      dsnapc,
      ceph::real_clock::from_ceph_timespec(oi.mtime),
      (CEPH_OSD_FLAG_IGNORE_OVERLAY |
       CEPH_OSD_FLAG_ENFORCE_SNAPC),
      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
  }

  FlushOpRef fop(std::make_shared<FlushOp>());
  fop->obc = obc;
  fop->flushed_version = oi.user_version;
  fop->blocking = blocking;
  fop->on_flush = std::move(on_flush);
  fop->op = op;

  ObjectOperation o;
  if (oi.is_whiteout()) {
    fop->removal = true;
    o.remove();
  } else {
    object_locator_t oloc(soid);
    o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
                CEPH_OSD_COPY_FROM_FLAG_FLUSH |
                CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
                CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
                CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
                LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);

    // hint that the base tier need not cache this data after the flush
    if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
      o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
  }
  C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());

  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, base_oloc, o, snapc,
    ceph::real_clock::from_ceph_timespec(oi.mtime),
    CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
    new C_OnFinisher(fin,
                     osd->get_objecter_finisher(get_pg_shard())));
  /* we're under the pg lock and fin->finish() is grabbing that */
  fin->tid = tid;
  fop->objecter_tid = tid;

  flush_ops[soid] = fop;

  recovery_state.update_stats(
    [&oi](auto &history, auto &stats) {
      stats.stats.sum.num_flush++;
      stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
      return false;
    });
  return -EINPROGRESS;
}
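
// A worked trace of the snap-context math above, for the example in the
// comment (snapc 10:[10, 9, 8, 4, 3, 2], clones [10(10, 9), 4(4,3,2)]),
// flushing clone 10 (a sketch, not executed code):
//
//   clone_snaps[10].back() == 9, so snapc  = get_ssc_as_of(8) -> 8:[8,4,3,2]
//   first clone below 10 is 4,   so dsnapc = get_ssc_as_of(4) -> 4:[4,3,2]
//
// Since dsnapc.seq (4) < snapc.seq (8), we issue the delete with dsnapc and
// then the copy_from with snapc -- exactly steps 1) and 2) of the comment.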

void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
{
  dout(10) << __func__ << " " << oid << " tid " << tid
           << " " << cpp_strerror(r) << dendl;
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
  if (p == flush_ops.end()) {
    dout(10) << __func__ << " no flush_op found" << dendl;
    return;
  }
  FlushOpRef fop = p->second;
  if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
    dout(10) << __func__ << " tid " << tid << " != fop " << fop
             << " tid " << fop->objecter_tid << dendl;
    return;
  }
  ObjectContextRef obc = fop->obc;
  fop->objecter_tid = 0;

  if (r < 0 && !(r == -ENOENT && fop->removal)) {
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    if (fop->blocking) {
      obc->stop_block();
      kick_object_context_blocked(obc);
    }

    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    }
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    return;
  }

  r = try_flush_mark_clean(fop);
  if (r == -EBUSY && fop->op) {
    osd->reply_op_error(fop->op, r);
  }
}

int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
{
  ObjectContextRef obc = fop->obc;
  const hobject_t& oid = obc->obs.oi.soid;

  if (fop->blocking) {
    obc->stop_block();
    kick_object_context_blocked(obc);
  }

  if (fop->flushed_version != obc->obs.oi.user_version ||
      !obc->obs.exists) {
    if (obc->obs.exists)
      dout(10) << __func__ << " flushed_version " << fop->flushed_version
               << " != current " << obc->obs.oi.user_version
               << dendl;
    else
      dout(10) << __func__ << " object no longer exists" << dendl;

    if (!fop->dup_ops.empty()) {
      dout(20) << __func__ << " requeueing dups" << dendl;
      requeue_ops(fop->dup_ops);
    }
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    if (fop->blocking)
      osd->logger->inc(l_osd_tier_flush_fail);
    else
      osd->logger->inc(l_osd_tier_try_flush_fail);
    return -EBUSY;
  }

  if (!fop->blocking &&
      write_blocked_by_scrub(oid)) {
    if (fop->op) {
      dout(10) << __func__ << " blocked by scrub" << dendl;
      requeue_op(fop->op);
      requeue_ops(fop->dup_ops);
      return -EAGAIN;    // will retry
    } else {
      osd->logger->inc(l_osd_tier_try_flush_fail);
      vector<ceph_tid_t> tids;
      cancel_flush(fop, false, &tids);
      osd->objecter->op_cancel(tids, -ECANCELED);
      return -ECANCELED;
    }
  }

  // successfully flushed, can we evict this object?
  if (!obc->obs.oi.has_manifest() && !fop->op &&
      agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
      agent_maybe_evict(obc, true)) {
    osd->logger->inc(l_osd_tier_clean);
    if (fop->on_flush) {
      (*(fop->on_flush))();
      fop->on_flush = std::nullopt;
    }
    flush_ops.erase(oid);
    return 0;
  }

  dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
  OpContextUPtr ctx = simple_opc_create(fop->obc);

  // successfully flushed; can we clear the dirty bit?
  // try to take the lock manually, since we don't
  // have a ctx yet.
  if (ctx->lock_manager.get_lock_type(
        RWState::RWWRITE,
        oid,
        obc,
        fop->op)) {
    dout(20) << __func__ << " took write lock" << dendl;
  } else if (fop->op) {
    dout(10) << __func__ << " waiting on write lock " << fop->op << " "
             << fop->dup_ops << dendl;
    // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
    for (auto op : fop->dup_ops) {
      bool locked = ctx->lock_manager.get_lock_type(
        RWState::RWWRITE,
        oid,
        obc,
        op);
      ceph_assert(!locked);
    }
    close_op_ctx(ctx.release());
    return -EAGAIN;    // will retry
  } else {
    dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
    close_op_ctx(ctx.release());
    osd->logger->inc(l_osd_tier_try_flush_fail);
    vector<ceph_tid_t> tids;
    cancel_flush(fop, false, &tids);
    osd->objecter->op_cancel(tids, -ECANCELED);
    return -ECANCELED;
  }

  if (fop->on_flush) {
    ctx->register_on_finish(*(fop->on_flush));
    fop->on_flush = std::nullopt;
  }

  ctx->at_version = get_next_version();

  ctx->new_obs = obc->obs;
  ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
  --ctx->delta_stats.num_objects_dirty;
  if (fop->obc->obs.oi.has_manifest()) {
    ceph_assert(obc->obs.oi.manifest.is_chunked());
    PGTransaction* t = ctx->op_t.get();
    uint64_t chunks_size = 0;
    for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
      chunks_size += p.second.length;
    }
    if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
      t->omap_clear(oid);
      ctx->new_obs.oi.clear_omap_digest();
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
      ctx->clean_regions.mark_omap_dirty();
    }
    if (obc->obs.oi.size == chunks_size) {
      t->truncate(oid, 0);
      interval_set<uint64_t> trim;
      trim.insert(0, ctx->new_obs.oi.size);
      ctx->modified_ranges.union_of(trim);
      truncate_update_size_and_usage(ctx->delta_stats,
                                     ctx->new_obs.oi,
                                     0);
      ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
      ctx->new_obs.oi.new_object();
      for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
        p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
        p.second.set_flag(chunk_info_t::FLAG_MISSING);
      }
    } else {
      for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
        if (p.second.is_dirty()) {
          dout(20) << __func__ << " offset: " << p.second.offset
                   << " length: " << p.second.length << dendl;
          p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
          p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
        }
      }
    }
  }

  finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);

  osd->logger->inc(l_osd_tier_clean);

  if (!fop->dup_ops.empty() || fop->op) {
    dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
    list<OpRequestRef> ls;
    if (fop->op)
      ls.push_back(fop->op);
    ls.splice(ls.end(), fop->dup_ops);
    requeue_ops(ls);
  }

  simple_opc_submit(std::move(ctx));

  flush_ops.erase(oid);

  if (fop->blocking)
    osd->logger->inc(l_osd_tier_flush);
  else
    osd->logger->inc(l_osd_tier_try_flush);

  return -EINPROGRESS;
}
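
// Return-value summary for try_flush_mark_clean() (a sketch of how its
// callers react):
//   -EBUSY       flush is stale or the object changed; finish_flush()
//                replies -EBUSY to the client op if there is one
//   -EAGAIN      requeued behind scrub or the write lock; will retry
//   -ECANCELED   no client op to retry; the flush was torn down
//   -EINPROGRESS the CLEAN repop was submitted via simple_opc_submit()
//   0            object was evicted outright by the tiering agent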

void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
                                vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
           << fop->objecter_tid << dendl;
  if (fop->objecter_tid) {
    tids->push_back(fop->objecter_tid);
    fop->objecter_tid = 0;
  }
  if (fop->io_tids.size()) {
    for (auto &p : fop->io_tids) {
      tids->push_back(p.second);
      p.second = 0;
    }
  }
  if (fop->blocking && fop->obc->is_blocked()) {
    fop->obc->stop_block();
    kick_object_context_blocked(fop->obc);
  }
  if (requeue) {
    if (fop->op)
      requeue_op(fop->op);
    requeue_ops(fop->dup_ops);
  }
  if (fop->on_flush) {
    (*(fop->on_flush))();
    fop->on_flush = std::nullopt;
  }
  flush_ops.erase(fop->obc->obs.oi.soid);
}

void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
{
  dout(10) << __func__ << dendl;
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
  while (p != flush_ops.end()) {
    cancel_flush((p++)->second, requeue, tids);
  }
}
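
// Typical caller pattern (a sketch mirroring the call sites above): the
// objecter tids are only collected here; the caller then cancels them on
// the Objecter in one batch:
//
//   vector<ceph_tid_t> tids;
//   cancel_flush_ops(false, &tids);
//   osd->objecter->op_cancel(tids, -ECANCELED);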

bool PrimaryLogPG::is_present_clone(hobject_t coid)
{
  if (!pool.info.allow_incomplete_clones())
    return true;
  if (is_missing_object(coid))
    return true;
  ObjectContextRef obc = get_object_context(coid, false);
  return obc && obc->obs.exists;
}

// ========================================================================
// rep op gather

class C_OSD_RepopCommit : public Context {
  PrimaryLogPGRef pg;
  boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
public:
  C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
    : pg(pg), repop(repop) {}
  void finish(int) override {
    pg->repop_all_committed(repop.get());
  }
};

void PrimaryLogPG::repop_all_committed(RepGather *repop)
{
  dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
           << dendl;
  repop->all_committed = true;
  if (!repop->rep_aborted) {
    if (repop->v != eversion_t()) {
      recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
    }
    eval_repop(repop);
  }
}
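
// Repop lifecycle (a sketch): issue_repop() hands the transaction to the
// backend with C_OSD_RepopCommit as the all-commit callback; once every
// shard has committed, repop_all_committed() marks the repop and calls
// eval_repop(), which replies to any waiters and pops finished repops off
// the front of repop_queue, in order, via remove_repop().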

void PrimaryLogPG::op_applied(const eversion_t &applied_version)
{
  dout(10) << "op_applied version " << applied_version << dendl;
  ceph_assert(applied_version != eversion_t());
  ceph_assert(applied_version <= info.last_update);
  recovery_state.local_write_applied(applied_version);
  if (is_primary()) {
    if (scrubber.active) {
      if (recovery_state.get_last_update_applied() >=
          scrubber.subset_last_update) {
        requeue_scrub(ops_blocked_by_scrub());
      }
    } else {
      ceph_assert(scrubber.start == scrubber.end);
    }
  }
}

void PrimaryLogPG::eval_repop(RepGather *repop)
{
  dout(10) << "eval_repop " << *repop
           << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;

  // ondisk?
  if (repop->all_committed) {
    dout(10) << " commit: " << *repop << dendl;
    for (auto p = repop->on_committed.begin();
         p != repop->on_committed.end();
         repop->on_committed.erase(p++)) {
      (*p)();
    }
    // send dup commits, in order
    auto it = waiting_for_ondisk.find(repop->v);
    if (it != waiting_for_ondisk.end()) {
      ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
      for (auto& i : it->second) {
        int return_code = repop->r;
        if (return_code >= 0) {
          return_code = std::get<2>(i);
        }
        osd->reply_op_error(std::get<0>(i), return_code, repop->v,
                            std::get<1>(i), std::get<3>(i));
      }
      waiting_for_ondisk.erase(it);
    }

    publish_stats_to_osd();

    dout(10) << " removing " << *repop << dendl;
    ceph_assert(!repop_queue.empty());
    dout(20) << " q front is " << *repop_queue.front() << dendl;
    if (repop_queue.front() == repop) {
      RepGather *to_remove = nullptr;
      while (!repop_queue.empty() &&
             (to_remove = repop_queue.front())->all_committed) {
        repop_queue.pop_front();
        for (auto p = to_remove->on_success.begin();
             p != to_remove->on_success.end();
             to_remove->on_success.erase(p++)) {
          (*p)();
        }
        remove_repop(to_remove);
      }
    }
  }
}

void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
{
  FUNCTRACE(cct);
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(7) << "issue_repop rep_tid " << repop->rep_tid
          << " o " << soid
          << dendl;

  repop->v = ctx->at_version;

  ctx->op_t->add_obc(ctx->obc);
  if (ctx->clone_obc) {
    ctx->op_t->add_obc(ctx->clone_obc);
  }
  if (ctx->head_obc) {
    ctx->op_t->add_obc(ctx->head_obc);
  }

  Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
  if (!(ctx->log.empty())) {
    ceph_assert(ctx->at_version >= projected_last_update);
    projected_last_update = ctx->at_version;
  }
  for (auto &&entry: ctx->log) {
    projected_log.add(entry);
  }

  recovery_state.pre_submit_op(
    soid,
    ctx->log,
    ctx->at_version);
  pgbackend->submit_transaction(
    soid,
    ctx->delta_stats,
    ctx->at_version,
    std::move(ctx->op_t),
    recovery_state.get_pg_trim_to(),
    recovery_state.get_min_last_complete_ondisk(),
    ctx->log,
    ctx->updated_hset_history,
    on_all_commit,
    repop->rep_tid,
    ctx->reqid,
    ctx->op);
}

PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
  OpContext *ctx, ObjectContextRef obc,
  ceph_tid_t rep_tid)
{
  if (ctx->op)
    dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
  else
    dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;

  RepGather *repop = new RepGather(
    ctx, rep_tid, info.last_complete);

  repop->start = ceph_clock_now();

  repop_queue.push_back(&repop->queue_item);
  repop->get();

  osd->logger->inc(l_osd_op_wip);

  dout(10) << __func__ << ": " << *repop << dendl;
  return repop;
}

boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
  eversion_t version,
  int r,
  ObcLockManager &&manager,
  OpRequestRef &&op,
  std::optional<std::function<void(void)> > &&on_complete)
{
  RepGather *repop = new RepGather(
    std::move(manager),
    std::move(op),
    std::move(on_complete),
    osd->get_tid(),
    info.last_complete,
    r);
  repop->v = version;

  repop->start = ceph_clock_now();

  repop_queue.push_back(&repop->queue_item);

  osd->logger->inc(l_osd_op_wip);

  dout(10) << __func__ << ": " << *repop << dendl;
  return boost::intrusive_ptr<RepGather>(repop);
}

void PrimaryLogPG::remove_repop(RepGather *repop)
{
  dout(20) << __func__ << " " << *repop << dendl;

  for (auto p = repop->on_finish.begin();
       p != repop->on_finish.end();
       repop->on_finish.erase(p++)) {
    (*p)();
  }

  release_object_locks(
    repop->lock_manager);
  repop->put();

  osd->logger->dec(l_osd_op_wip);
}

PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
{
  dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
  ceph_tid_t rep_tid = osd->get_tid();
  osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
  OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
  ctx->op_t.reset(new PGTransaction());
  ctx->mtime = ceph_clock_now();
  return ctx;
}

void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
{
  RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
  dout(20) << __func__ << " " << repop << dendl;
  issue_repop(repop, ctx.get());
  eval_repop(repop);
  recovery_state.update_trim_to();
  repop->put();
}
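
// Typical use of the simple_opc_* helpers for internally generated
// mutations (a sketch; compare try_flush_mark_clean() above and
// handle_watch_timeout() below):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   ctx->new_obs = obc->obs;
//   ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);  // the change
//   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
//   simple_opc_submit(std::move(ctx));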

void PrimaryLogPG::submit_log_entries(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObcLockManager &&manager,
  std::optional<std::function<void(void)> > &&_on_complete,
  OpRequestRef op,
  int r)
{
  dout(10) << __func__ << " " << entries << dendl;
  ceph_assert(is_primary());

  eversion_t version;
  if (!entries.empty()) {
    ceph_assert(entries.rbegin()->version >= projected_last_update);
    version = projected_last_update = entries.rbegin()->version;
  }

  boost::intrusive_ptr<RepGather> repop;
  std::optional<std::function<void(void)> > on_complete;
  if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
    repop = new_repop(
      version,
      r,
      std::move(manager),
      std::move(op),
      std::move(_on_complete));
  } else {
    on_complete = std::move(_on_complete);
  }

  pgbackend->call_write_ordered(
    [this, entries, repop, on_complete]() {
      ObjectStore::Transaction t;
      eversion_t old_last_update = info.last_update;
      recovery_state.merge_new_log_entries(
        entries, t, recovery_state.get_pg_trim_to(),
        recovery_state.get_min_last_complete_ondisk());

      set<pg_shard_t> waiting_on;
      for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
           i != get_acting_recovery_backfill().end();
           ++i) {
        pg_shard_t peer(*i);
        if (peer == pg_whoami) continue;
        ceph_assert(recovery_state.get_peer_missing().count(peer));
        ceph_assert(recovery_state.has_peer_info(peer));
        if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
          ceph_assert(repop);
          MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
            entries,
            spg_t(info.pgid.pgid, i->shard),
            pg_whoami.shard,
            get_osdmap_epoch(),
            get_last_peering_reset(),
            repop->rep_tid,
            recovery_state.get_pg_trim_to(),
            recovery_state.get_min_last_complete_ondisk());
          osd->send_message_osd_cluster(
            peer.osd, m, get_osdmap_epoch());
          waiting_on.insert(peer);
        } else {
          MOSDPGLog *m = new MOSDPGLog(
            peer.shard, pg_whoami.shard,
            info.last_update.epoch,
            info, get_last_peering_reset());
          m->log.log = entries;
          m->log.tail = old_last_update;
          m->log.head = info.last_update;
          osd->send_message_osd_cluster(
            peer.osd, m, get_osdmap_epoch());
        }
      }
      ceph_tid_t rep_tid = repop->rep_tid;
      waiting_on.insert(pg_whoami);
      log_entry_update_waiting_on.insert(
        make_pair(
          rep_tid,
          LogUpdateCtx{std::move(repop), std::move(waiting_on)}
          ));
      struct OnComplete : public Context {
        PrimaryLogPGRef pg;
        ceph_tid_t rep_tid;
        epoch_t epoch;
        OnComplete(
          PrimaryLogPGRef pg,
          ceph_tid_t rep_tid,
          epoch_t epoch)
          : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
        void finish(int) override {
          std::scoped_lock l{*pg};
          if (!pg->pg_has_reset_since(epoch)) {
            auto it = pg->log_entry_update_waiting_on.find(rep_tid);
            ceph_assert(it != pg->log_entry_update_waiting_on.end());
            auto it2 = it->second.waiting_on.find(pg->pg_whoami);
            ceph_assert(it2 != it->second.waiting_on.end());
            it->second.waiting_on.erase(it2);
            if (it->second.waiting_on.empty()) {
              pg->repop_all_committed(it->second.repop.get());
              pg->log_entry_update_waiting_on.erase(it);
            }
          }
        }
      };
      t.register_on_commit(
        new OnComplete{this, rep_tid, get_osdmap_epoch()});
      int r = osd->store->queue_transaction(ch, std::move(t), NULL);
      ceph_assert(r == 0);
      op_applied(info.last_update);
    });

  recovery_state.update_trim_to();
}

void PrimaryLogPG::cancel_log_updates()
{
  // get rid of all the LogUpdateCtx so their references to repops are
  // dropped
  log_entry_update_waiting_on.clear();
}

// -------------------------------------------------------

void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
{
  std::scoped_lock l{*this};
  pair<hobject_t, ObjectContextRef> i;
  while (object_contexts.get_next(i.first, &i)) {
    ObjectContextRef obc(i.second);
    get_obc_watchers(obc, *ls);
  }
}

void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
{
  for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
         obc->watchers.begin();
       j != obc->watchers.end();
       ++j) {
    obj_watch_item_t owi;

    owi.obj = obc->obs.oi.soid;
    owi.wi.addr = j->second->get_peer_addr();
    owi.wi.name = j->second->get_entity();
    owi.wi.cookie = j->second->get_cookie();
    owi.wi.timeout_seconds = j->second->get_timeout();

    dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
             << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;

    pg_watchers.push_back(owi);
  }
}

void PrimaryLogPG::check_blacklisted_watchers()
{
  dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
  pair<hobject_t, ObjectContextRef> i;
  while (object_contexts.get_next(i.first, &i))
    check_blacklisted_obc_watchers(i.second);
}

void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
{
  dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
  for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
         obc->watchers.begin();
       k != obc->watchers.end();
       ) {
    // advance the iterator now so handle_watch_timeout() can erase the element
    map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
    dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
    entity_addr_t ea = j->second->get_peer_addr();
    dout(30) << "watch: Check entity_addr_t " << ea << dendl;
    if (get_osdmap()->is_blacklisted(ea)) {
      dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
      ceph_assert(j->second->get_pg() == this);
      j->second->unregister_cb();
      handle_watch_timeout(j->second);
    }
  }
}

void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
{
  ceph_assert(is_primary() && is_active());
  auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
  ceph_assert((recovering.count(obc->obs.oi.soid) ||
               !is_missing_object(obc->obs.oi.soid)) ||
              (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
               it_objects->second->op ==
                 pg_log_entry_t::LOST_REVERT &&
               it_objects->second->reverting_to ==
                 obc->obs.oi.version));

  dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
  ceph_assert(obc->watchers.empty());
  // populate unconnected_watchers
  for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
         obc->obs.oi.watchers.begin();
       p != obc->obs.oi.watchers.end();
       ++p) {
    utime_t expire = info.stats.last_became_active;
    expire += p->second.timeout_seconds;
    dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
    WatchRef watch(
      Watch::makeWatchRef(
        this, osd, obc, p->second.timeout_seconds, p->first.first,
        p->first.second, p->second.addr));
    watch->disconnect();
    obc->watchers.insert(
      make_pair(
        make_pair(p->first.first, p->first.second),
        watch));
  }
  // look for watchers from blacklisted clients and drop them
  check_blacklisted_obc_watchers(obc);
}

void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
{
  ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
  dout(10) << "handle_watch_timeout obc " << obc << dendl;

  if (!is_active()) {
    dout(10) << "handle_watch_timeout not active, no-op" << dendl;
    return;
  }
  if (!obc->obs.exists) {
    dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
    return;
  }
  if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
    callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
      watch->get_delayed_cb()
      );
    dout(10) << "handle_watch_timeout waiting for degraded on obj "
             << obc->obs.oi.soid
             << dendl;
    return;
  }

  if (write_blocked_by_scrub(obc->obs.oi.soid)) {
    dout(10) << "handle_watch_timeout waiting for scrub on obj "
             << obc->obs.oi.soid
             << dendl;
    scrubber.add_callback(
      watch->get_delayed_cb() // This callback!
      );
    return;
  }

  OpContextUPtr ctx = simple_opc_create(obc);
  ctx->at_version = get_next_version();

  object_info_t& oi = ctx->new_obs.oi;
  oi.watchers.erase(make_pair(watch->get_cookie(),
                              watch->get_entity()));

  list<watch_disconnect_t> watch_disconnects = {
    watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
  };
  ctx->register_on_success(
    [this, obc, watch_disconnects]() {
      complete_disconnect_watches(obc, watch_disconnects);
    });

  PGTransaction *t = ctx->op_t.get();
  ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
                                    ctx->at_version,
                                    oi.version,
                                    0,
                                    osd_reqid_t(), ctx->mtime, 0));

  oi.prior_version = obc->obs.oi.version;
  oi.version = ctx->at_version;
  bufferlist bl;
  encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
  t->setattr(obc->obs.oi.soid, OI_ATTR, bl);

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  // no ctx->delta_stats
  simple_opc_submit(std::move(ctx));
}

ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
                                                     SnapSetContext *ssc)
{
  ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
  ceph_assert(obc->destructor_callback == NULL);
  obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
  obc->obs.oi = oi;
  obc->obs.exists = false;
  obc->ssc = ssc;
  if (ssc)
    register_snapset_context(ssc);
  dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
  if (is_active())
    populate_obc_watchers(obc);
  return obc;
}

ObjectContextRef PrimaryLogPG::get_object_context(
  const hobject_t& soid,
  bool can_create,
  const map<string, bufferlist> *attrs)
{
  auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
  ceph_assert(
    attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
    // or this is a revert... see recover_primary()
    (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
     it_objects->second->op ==
       pg_log_entry_t::LOST_REVERT));
  ObjectContextRef obc = object_contexts.lookup(soid);
  osd->logger->inc(l_osd_object_ctx_cache_total);
  if (obc) {
    osd->logger->inc(l_osd_object_ctx_cache_hit);
    dout(10) << __func__ << ": found obc in cache: " << obc
             << dendl;
  } else {
    dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
    // check disk
    bufferlist bv;
    if (attrs) {
      auto it_oi = attrs->find(OI_ATTR);
      ceph_assert(it_oi != attrs->end());
      bv = it_oi->second;
    } else {
      int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
      if (r < 0) {
        if (!can_create) {
          dout(10) << __func__ << ": no obc for soid "
                   << soid << " and !can_create"
                   << dendl;
          return ObjectContextRef();   // -ENOENT!
        }

        dout(10) << __func__ << ": no obc for soid "
                 << soid << " but can_create"
                 << dendl;
        // new object.
        object_info_t oi(soid);
        SnapSetContext *ssc = get_snapset_context(
          soid, true, 0, false);
        ceph_assert(ssc);
        obc = create_object_context(oi, ssc);
        dout(10) << __func__ << ": " << obc << " " << soid
                 << " " << obc->rwstate
                 << " oi: " << obc->obs.oi
                 << " ssc: " << obc->ssc
                 << " snapset: " << obc->ssc->snapset << dendl;
        return obc;
      }
    }

    object_info_t oi;
    try {
      bufferlist::const_iterator bliter = bv.begin();
      decode(oi, bliter);
    } catch (...) {
      dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
      return ObjectContextRef();   // -ENOENT!
    }

    ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());

    obc = object_contexts.lookup_or_create(oi.soid);
    obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
    obc->obs.oi = oi;
    obc->obs.exists = true;

    obc->ssc = get_snapset_context(
      soid, true,
      soid.has_snapset() ? attrs : 0);

    if (is_primary() && is_active())
      populate_obc_watchers(obc);

    if (pool.info.is_erasure()) {
      if (attrs) {
        obc->attr_cache = *attrs;
      } else {
        int r = pgbackend->objects_get_attrs(
          soid,
          &obc->attr_cache);
        ceph_assert(r == 0);
      }
    }

    dout(10) << __func__ << ": creating obc from disk: " << obc
             << dendl;
  }

  // XXX: callers don't expect a null obc->ssc, so bail out rather than
  // return a context without one
  if (obc->ssc == NULL) {
    derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
    return ObjectContextRef();   // -ENOENT!
  }

  dout(10) << __func__ << ": " << obc << " " << soid
           << " " << obc->rwstate
           << " oi: " << obc->obs.oi
           << " exists: " << (int)obc->obs.exists
           << " ssc: " << obc->ssc
           << " snapset: " << obc->ssc->snapset << dendl;
  return obc;
}
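
// A null return means -ENOENT to callers (a sketch of the usual pattern,
// as in is_present_clone() and find_object_context()):
//
//   ObjectContextRef obc = get_object_context(soid, false);
//   if (!obc || !obc->obs.exists) {
//     // the object (or its object_info/snapset) is not present locally
//   }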

void PrimaryLogPG::context_registry_on_change()
{
  pair<hobject_t, ObjectContextRef> i;
  while (object_contexts.get_next(i.first, &i)) {
    ObjectContextRef obc(i.second);
    if (obc) {
      for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
             obc->watchers.begin();
           j != obc->watchers.end();
           obc->watchers.erase(j++)) {
        j->second->discard();
      }
    }
  }
}

/*
 * If we return an error, and set *pmissing, then promoting that
 * object may help.
 *
 * If we return -EAGAIN, we will always set *pmissing to the missing
 * object to wait for.
 *
 * If we return an error but do not set *pmissing, then we know the
 * object does not exist.
 */
int PrimaryLogPG::find_object_context(const hobject_t& oid,
                                      ObjectContextRef *pobc,
                                      bool can_create,
                                      bool map_snapid_to_clone,
                                      hobject_t *pmissing)
{
  FUNCTRACE(cct);
  ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
  // want the head?
  if (oid.snap == CEPH_NOSNAP) {
    ObjectContextRef obc = get_object_context(oid, can_create);
    if (!obc) {
      if (pmissing)
        *pmissing = oid;
      return -ENOENT;
    }
    dout(10) << __func__ << " " << oid
             << " @" << oid.snap
             << " oi=" << obc->obs.oi
             << dendl;
    *pobc = obc;

    return 0;
  }

  // we want a snap

  hobject_t head = oid.get_head();
  SnapSetContext *ssc = get_snapset_context(oid, can_create);
  if (!ssc || !(ssc->exists || can_create)) {
    dout(20) << __func__ << " " << oid << " no snapset" << dendl;
    if (pmissing)
      *pmissing = head;  // start by getting the head
    if (ssc)
      put_snapset_context(ssc);
    return -ENOENT;
  }

  if (map_snapid_to_clone) {
    dout(10) << __func__ << " " << oid << " @" << oid.snap
             << " snapset " << ssc->snapset
             << " map_snapid_to_clone=true" << dendl;
    if (oid.snap > ssc->snapset.seq) {
      // already must be readable
      ObjectContextRef obc = get_object_context(head, false);
      dout(10) << __func__ << " " << oid << " @" << oid.snap
               << " snapset " << ssc->snapset
               << " maps to head" << dendl;
      *pobc = obc;
      put_snapset_context(ssc);
      return (obc && obc->obs.exists) ? 0 : -ENOENT;
    } else {
      vector<snapid_t>::const_iterator citer = std::find(
        ssc->snapset.clones.begin(),
        ssc->snapset.clones.end(),
        oid.snap);
      if (citer == ssc->snapset.clones.end()) {
        dout(10) << __func__ << " " << oid << " @" << oid.snap
                 << " snapset " << ssc->snapset
                 << " maps to nothing" << dendl;
        put_snapset_context(ssc);
        return -ENOENT;
      }

      dout(10) << __func__ << " " << oid << " @" << oid.snap
               << " snapset " << ssc->snapset
               << " maps to " << oid << dendl;

      if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
        dout(10) << __func__ << " " << oid << " @" << oid.snap
                 << " snapset " << ssc->snapset
                 << " " << oid << " is missing" << dendl;
        if (pmissing)
          *pmissing = oid;
        put_snapset_context(ssc);
        return -EAGAIN;
      }

      ObjectContextRef obc = get_object_context(oid, false);
      if (!obc || !obc->obs.exists) {
        dout(10) << __func__ << " " << oid << " @" << oid.snap
                 << " snapset " << ssc->snapset
                 << " " << oid << " is not present" << dendl;
        if (pmissing)
          *pmissing = oid;
        put_snapset_context(ssc);
        return -ENOENT;
      }
      dout(10) << __func__ << " " << oid << " @" << oid.snap
               << " snapset " << ssc->snapset
               << " " << oid << " HIT" << dendl;
      *pobc = obc;
      put_snapset_context(ssc);
      return 0;
    }
    ceph_abort(); // unreachable
  }

  dout(10) << __func__ << " " << oid << " @" << oid.snap
           << " snapset " << ssc->snapset << dendl;

  // head?
  if (oid.snap > ssc->snapset.seq) {
    ObjectContextRef obc = get_object_context(head, false);
    dout(10) << __func__ << " " << head
             << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
             << " -- HIT " << obc->obs
             << dendl;
    if (!obc->ssc)
      obc->ssc = ssc;
    else {
      ceph_assert(ssc == obc->ssc);
      put_snapset_context(ssc);
    }
    *pobc = obc;
    return 0;
  }

  // which clone would it be?
  unsigned k = 0;
  while (k < ssc->snapset.clones.size() &&
         ssc->snapset.clones[k] < oid.snap)
    k++;
  if (k == ssc->snapset.clones.size()) {
    dout(10) << __func__ << " no clones with last >= oid.snap "
             << oid.snap << " -- DNE" << dendl;
    put_snapset_context(ssc);
    return -ENOENT;
  }
  hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
                 info.pgid.pool(), oid.get_namespace());

  if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
    dout(20) << __func__ << " " << soid << " missing, try again later"
             << dendl;
    if (pmissing)
      *pmissing = soid;
    put_snapset_context(ssc);
    return -EAGAIN;
  }

  ObjectContextRef obc = get_object_context(soid, false);
  if (!obc || !obc->obs.exists) {
    if (pmissing)
      *pmissing = soid;
    put_snapset_context(ssc);
    if (is_primary()) {
      if (is_degraded_or_backfilling_object(soid)) {
        dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
        return -EAGAIN;
      } else if (is_degraded_on_async_recovery_target(soid)) {
        dout(20) << __func__ << " clone is recovering " << soid << dendl;
        return -EAGAIN;
      } else {
        dout(20) << __func__ << " missing clone " << soid << dendl;
        return -ENOENT;
      }
    } else {
      dout(20) << __func__ << " replica missing clone " << soid << dendl;
      return -ENOENT;
    }
  }

  if (!obc->ssc) {
    obc->ssc = ssc;
  } else {
    ceph_assert(obc->ssc == ssc);
    put_snapset_context(ssc);
  }
  ssc = 0;

  // clone
  dout(20) << __func__ << " " << soid
           << " snapset " << obc->ssc->snapset
           << dendl;
  snapid_t first, last;
  auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
  ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
  if (p->second.empty()) {
    dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
    ceph_assert(!cct->_conf->osd_debug_verify_snaps);
    return -ENOENT;
  }
  if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
      p->second.end()) {
    dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
             << " does not contain " << oid.snap << " -- DNE" << dendl;
    return -ENOENT;
  }
  if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
    dout(20) << __func__ << " " << soid << " snap " << oid.snap
             << " in removed_snaps_queue" << " -- DNE" << dendl;
    return -ENOENT;
  }
  dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
           << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
  *pobc = obc;
  return 0;
}
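
// Caller pattern (a sketch of how the return codes documented above are
// consumed):
//
//   ObjectContextRef obc;
//   hobject_t missing;
//   int r = find_object_context(oid, &obc, can_create, false, &missing);
//   if (r == -EAGAIN) {
//     // wait for 'missing' to be recovered, then retry the op
//   } else if (r < 0 && missing != hobject_t()) {
//     // promoting 'missing' (e.g. from the base tier) may help
//   } else if (r < 0) {
//     // the object definitely does not exist
//   }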

void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
{
  if (obc->ssc)
    put_snapset_context(obc->ssc);
}

void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
{
  object_info_t& oi = obc->obs.oi;

  dout(10) << __func__ << " " << oi.soid << dendl;
  ceph_assert(!oi.soid.is_snapdir());

  object_stat_sum_t stat;
  stat.num_objects++;
  if (oi.is_dirty())
    stat.num_objects_dirty++;
  if (oi.is_whiteout())
    stat.num_whiteouts++;
  if (oi.is_omap())
    stat.num_objects_omap++;
  if (oi.is_cache_pinned())
    stat.num_objects_pinned++;
  if (oi.has_manifest())
    stat.num_objects_manifest++;

  if (oi.soid.is_snap()) {
    stat.num_object_clones++;

    if (!obc->ssc)
      obc->ssc = get_snapset_context(oi.soid, false);
    ceph_assert(obc->ssc);
    stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
  } else {
    stat.num_bytes += oi.size;
  }

  // add it in
  pgstat->stats.sum.add(stat);
}

void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
{
  const hobject_t& soid = obc->obs.oi.soid;
  if (obc->is_blocked()) {
    dout(10) << __func__ << " " << soid << " still blocked" << dendl;
    return;
  }

  map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
  if (p != waiting_for_blocked_object.end()) {
    list<OpRequestRef>& ls = p->second;
    dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
    requeue_ops(ls);
    waiting_for_blocked_object.erase(p);
  }

  map<hobject_t, ObjectContextRef>::iterator i =
    objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
  if (i != objects_blocked_on_snap_promotion.end()) {
    ceph_assert(i->second == obc);
    objects_blocked_on_snap_promotion.erase(i);
  }

  if (obc->requeue_scrub_on_unblock) {
    obc->requeue_scrub_on_unblock = false;
    // only requeue if we are still active: we may be unblocking
    // because we are resetting for a new peering interval
    if (is_active()) {
      requeue_scrub();
    }
  }
}

SnapSetContext *PrimaryLogPG::get_snapset_context(
  const hobject_t& oid,
  bool can_create,
  const map<string, bufferlist> *attrs,
  bool oid_existed)
{
  std::lock_guard l(snapset_contexts_lock);
  SnapSetContext *ssc;
  map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
    oid.get_snapdir());
  if (p != snapset_contexts.end()) {
    if (can_create || p->second->exists) {
      ssc = p->second;
    } else {
      return NULL;
    }
  } else {
    bufferlist bv;
    if (!attrs) {
      int r = -ENOENT;
      if (!(oid.is_head() && !oid_existed)) {
        r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
      }
      if (r < 0 && !can_create)
        return NULL;
    } else {
      auto it_ss = attrs->find(SS_ATTR);
      ceph_assert(it_ss != attrs->end());
      bv = it_ss->second;
    }
    ssc = new SnapSetContext(oid.get_snapdir());
    _register_snapset_context(ssc);
    if (bv.length()) {
      bufferlist::const_iterator bvp = bv.begin();
      try {
        ssc->snapset.decode(bvp);
      } catch (buffer::error& e) {
        dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
        return NULL;
      }
      ssc->exists = true;
    } else {
      ssc->exists = false;
    }
  }
  ceph_assert(ssc);
  ssc->ref++;
  return ssc;
}

void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
{
  std::lock_guard l(snapset_contexts_lock);
  --ssc->ref;
  if (ssc->ref == 0) {
    if (ssc->registered)
      snapset_contexts.erase(ssc->oid);
    delete ssc;
  }
}
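
// SnapSetContext is manually refcounted: each get_snapset_context() must
// be balanced by a put_snapset_context(), unless the ref is handed off to
// an ObjectContext (obc->ssc), whose destructor callback puts it (see
// object_context_destructor_callback() above).  A sketch:
//
//   SnapSetContext *ssc = get_snapset_context(oid, false);
//   if (ssc) {
//     // ... inspect ssc->snapset ...
//     put_snapset_context(ssc);
//   }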

/*
 * Return values:
 *  NONE - didn't pull anything
 *  YES  - pulled what the caller wanted
 *  HEAD - needed to pull head first
 */
enum { PULL_NONE, PULL_HEAD, PULL_YES };

int PrimaryLogPG::recover_missing(
  const hobject_t &soid, eversion_t v,
  int priority,
  PGBackend::RecoveryHandle *h)
{
  if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << __func__ << " " << soid
            << " v " << v
            << " but it is unfound" << dendl;
    return PULL_NONE;
  }

  if (recovery_state.get_missing_loc().is_deleted(soid)) {
    start_recovery_op(soid);
    ceph_assert(!recovering.count(soid));
    recovering.insert(make_pair(soid, ObjectContextRef()));
    epoch_t cur_epoch = get_osdmap_epoch();
    remove_missing_object(soid, v, new LambdaContext(
      [=](int) {
        std::scoped_lock locker{*this};
        if (!pg_has_reset_since(cur_epoch)) {
          bool object_missing = false;
          for (const auto& shard : get_acting_recovery_backfill()) {
            if (shard == pg_whoami)
              continue;
            if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
              dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
              object_missing = true;
              break;
            }
          }
          if (!object_missing) {
            object_stat_sum_t stat_diff;
            stat_diff.num_objects_recovered = 1;
            if (scrub_after_recovery)
              stat_diff.num_objects_repaired = 1;
            on_global_recover(soid, stat_diff, true);
          } else {
            auto recovery_handle = pgbackend->open_recovery_op();
            pgbackend->recover_delete_object(soid, v, recovery_handle);
            pgbackend->run_recovery_op(recovery_handle, priority);
          }
        }
      }));
    return PULL_YES;
  }

  // is this a snapped object?  if so, consult the snapset.. we may not need the entire object!
  ObjectContextRef obc;
  ObjectContextRef head_obc;
  if (soid.snap && soid.snap < CEPH_NOSNAP) {
    // do we have the head?
    hobject_t head = soid.get_head();
    if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
      if (recovering.count(head)) {
        dout(10) << " missing but already recovering head " << head << dendl;
        return PULL_NONE;
      } else {
        int r = recover_missing(
          head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
          h);
        if (r != PULL_NONE)
          return PULL_HEAD;
        return PULL_NONE;
      }
    }
    head_obc = get_object_context(
      head,
      false,
      0);
    ceph_assert(head_obc);
  }
  start_recovery_op(soid);
  ceph_assert(!recovering.count(soid));
  recovering.insert(make_pair(soid, obc));
  int r = pgbackend->recover_object(
    soid,
    v,
    head_obc,
    obc,
    h);
  // This is only a pull which shouldn't return an error
  ceph_assert(r >= 0);
  return PULL_YES;
}
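
// Caller sketch (cf. the recovery loops elsewhere in this file): PULL_HEAD
// means the head had to be pulled before the snapped object, which will be
// retried on a later pass:
//
//   int r = recover_missing(soid, need, priority, h);
//   if (r != PULL_NONE)
//     ++started;   // a pull (for soid or its head) was queued on h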

void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
                                         eversion_t v, Context *on_complete)
{
  dout(20) << __func__ << " " << soid << " " << v << dendl;
  ceph_assert(on_complete != nullptr);
  // delete locally
  ObjectStore::Transaction t;
  remove_snap_mapped_object(t, soid);

  ObjectRecoveryInfo recovery_info;
  recovery_info.soid = soid;
  recovery_info.version = v;

  epoch_t cur_epoch = get_osdmap_epoch();
  t.register_on_complete(new LambdaContext(
    [=](int) {
      std::unique_lock locker{*this};
      if (!pg_has_reset_since(cur_epoch)) {
        ObjectStore::Transaction t2;
        on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
        t2.register_on_complete(on_complete);
        int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
        ceph_assert(r == 0);
        locker.unlock();
      } else {
        locker.unlock();
        on_complete->complete(-EAGAIN);
      }
    }));
  int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
  ceph_assert(r == 0);
}
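
// Flow (a sketch): t (the local delete) completes -> on_local_recover()
// fills t2, which carries on_complete -> t2 completes -> on_complete fires.
// If the PG was reset in between, on_complete is invoked with -EAGAIN
// instead and no recovery bookkeeping is done.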

void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
{
  dout(10) << __func__ << " " << oid << dendl;
  if (callbacks_for_degraded_object.count(oid)) {
    list<Context*> contexts;
    contexts.swap(callbacks_for_degraded_object[oid]);
    callbacks_for_degraded_object.erase(oid);
    for (list<Context*>::iterator i = contexts.begin();
         i != contexts.end();
         ++i) {
      (*i)->complete(0);
    }
  }
  map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
    oid.get_head());
  if (i != objects_blocked_on_degraded_snap.end() &&
      i->second == oid.snap)
    objects_blocked_on_degraded_snap.erase(i);
}

void PrimaryLogPG::_committed_pushed_object(
  epoch_t epoch, eversion_t last_complete)
{
  std::scoped_lock locker{*this};
  if (!pg_has_reset_since(epoch)) {
    recovery_state.recovery_committed_to(last_complete);
  } else {
    dout(10) << __func__
             << " pg has changed, not touching last_complete_ondisk" << dendl;
  }
}

void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
{
  dout(20) << __func__ << dendl;
  if (obc) {
    dout(20) << "obc = " << *obc << dendl;
  }
  ceph_assert(active_pushes >= 1);
  --active_pushes;

  // requeue an active chunky scrub waiting on recovery ops
  if (!recovery_state.is_deleting() && active_pushes == 0
      && scrubber.is_chunky_scrub_active()) {
    requeue_scrub(ops_blocked_by_scrub());
  }
}

void PrimaryLogPG::_applied_recovered_object_replica()
{
  dout(20) << __func__ << dendl;
  ceph_assert(active_pushes >= 1);
  --active_pushes;

  // requeue an active chunky scrub waiting on recovery ops
  if (!recovery_state.is_deleting() && active_pushes == 0 &&
      scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
        scrubber.active_rep_scrub->get_req())->chunky) {
    auto& op = scrubber.active_rep_scrub;
    osd->enqueue_back(
      OpSchedulerItem(
        unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
        op->get_req()->get_cost(),
        op->get_req()->get_priority(),
        op->get_req()->get_recv_stamp(),
        op->get_req()->get_source().num(),
        get_osdmap_epoch()));
    scrubber.active_rep_scrub.reset();
  }
}

void PrimaryLogPG::on_failed_pull(
  const set<pg_shard_t> &from,
  const hobject_t &soid,
  const eversion_t &v)
{
  dout(20) << __func__ << ": " << soid << dendl;
  ceph_assert(recovering.count(soid));
  auto obc = recovering[soid];
  if (obc) {
    list<OpRequestRef> blocked_ops;
    obc->drop_recovery_read(&blocked_ops);
    requeue_ops(blocked_ops);
  }
  recovering.erase(soid);
  for (auto&& i : from) {
    if (i != pg_whoami) { // we'll get it below in primary_error
      recovery_state.force_object_missing(i, soid, v);
    }
  }

  dout(0) << __func__ << " " << soid << " from shard " << from
          << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
          << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
          << dendl;
  finish_recovery_op(soid);  // close out this attempt,
  finish_degraded_object(soid);

  if (from.count(pg_whoami)) {
    dout(0) << " primary missing oid " << soid << " version " << v << dendl;
    primary_error(soid, v);
    backfills_in_flight.erase(soid);
  }
}

eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
{
  eversion_t v;
  pg_missing_item pmi;
  bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
  ceph_assert(is_missing);
  v = pmi.have;
  dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;

  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
       i != get_acting_recovery_backfill().end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
      continue;
    }
    eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
    dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
    if (h > v)
      v = h;
  }

  dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
  return v;
}
11719
11720void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
11721{
11722 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
11723 op->get_req());
11fdf7f2 11724 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
7c673cae 11725 ObjectStore::Transaction t;
9f95a23c 11726 std::optional<eversion_t> op_trim_to, op_roll_forward_to;
94b18763
FG
11727 if (m->pg_trim_to != eversion_t())
11728 op_trim_to = m->pg_trim_to;
11729 if (m->pg_roll_forward_to != eversion_t())
11730 op_roll_forward_to = m->pg_roll_forward_to;
11731
9f95a23c
TL
11732 dout(20) << __func__
11733 << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
94b18763 11734
9f95a23c
TL
11735 recovery_state.append_log_entries_update_missing(
11736 m->entries, t, op_trim_to, op_roll_forward_to);
94b18763 11737 eversion_t new_lcod = info.last_complete;
7c673cae 11738
9f95a23c 11739 Context *complete = new LambdaContext(
7c673cae
FG
11740 [=](int) {
11741 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
11742 op->get_req());
9f95a23c 11743 std::scoped_lock locker{*this};
7c673cae 11744 if (!pg_has_reset_since(msg->get_epoch())) {
94b18763 11745 update_last_complete_ondisk(new_lcod);
7c673cae
FG
11746 MOSDPGUpdateLogMissingReply *reply =
11747 new MOSDPGUpdateLogMissingReply(
11748 spg_t(info.pgid.pgid, primary_shard().shard),
11749 pg_whoami.shard,
11750 msg->get_epoch(),
11751 msg->min_epoch,
94b18763
FG
11752 msg->get_tid(),
11753 new_lcod);
7c673cae
FG
11754 reply->set_priority(CEPH_MSG_PRIO_HIGH);
11755 msg->get_connection()->send_message(reply);
11756 }
7c673cae
FG
11757 });
11758
9f95a23c 11759 if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
7c673cae
FG
11760 t.register_on_commit(complete);
11761 } else {
11762 /* Hack to work around the fact that ReplicatedBackend sends
11763 * ack+commit if commit happens first
11764 *
11765 * This behavior is no longer necessary, but we preserve it so old
11766 * primaries can keep their repops in order */
11fdf7f2 11767 if (pool.info.is_erasure()) {
7c673cae
FG
11768 t.register_on_complete(complete);
11769 } else {
11770 t.register_on_commit(complete);
11771 }
11772 }
7c673cae 11773 int tr = osd->store->queue_transaction(
11fdf7f2 11774 ch,
7c673cae
FG
11775 std::move(t),
11776 nullptr);
11fdf7f2
TL
11777 ceph_assert(tr == 0);
11778 op_applied(info.last_update);
7c673cae
FG
11779}
11780
11781void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
11782{
11783 const MOSDPGUpdateLogMissingReply *m =
11784 static_cast<const MOSDPGUpdateLogMissingReply*>(
11785 op->get_req());
11786 dout(20) << __func__ << " got reply from "
11787 << m->get_from() << dendl;
11788
11789 auto it = log_entry_update_waiting_on.find(m->get_tid());
11790 if (it != log_entry_update_waiting_on.end()) {
11791 if (it->second.waiting_on.count(m->get_from())) {
11792 it->second.waiting_on.erase(m->get_from());
94b18763
FG
11793 if (m->last_complete_ondisk != eversion_t()) {
11794 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
11795 }
7c673cae
FG
11796 } else {
11797 osd->clog->error()
11798 << info.pgid << " got reply "
11799 << *m << " from shard we are not waiting for "
11800 << m->get_from();
11801 }
11802
11803 if (it->second.waiting_on.empty()) {
11804 repop_all_committed(it->second.repop.get());
11805 log_entry_update_waiting_on.erase(it);
11806 }
11807 } else {
11808 osd->clog->error()
11809 << info.pgid << " got reply "
11810 << *m << " on unknown tid " << m->get_tid();
11811 }
11812}
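/*
 * Handshake implemented by the two handlers above (message flow, schematic):
 *
 *   primary                                   replica/backfill shard
 *   -------------------------------------------------------------------
 *   submit_log_entries()
 *     -> MOSDPGUpdateLogMissing      ----->   do_update_log_missing():
 *                                               append entries, queue txn,
 *                                               reply on commit
 *   do_update_log_missing_reply():   <-----   MOSDPGUpdateLogMissingReply
 *     erase sender from waiting_on;
 *     when waiting_on is empty -> repop_all_committed()
 */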
11813
11814/* Mark all unfound objects as lost.
11815 */
11816void PrimaryLogPG::mark_all_unfound_lost(
11817 int what,
9f95a23c 11818 std::function<void(int,const std::string&,bufferlist&)> on_finish)
7c673cae
FG
11819{
11820 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
224ce89b 11821 list<hobject_t> oids;
7c673cae
FG
11822
11823 dout(30) << __func__ << ": log before:\n";
9f95a23c 11824 recovery_state.get_pg_log().get_log().print(*_dout);
7c673cae
FG
11825 *_dout << dendl;
11826
31f18b77 11827 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
7c673cae
FG
11828
11829 utime_t mtime = ceph_clock_now();
11830 map<hobject_t, pg_missing_item>::const_iterator m =
9f95a23c 11831 recovery_state.get_missing_loc().get_needs_recovery().begin();
7c673cae 11832 map<hobject_t, pg_missing_item>::const_iterator mend =
9f95a23c 11833 recovery_state.get_missing_loc().get_needs_recovery().end();
7c673cae
FG
11834
11835 ObcLockManager manager;
11836 eversion_t v = get_next_version();
11fdf7f2 11837 v.epoch = get_osdmap_epoch();
9f95a23c 11838 uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
7c673cae
FG
11839 while (m != mend) {
11840 const hobject_t &oid(m->first);
9f95a23c 11841 if (!recovery_state.get_missing_loc().is_unfound(oid)) {
7c673cae
FG
11842 // We only care about unfound objects
11843 ++m;
11844 continue;
11845 }
11846
11847 ObjectContextRef obc;
11848 eversion_t prev;
11849
11850 switch (what) {
11851 case pg_log_entry_t::LOST_MARK:
11fdf7f2 11852 ceph_abort_msg("actually, not implemented yet!");
7c673cae
FG
11853 break;
11854
11855 case pg_log_entry_t::LOST_REVERT:
11856 prev = pick_newest_available(oid);
11857 if (prev > eversion_t()) {
11858 // log it
11859 pg_log_entry_t e(
11860 pg_log_entry_t::LOST_REVERT, oid, v,
11861 m->second.need, 0, osd_reqid_t(), mtime, 0);
11862 e.reverting_to = prev;
11863 e.mark_unrollbackable();
11864 log_entries.push_back(e);
11865 dout(10) << e << dendl;
11866
11867 // we are now missing the new version; recovery code will sort it out.
11868 ++v.version;
11869 ++m;
11870 break;
11871 }
11872
11873 case pg_log_entry_t::LOST_DELETE:
11874 {
11875 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
11876 0, osd_reqid_t(), mtime, 0);
9f95a23c 11877 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
7c673cae
FG
11878 if (pool.info.require_rollback()) {
11879 e.mod_desc.try_rmobject(v.version);
11880 } else {
11881 e.mark_unrollbackable();
11882 }
11883 } // otherwise, just do what we used to do
11884 dout(10) << e << dendl;
11885 log_entries.push_back(e);
224ce89b 11886 oids.push_back(oid);
7c673cae 11887
b32b8144
FG
11888 // If a context is found, mark the object as deleted in case we
11889 // are racing with a new creation. This can happen if the object
11890 // was lost and the primary hit EIO.
11891 obc = object_contexts.lookup(oid);
11892 if (obc)
11893 obc->obs.exists = false;
11894
7c673cae
FG
11895 ++v.version;
11896 ++m;
11897 }
11898 break;
11899
11900 default:
11901 ceph_abort();
11902 }
11903 }
11904
9f95a23c
TL
11905 recovery_state.update_stats(
11906 [](auto &history, auto &stats) {
11907 stats.stats_invalid = true;
11908 return false;
11909 });
7c673cae
FG
11910
11911 submit_log_entries(
11912 log_entries,
11913 std::move(manager),
9f95a23c
TL
11914 std::optional<std::function<void(void)> >(
11915 [this, oids, num_unfound, on_finish]() {
11916 if (recovery_state.perform_deletes_during_peering()) {
c07f9fc5
FG
11917 for (auto oid : oids) {
11918 // clear old locations - merge_new_log_entries will have
11919 // handled rebuilding missing_loc for each of these
11920 // objects if we have the RECOVERY_DELETES flag
9f95a23c 11921 recovery_state.object_recovered(oid, object_stat_sum_t());
c07f9fc5
FG
11922 }
11923 }
11924
b32b8144
FG
11925 if (is_recovery_unfound()) {
11926 queue_peering_event(
11fdf7f2
TL
11927 PGPeeringEventRef(
11928 std::make_shared<PGPeeringEvent>(
11929 get_osdmap_epoch(),
11930 get_osdmap_epoch(),
9f95a23c 11931 PeeringState::DoRecovery())));
b32b8144
FG
11932 } else if (is_backfill_unfound()) {
11933 queue_peering_event(
11fdf7f2
TL
11934 PGPeeringEventRef(
11935 std::make_shared<PGPeeringEvent>(
11936 get_osdmap_epoch(),
11937 get_osdmap_epoch(),
9f95a23c 11938 PeeringState::RequestBackfill())));
b32b8144
FG
11939 } else {
11940 queue_recovery();
7c673cae 11941 }
7c673cae
FG
11942
11943 stringstream ss;
11944 ss << "pg has " << num_unfound
11945 << " objects unfound and apparently lost; marking";
11946 string rs = ss.str();
11947 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
11948 osd->clog->info() << rs;
9f95a23c
TL
11949 bufferlist empty;
11950 on_finish(0, rs, empty);
7c673cae
FG
11951 }),
11952 OpRequestRef());
11953}
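/*
 * For context, this path is driven by the operator command
 *
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *
 * where "revert" selects LOST_REVERT (roll back to pick_newest_available())
 * and "delete" selects LOST_DELETE (log a deletion for each unfound object).
 * LOST_MARK remains unimplemented, as the ceph_abort_msg() above notes.
 */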
11954
11955void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
11956{
11fdf7f2 11957 ceph_assert(repop_queue.empty());
7c673cae
FG
11958}
11959
11960/*
11961 * pg status change notification
11962 */
11963
11964void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11965{
11966 list<OpRequestRef> rq;
11967
11968 // apply all repops
11969 while (!repop_queue.empty()) {
11970 RepGather *repop = repop_queue.front();
11971 repop_queue.pop_front();
11972 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11973 repop->rep_aborted = true;
7c673cae
FG
11974 repop->on_committed.clear();
11975 repop->on_success.clear();
11976
11977 if (requeue) {
11978 if (repop->op) {
11979 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11980 rq.push_back(repop->op);
11981 repop->op = OpRequestRef();
11982 }
11983
11984 // also requeue any dups, interleaved into position
11fdf7f2 11985 auto p = waiting_for_ondisk.find(repop->v);
7c673cae
FG
11986 if (p != waiting_for_ondisk.end()) {
11987 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11fdf7f2
TL
11988 for (auto& i : p->second) {
11989 rq.push_back(std::get<0>(i));
7c673cae
FG
11990 }
11991 waiting_for_ondisk.erase(p);
11992 }
11993 }
11994
11995 remove_repop(repop);
11996 }
11997
11fdf7f2 11998 ceph_assert(repop_queue.empty());
7c673cae
FG
11999
12000 if (requeue) {
12001 requeue_ops(rq);
12002 if (!waiting_for_ondisk.empty()) {
11fdf7f2
TL
12003 for (auto& i : waiting_for_ondisk) {
12004 for (auto& j : i.second) {
12005 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12006 << " waiting on " << i.first << dendl;
12007 }
7c673cae 12008 }
11fdf7f2 12009 ceph_assert(waiting_for_ondisk.empty());
7c673cae
FG
12010 }
12011 }
12012
12013 waiting_for_ondisk.clear();
12014}
12015
12016void PrimaryLogPG::on_flushed()
12017{
9f95a23c 12018 requeue_ops(waiting_for_flush);
7c673cae
FG
12019 if (!is_peered() || !is_primary()) {
12020 pair<hobject_t, ObjectContextRef> i;
12021 while (object_contexts.get_next(i.first, &i)) {
11fdf7f2 12022 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
7c673cae 12023 }
11fdf7f2 12024 ceph_assert(object_contexts.empty());
7c673cae 12025 }
7c673cae
FG
12026}
12027
9f95a23c 12028void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
7c673cae 12029{
11fdf7f2 12030 dout(10) << __func__ << dendl;
7c673cae 12031
11fdf7f2 12032 on_shutdown();
9f95a23c
TL
12033
12034 t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
7c673cae
FG
12035}
12036
c07f9fc5
FG
12037void PrimaryLogPG::clear_async_reads()
12038{
12039 dout(10) << __func__ << dendl;
12040 for(auto& i : in_progress_async_reads) {
12041 dout(10) << "clear ctx: "
12042 << "OpRequestRef " << i.first
12043 << " OpContext " << i.second
12044 << dendl;
12045 close_op_ctx(i.second);
12046 }
12047}
12048
11fdf7f2 12049void PrimaryLogPG::clear_cache()
7c673cae 12050{
11fdf7f2
TL
12051 object_contexts.clear();
12052}
7c673cae 12053
11fdf7f2
TL
12054void PrimaryLogPG::on_shutdown()
12055{
12056 dout(10) << __func__ << dendl;
7c673cae 12057
224ce89b
WB
12058 if (recovery_queued) {
12059 recovery_queued = false;
12060 osd->clear_queued_recovery(this);
12061 }
12062
7c673cae
FG
12063 clear_scrub_reserved();
12064 scrub_clear_state();
12065
12066 unreg_next_scrub();
94b18763
FG
12067
12068 vector<ceph_tid_t> tids;
12069 cancel_copy_ops(false, &tids);
12070 cancel_flush_ops(false, &tids);
12071 cancel_proxy_ops(false, &tids);
9f95a23c 12072 cancel_manifest_ops(false, &tids);
94b18763
FG
12073 osd->objecter->op_cancel(tids, -ECANCELED);
12074
7c673cae
FG
12075 apply_and_flush_repops(false);
12076 cancel_log_updates();
31f18b77
FG
12077 // we must remove PGRefs, so do this prior to release_backoffs() callers
12078 clear_backoffs();
12079 // clean up snap trim references
12080 snap_trimmer_machine.process_event(Reset());
7c673cae
FG
12081
12082 pgbackend->on_change();
12083
12084 context_registry_on_change();
12085 object_contexts.clear();
12086
c07f9fc5
FG
12087 clear_async_reads();
12088
7c673cae
FG
12089 osd->remote_reserver.cancel_reservation(info.pgid);
12090 osd->local_reserver.cancel_reservation(info.pgid);
12091
12092 clear_primary_state();
12093 cancel_recovery();
11fdf7f2
TL
12094
12095 if (is_primary()) {
12096 osd->clear_ready_to_merge(this);
12097 }
7c673cae
FG
12098}
12099
9f95a23c 12100void PrimaryLogPG::on_activate_complete()
7c673cae 12101{
9f95a23c
TL
12102 check_local();
12103 // waiters
12104 if (!recovery_state.needs_flush()) {
12105 requeue_ops(waiting_for_peered);
12106 } else if (!waiting_for_peered.empty()) {
12107 dout(10) << __func__ << " flushes in progress, moving "
12108 << waiting_for_peered.size()
12109 << " items to waiting_for_flush"
12110 << dendl;
12111 ceph_assert(waiting_for_flush.empty());
12112 waiting_for_flush.swap(waiting_for_peered);
12113 }
12114
12115
7c673cae
FG
12116 // all clean?
12117 if (needs_recovery()) {
12118 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12119 queue_peering_event(
11fdf7f2
TL
12120 PGPeeringEventRef(
12121 std::make_shared<PGPeeringEvent>(
12122 get_osdmap_epoch(),
12123 get_osdmap_epoch(),
9f95a23c 12124 PeeringState::DoRecovery())));
7c673cae
FG
12125 } else if (needs_backfill()) {
12126 dout(10) << "activate queueing backfill" << dendl;
12127 queue_peering_event(
11fdf7f2
TL
12128 PGPeeringEventRef(
12129 std::make_shared<PGPeeringEvent>(
12130 get_osdmap_epoch(),
12131 get_osdmap_epoch(),
9f95a23c 12132 PeeringState::RequestBackfill())));
7c673cae
FG
12133 } else {
12134 dout(10) << "activate all replicas clean, no recovery" << dendl;
224ce89b 12135 eio_errors_to_process = false;
7c673cae 12136 queue_peering_event(
11fdf7f2
TL
12137 PGPeeringEventRef(
12138 std::make_shared<PGPeeringEvent>(
12139 get_osdmap_epoch(),
12140 get_osdmap_epoch(),
9f95a23c 12141 PeeringState::AllReplicasRecovered())));
7c673cae
FG
12142 }
12143
12144 publish_stats_to_osd();
12145
9f95a23c 12146 if (get_backfill_targets().size()) {
7c673cae
FG
12147 last_backfill_started = earliest_backfill();
12148 new_backfill = true;
11fdf7f2 12149 ceph_assert(!last_backfill_started.is_max());
9f95a23c 12150 dout(5) << __func__ << ": bft=" << get_backfill_targets()
7c673cae 12151 << " from " << last_backfill_started << dendl;
9f95a23c
TL
12152 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12153 i != get_backfill_targets().end();
7c673cae
FG
12154 ++i) {
12155 dout(5) << "target shard " << *i
9f95a23c 12156 << " from " << recovery_state.get_peer_info(*i).last_backfill
7c673cae
FG
12157 << dendl;
12158 }
12159 }
12160
12161 hit_set_setup();
12162 agent_setup();
12163}
12164
9f95a23c 12165void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
7c673cae 12166{
11fdf7f2 12167 dout(10) << __func__ << dendl;
7c673cae
FG
12168
12169 if (hit_set && hit_set->insert_count() == 0) {
12170 dout(20) << " discarding empty hit_set" << dendl;
12171 hit_set_clear();
12172 }
12173
12174 if (recovery_queued) {
12175 recovery_queued = false;
12176 osd->clear_queued_recovery(this);
12177 }
12178
12179 // requeue everything in the reverse order they should be
12180 // reexamined.
12181 requeue_ops(waiting_for_peered);
b32b8144 12182 requeue_ops(waiting_for_flush);
7c673cae 12183 requeue_ops(waiting_for_active);
9f95a23c 12184 requeue_ops(waiting_for_readable);
7c673cae
FG
12185
12186 clear_scrub_reserved();
12187
94b18763
FG
12188 vector<ceph_tid_t> tids;
12189 cancel_copy_ops(is_primary(), &tids);
12190 cancel_flush_ops(is_primary(), &tids);
12191 cancel_proxy_ops(is_primary(), &tids);
9f95a23c 12192 cancel_manifest_ops(is_primary(), &tids);
94b18763 12193 osd->objecter->op_cancel(tids, -ECANCELED);
7c673cae
FG
12194
12195 // requeue object waiters
12196 for (auto& p : waiting_for_unreadable_object) {
12197 release_backoffs(p.first);
12198 }
12199 if (is_primary()) {
12200 requeue_object_waiters(waiting_for_unreadable_object);
12201 } else {
12202 waiting_for_unreadable_object.clear();
12203 }
12204 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
12205 p != waiting_for_degraded_object.end();
12206 waiting_for_degraded_object.erase(p++)) {
12207 release_backoffs(p->first);
12208 if (is_primary())
12209 requeue_ops(p->second);
12210 else
12211 p->second.clear();
12212 finish_degraded_object(p->first);
12213 }
12214
12215 // requeues waiting_for_scrub
12216 scrub_clear_state();
12217
12218 for (auto p = waiting_for_blocked_object.begin();
12219 p != waiting_for_blocked_object.end();
12220 waiting_for_blocked_object.erase(p++)) {
12221 if (is_primary())
12222 requeue_ops(p->second);
12223 else
12224 p->second.clear();
12225 }
12226 for (auto i = callbacks_for_degraded_object.begin();
12227 i != callbacks_for_degraded_object.end();
12228 ) {
12229 finish_degraded_object((i++)->first);
12230 }
11fdf7f2 12231 ceph_assert(callbacks_for_degraded_object.empty());
7c673cae
FG
12232
12233 if (is_primary()) {
12234 requeue_ops(waiting_for_cache_not_full);
7c673cae
FG
12235 } else {
12236 waiting_for_cache_not_full.clear();
7c673cae
FG
12237 }
12238 objects_blocked_on_cache_full.clear();
12239
12240 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
12241 in_progress_async_reads.begin();
12242 i != in_progress_async_reads.end();
12243 in_progress_async_reads.erase(i++)) {
12244 close_op_ctx(i->second);
12245 if (is_primary())
12246 requeue_op(i->first);
12247 }
12248
12249 // this will requeue ops we were working on but didn't finish, and
12250 // any dups
12251 apply_and_flush_repops(is_primary());
12252 cancel_log_updates();
12253
12254 // do this *after* apply_and_flush_repops so that we catch any newly
12255 // registered watches.
12256 context_registry_on_change();
12257
9f95a23c
TL
12258 pgbackend->on_change_cleanup(&t);
12259 scrubber.cleanup_store(&t);
7c673cae
FG
12260 pgbackend->on_change();
12261
12262 // clear snap_trimmer state
12263 snap_trimmer_machine.process_event(Reset());
12264
12265 debug_op_order.clear();
12266 unstable_stats.clear();
12267
12268 // we don't want to cache object_contexts through the interval change
12269 // NOTE: we actually assert that all currently live references are dead
12270 // by the time the flush for the next interval completes.
12271 object_contexts.clear();
12272
12273 // should have been cleared above by finishing all of the degraded objects
11fdf7f2 12274 ceph_assert(objects_blocked_on_degraded_snap.empty());
7c673cae
FG
12275}
12276
9f95a23c 12277void PrimaryLogPG::plpg_on_role_change()
7c673cae 12278{
11fdf7f2 12279 dout(10) << __func__ << dendl;
7c673cae
FG
12280 if (get_role() != 0 && hit_set) {
12281 dout(10) << " clearing hit set" << dendl;
12282 hit_set_clear();
12283 }
12284}
12285
9f95a23c 12286void PrimaryLogPG::plpg_on_pool_change()
7c673cae
FG
12287{
12288 dout(10) << __func__ << dendl;
12289 // requeue cache full waiters just in case the cache_mode is
12290 // changing away from writeback mode. note that if we are not
12291 // active the normal requeuing machinery is sufficient (and properly
12292 // ordered).
12293 if (is_active() &&
12294 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12295 !waiting_for_cache_not_full.empty()) {
12296 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12297 << dendl;
12298 requeue_ops(waiting_for_cache_not_full);
12299 objects_blocked_on_cache_full.clear();
12300 }
12301 hit_set_setup();
12302 agent_setup();
12303}
12304
12305// clear state. called on recovery completion AND cancellation.
12306void PrimaryLogPG::_clear_recovery_state()
12307{
7c673cae
FG
12308#ifdef DEBUG_RECOVERY_OIDS
12309 recovering_oids.clear();
12310#endif
12311 last_backfill_started = hobject_t();
12312 set<hobject_t>::iterator i = backfills_in_flight.begin();
12313 while (i != backfills_in_flight.end()) {
11fdf7f2 12314 ceph_assert(recovering.count(*i));
7c673cae
FG
12315 backfills_in_flight.erase(i++);
12316 }
12317
12318 list<OpRequestRef> blocked_ops;
12319 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
12320 i != recovering.end();
12321 recovering.erase(i++)) {
12322 if (i->second) {
12323 i->second->drop_recovery_read(&blocked_ops);
12324 requeue_ops(blocked_ops);
12325 }
12326 }
11fdf7f2 12327 ceph_assert(backfills_in_flight.empty());
7c673cae 12328 pending_backfill_updates.clear();
11fdf7f2 12329 ceph_assert(recovering.empty());
7c673cae
FG
12330 pgbackend->clear_recovery_state();
12331}
12332
12333void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12334{
12335 dout(20) << __func__ << ": " << soid << dendl;
11fdf7f2 12336 ceph_assert(recovering.count(soid));
7c673cae
FG
12337 ObjectContextRef obc = recovering[soid];
12338 if (obc) {
12339 list<OpRequestRef> blocked_ops;
12340 obc->drop_recovery_read(&blocked_ops);
12341 requeue_ops(blocked_ops);
12342 }
12343 recovering.erase(soid);
12344 finish_recovery_op(soid);
12345 release_backoffs(soid);
12346 if (waiting_for_degraded_object.count(soid)) {
12347 dout(20) << " kicking degraded waiters on " << soid << dendl;
12348 requeue_ops(waiting_for_degraded_object[soid]);
12349 waiting_for_degraded_object.erase(soid);
12350 }
12351 if (waiting_for_unreadable_object.count(soid)) {
12352 dout(20) << " kicking unreadable waiters on " << soid << dendl;
12353 requeue_ops(waiting_for_unreadable_object[soid]);
12354 waiting_for_unreadable_object.erase(soid);
12355 }
12356 if (is_missing_object(soid))
9f95a23c 12357 recovery_state.set_last_requested(0);
7c673cae
FG
12358 finish_degraded_object(soid);
12359}
12360
12361void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
12362{
7c673cae 12363 pgbackend->check_recovery_sources(osdmap);
7c673cae
FG
12364}
12365
7c673cae
FG
12366bool PrimaryLogPG::start_recovery_ops(
12367 uint64_t max,
12368 ThreadPool::TPHandle &handle,
12369 uint64_t *ops_started)
12370{
12371 uint64_t& started = *ops_started;
12372 started = 0;
12373 bool work_in_progress = false;
11fdf7f2
TL
12374 bool recovery_started = false;
12375 ceph_assert(is_primary());
12376 ceph_assert(is_peered());
9f95a23c 12377 ceph_assert(!recovery_state.is_deleting());
11fdf7f2
TL
12378
12379 ceph_assert(recovery_queued);
12380 recovery_queued = false;
7c673cae
FG
12381
12382 if (!state_test(PG_STATE_RECOVERING) &&
3efd9988 12383 !state_test(PG_STATE_BACKFILLING)) {
7c673cae
FG
12384 /* TODO: I think this case is broken and will make do_recovery()
12385 * unhappy since we're returning false */
12386 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11fdf7f2 12387 return have_unfound();
7c673cae
FG
12388 }
12389
9f95a23c 12390 const auto &missing = recovery_state.get_pg_log().get_missing();
7c673cae 12391
7c673cae
FG
12392 uint64_t num_unfound = get_num_unfound();
12393
9f95a23c
TL
12394 if (!recovery_state.have_missing()) {
12395 recovery_state.local_recovery_complete();
7c673cae
FG
12396 }
12397
81eedcae 12398 if (!missing.have_missing() || // Primary does not have missing
9f95a23c
TL
12399 // or all of the missing objects are unfound.
12400 recovery_state.all_missing_unfound()) {
7c673cae 12401 // Recover the replicas.
11fdf7f2 12402 started = recover_replicas(max, handle, &recovery_started);
7c673cae
FG
12403 }
12404 if (!started) {
12405 // We still have missing objects that we should grab from replicas.
12406 started += recover_primary(max, handle);
12407 }
12408 if (!started && num_unfound != get_num_unfound()) {
12409 // second chance to recover replicas
11fdf7f2 12410 started = recover_replicas(max, handle, &recovery_started);
7c673cae
FG
12411 }
12412
11fdf7f2 12413 if (started || recovery_started)
7c673cae
FG
12414 work_in_progress = true;
12415
12416 bool deferred_backfill = false;
12417 if (recovering.empty() &&
3efd9988 12418 state_test(PG_STATE_BACKFILLING) &&
9f95a23c 12419 !get_backfill_targets().empty() && started < max &&
7c673cae
FG
12420 missing.num_missing() == 0 &&
12421 waiting_on_backfill.empty()) {
12422 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12423 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12424 deferred_backfill = true;
12425 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12426 !is_degraded()) {
12427 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12428 deferred_backfill = true;
9f95a23c 12429 } else if (!recovery_state.is_backfill_reserved()) {
7c673cae
FG
12430 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12431 if (!backfill_reserving) {
12432 dout(10) << "queueing RequestBackfill" << dendl;
12433 backfill_reserving = true;
12434 queue_peering_event(
11fdf7f2
TL
12435 PGPeeringEventRef(
12436 std::make_shared<PGPeeringEvent>(
12437 get_osdmap_epoch(),
12438 get_osdmap_epoch(),
9f95a23c 12439 PeeringState::RequestBackfill())));
7c673cae
FG
12440 }
12441 deferred_backfill = true;
12442 } else {
12443 started += recover_backfill(max - started, handle, &work_in_progress);
12444 }
12445 }
12446
12447 dout(10) << " started " << started << dendl;
12448 osd->logger->inc(l_osd_rop, started);
12449
12450 if (!recovering.empty() ||
12451 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11fdf7f2 12452 return !work_in_progress && have_unfound();
7c673cae 12453
11fdf7f2
TL
12454 ceph_assert(recovering.empty());
12455 ceph_assert(recovery_ops_active == 0);
7c673cae
FG
12456
12457 dout(10) << __func__ << " needs_recovery: "
9f95a23c 12458 << recovery_state.get_missing_loc().get_needs_recovery()
7c673cae
FG
12459 << dendl;
12460 dout(10) << __func__ << " missing_loc: "
9f95a23c 12461 << recovery_state.get_missing_loc().get_missing_locs()
7c673cae
FG
12462 << dendl;
12463 int unfound = get_num_unfound();
12464 if (unfound) {
12465 dout(10) << " still have " << unfound << " unfound" << dendl;
11fdf7f2 12466 return true;
7c673cae
FG
12467 }
12468
12469 if (missing.num_missing() > 0) {
12470 // this shouldn't happen!
c07f9fc5
FG
12471 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
12472 << missing.num_missing() << ": " << missing.get_items();
11fdf7f2 12473 return false;
7c673cae
FG
12474 }
12475
12476 if (needs_recovery()) {
12477 // this shouldn't happen!
12478 // We already checked num_missing() so we must have missing replicas
c07f9fc5
FG
12479 osd->clog->error() << info.pgid
12480 << " Unexpected Error: recovery ending with missing replicas";
11fdf7f2 12481 return false;
7c673cae
FG
12482 }
12483
12484 if (state_test(PG_STATE_RECOVERING)) {
12485 state_clear(PG_STATE_RECOVERING);
c07f9fc5 12486 state_clear(PG_STATE_FORCED_RECOVERY);
7c673cae
FG
12487 if (needs_backfill()) {
12488 dout(10) << "recovery done, queuing backfill" << dendl;
12489 queue_peering_event(
11fdf7f2
TL
12490 PGPeeringEventRef(
12491 std::make_shared<PGPeeringEvent>(
12492 get_osdmap_epoch(),
12493 get_osdmap_epoch(),
9f95a23c 12494 PeeringState::RequestBackfill())));
7c673cae
FG
12495 } else {
12496 dout(10) << "recovery done, no backfill" << dendl;
224ce89b 12497 eio_errors_to_process = false;
c07f9fc5 12498 state_clear(PG_STATE_FORCED_BACKFILL);
7c673cae 12499 queue_peering_event(
11fdf7f2
TL
12500 PGPeeringEventRef(
12501 std::make_shared<PGPeeringEvent>(
12502 get_osdmap_epoch(),
12503 get_osdmap_epoch(),
9f95a23c 12504 PeeringState::AllReplicasRecovered())));
7c673cae
FG
12505 }
12506 } else { // backfilling
3efd9988 12507 state_clear(PG_STATE_BACKFILLING);
c07f9fc5
FG
12508 state_clear(PG_STATE_FORCED_BACKFILL);
12509 state_clear(PG_STATE_FORCED_RECOVERY);
7c673cae 12510 dout(10) << "recovery done, backfill done" << dendl;
224ce89b 12511 eio_errors_to_process = false;
7c673cae 12512 queue_peering_event(
11fdf7f2
TL
12513 PGPeeringEventRef(
12514 std::make_shared<PGPeeringEvent>(
12515 get_osdmap_epoch(),
12516 get_osdmap_epoch(),
9f95a23c 12517 PeeringState::Backfilled())));
7c673cae
FG
12518 }
12519
12520 return false;
12521}
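/*
 * Summary of the ordering implemented above (schematic):
 *
 *   1. recover_replicas()  - when the primary has nothing missing, or all
 *                            of its missing objects are unfound
 *   2. recover_primary()   - pull objects the primary itself is missing
 *   3. recover_replicas()  - retried if step 2 changed the unfound count
 *   4. recover_backfill()  - only once recovering is empty, the PG is in
 *                            BACKFILLING, and the reservation is held
 */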
12522
12523/**
12524 * Start recovery ops for objects the primary itself is missing.
12525 * Returns the number of ops started (at most max).
12526 */
12527uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
12528{
11fdf7f2 12529 ceph_assert(is_primary());
7c673cae 12530
9f95a23c 12531 const auto &missing = recovery_state.get_pg_log().get_missing();
7c673cae 12532
11fdf7f2
TL
12533 dout(10) << __func__ << " recovering " << recovering.size()
12534 << " in pg,"
12535 << " missing " << missing << dendl;
12536
12537 dout(25) << __func__ << " " << missing.get_items() << dendl;
7c673cae
FG
12538
12539 // look at log!
12540 pg_log_entry_t *latest = 0;
12541 unsigned started = 0;
12542 int skipped = 0;
12543
12544 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12545 map<version_t, hobject_t>::const_iterator p =
9f95a23c 12546 missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
7c673cae
FG
12547 while (p != missing.get_rmissing().end()) {
12548 handle.reset_tp_timeout();
12549 hobject_t soid;
12550 version_t v = p->first;
12551
9f95a23c
TL
12552 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
12553 if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
11fdf7f2
TL
12554 latest = it_objects->second;
12555 ceph_assert(latest->is_update() || latest->is_delete());
7c673cae
FG
12556 soid = latest->soid;
12557 } else {
12558 latest = 0;
12559 soid = p->second;
12560 }
12561 const pg_missing_item& item = missing.get_items().find(p->second)->second;
12562 ++p;
12563
224ce89b 12564 hobject_t head = soid.get_head();
7c673cae
FG
12565
12566 eversion_t need = item.need;
12567
11fdf7f2 12568 dout(10) << __func__ << " "
7c673cae
FG
12569 << soid << " " << item.need
12570 << (missing.is_missing(soid) ? " (missing)":"")
12571 << (missing.is_missing(head) ? " (missing head)":"")
12572 << (recovering.count(soid) ? " (recovering)":"")
12573 << (recovering.count(head) ? " (recovering head)":"")
12574 << dendl;
12575
12576 if (latest) {
12577 switch (latest->op) {
12578 case pg_log_entry_t::CLONE:
12579 /*
12580 * Handling for this special case removed for now, until we
12581 * can correctly construct an accurate SnapSet from the old
12582 * one.
12583 */
12584 break;
12585
12586 case pg_log_entry_t::LOST_REVERT:
12587 {
12588 if (item.have == latest->reverting_to) {
12589 ObjectContextRef obc = get_object_context(soid, true);
12590
12591 if (obc->obs.oi.version == latest->version) {
12592 // I'm already reverting
12593 dout(10) << " already reverting " << soid << dendl;
12594 } else {
12595 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
7c673cae
FG
12596 obc->obs.oi.version = latest->version;
12597
12598 ObjectStore::Transaction t;
12599 bufferlist b2;
12600 obc->obs.oi.encode(
12601 b2,
12602 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11fdf7f2 12603 ceph_assert(!pool.info.require_rollback());
7c673cae
FG
12604 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
12605
9f95a23c
TL
12606 recovery_state.recover_got(
12607 soid,
12608 latest->version,
12609 false,
12610 t);
7c673cae
FG
12611
12612 ++active_pushes;
12613
11fdf7f2
TL
12614 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
12615 t.register_on_commit(new C_OSD_CommittedPushedObject(
12616 this,
12617 get_osdmap_epoch(),
12618 info.last_complete));
12619 osd->store->queue_transaction(ch, std::move(t));
7c673cae
FG
12620 continue;
12621 }
12622 } else {
12623 /*
12624 * Pull the old version of the object. Update missing_loc here to have the location
12625 * of the version we want.
12626 *
12627 * This doesn't use the usual missing_loc paths, but that's okay:
12628 * - if we have it locally, we hit the case above, and go from there.
12629 * - if we don't, we always pass through this case during recovery and set up the location
12630 * properly.
12631 * - this way we don't need to mangle the missing code to be general about needing an old
12632 * version...
12633 */
12634 eversion_t alternate_need = latest->reverting_to;
12635 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
12636
9f95a23c
TL
12637 set<pg_shard_t> good_peers;
12638 for (auto p = recovery_state.get_peer_missing().begin();
12639 p != recovery_state.get_peer_missing().end();
12640 ++p) {
7c673cae
FG
12641 if (p->second.is_missing(soid, need) &&
12642 p->second.get_items().at(soid).have == alternate_need) {
9f95a23c 12643 good_peers.insert(p->first);
7c673cae 12644 }
9f95a23c
TL
12645 }
12646 recovery_state.set_revert_with_targets(
12647 soid,
12648 good_peers);
7c673cae 12649 dout(10) << " will pull " << alternate_need << " or " << need
9f95a23c
TL
12650 << " from one of "
12651 << recovery_state.get_missing_loc().get_locations(soid)
7c673cae
FG
12652 << dendl;
12653 }
12654 }
12655 break;
12656 }
12657 }
12658
12659 if (!recovering.count(soid)) {
12660 if (recovering.count(head)) {
12661 ++skipped;
12662 } else {
12663 int r = recover_missing(
12664 soid, need, get_recovery_op_priority(), h);
12665 switch (r) {
12666 case PULL_YES:
12667 ++started;
12668 break;
11fdf7f2 12669 case PULL_HEAD:
7c673cae
FG
12670 ++started;
12671 case PULL_NONE:
12672 ++skipped;
12673 break;
12674 default:
12675 ceph_abort();
12676 }
12677 if (started >= max)
12678 break;
12679 }
12680 }
12681
12682 // only advance last_requested if we haven't skipped anything
12683 if (!skipped)
9f95a23c 12684 recovery_state.set_last_requested(v);
7c673cae
FG
12685 }
12686
12687 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12688 return started;
12689}
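/*
 * LOST_REVERT handling above, condensed (schematic):
 *
 *   if (item.have == latest->reverting_to) {
 *     // we already hold the target version: rewrite the object_info
 *     // locally and mark it recovered via recover_got()
 *   } else {
 *     // we must pull: record the peers whose 'have' equals reverting_to
 *     // with set_revert_with_targets(), then recover as usual
 *   }
 */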
12690
224ce89b
WB
12691bool PrimaryLogPG::primary_error(
12692 const hobject_t& soid, eversion_t v)
12693{
9f95a23c
TL
12694 recovery_state.force_object_missing(pg_whoami, soid, v);
12695 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
224ce89b 12696 if (uhoh)
9f95a23c
TL
12697 osd->clog->error() << info.pgid << " missing primary copy of "
12698 << soid << ", unfound";
224ce89b 12699 else
9f95a23c
TL
12700 osd->clog->error() << info.pgid << " missing primary copy of "
12701 << soid
12702 << ", will try copies on "
12703 << recovery_state.get_missing_loc().get_locations(soid);
224ce89b
WB
12704 return uhoh;
12705}
12706
c07f9fc5
FG
12707int PrimaryLogPG::prep_object_replica_deletes(
12708 const hobject_t& soid, eversion_t v,
11fdf7f2
TL
12709 PGBackend::RecoveryHandle *h,
12710 bool *work_started)
c07f9fc5 12711{
11fdf7f2 12712 ceph_assert(is_primary());
c07f9fc5
FG
12713 dout(10) << __func__ << ": on " << soid << dendl;
12714
11fdf7f2
TL
12715 ObjectContextRef obc = get_object_context(soid, false);
12716 if (obc) {
12717 if (!obc->get_recovery_read()) {
12718 dout(20) << "replica delete delayed on " << soid
12719 << "; could not get rw_manager lock" << dendl;
12720 *work_started = true;
12721 return 0;
12722 } else {
12723 dout(20) << "replica delete got recovery read lock on " << soid
12724 << dendl;
12725 }
12726 }
12727
c07f9fc5 12728 start_recovery_op(soid);
11fdf7f2
TL
12729 ceph_assert(!recovering.count(soid));
12730 if (!obc)
12731 recovering.insert(make_pair(soid, ObjectContextRef()));
12732 else
12733 recovering.insert(make_pair(soid, obc));
c07f9fc5
FG
12734
12735 pgbackend->recover_delete_object(soid, v, h);
12736 return 1;
12737}
12738
7c673cae
FG
12739int PrimaryLogPG::prep_object_replica_pushes(
12740 const hobject_t& soid, eversion_t v,
11fdf7f2
TL
12741 PGBackend::RecoveryHandle *h,
12742 bool *work_started)
7c673cae 12743{
11fdf7f2 12744 ceph_assert(is_primary());
7c673cae
FG
12745 dout(10) << __func__ << ": on " << soid << dendl;
12746
9f95a23c
TL
12747 if (soid.snap && soid.snap < CEPH_NOSNAP) {
12748 // do we have the head and/or snapdir?
12749 hobject_t head = soid.get_head();
12750 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
12751 if (recovering.count(head)) {
12752 dout(10) << " missing but already recovering head " << head << dendl;
12753 return 0;
12754 } else {
12755 int r = recover_missing(
12756 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
12757 get_recovery_op_priority(), h);
12758 if (r != PULL_NONE)
12759 return 1;
12760 return 0;
12761 }
12762 }
12763 }
12764
7c673cae
FG
12765 // NOTE: we know we will get a valid oloc off of disk here.
12766 ObjectContextRef obc = get_object_context(soid, false);
12767 if (!obc) {
224ce89b 12768 primary_error(soid, v);
7c673cae
FG
12769 return 0;
12770 }
12771
12772 if (!obc->get_recovery_read()) {
12773 dout(20) << "recovery delayed on " << soid
12774 << "; could not get rw_manager lock" << dendl;
11fdf7f2 12775 *work_started = true;
7c673cae
FG
12776 return 0;
12777 } else {
12778 dout(20) << "recovery got recovery read lock on " << soid
12779 << dendl;
12780 }
12781
12782 start_recovery_op(soid);
11fdf7f2 12783 ceph_assert(!recovering.count(soid));
7c673cae
FG
12784 recovering.insert(make_pair(soid, obc));
12785
224ce89b 12786 int r = pgbackend->recover_object(
7c673cae
FG
12787 soid,
12788 v,
12789 ObjectContextRef(),
12790 obc, // has snapset context
12791 h);
224ce89b
WB
12792 if (r < 0) {
12793 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
9f95a23c 12794 on_failed_pull({ pg_whoami }, soid, v);
224ce89b
WB
12795 return 0;
12796 }
7c673cae
FG
12797 return 1;
12798}
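/*
 * Ordering note (from the snap check above): a clone such as
 * <oid>:<snapid> is only pushed once its head object is present on the
 * primary; if the head is still missing we recover it first, since it
 * carries the SnapSet context the push needs (obc "has snapset context").
 */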
12799
11fdf7f2
TL
12800uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
12801 bool *work_started)
7c673cae
FG
12802{
12803 dout(10) << __func__ << "(" << max << ")" << dendl;
12804 uint64_t started = 0;
12805
12806 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12807
12808 // this is FAR from an optimal recovery order. pretty lame, really.
9f95a23c 12809 ceph_assert(!get_acting_recovery_backfill().empty());
11fdf7f2
TL
12810 // choose replicas to recover, replica has the shortest missing list first
12811 // so we can bring it back to normal ASAP
12812 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
12813 async_by_num_missing;
9f95a23c
TL
12814 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
12815 for (auto &p: get_acting_recovery_backfill()) {
11fdf7f2
TL
12816 if (p == get_primary()) {
12817 continue;
12818 }
9f95a23c
TL
12819 auto pm = recovery_state.get_peer_missing().find(p);
12820 ceph_assert(pm != recovery_state.get_peer_missing().end());
11fdf7f2
TL
12821 auto nm = pm->second.num_missing();
12822 if (nm != 0) {
9f95a23c 12823 if (is_async_recovery_target(p)) {
11fdf7f2
TL
12824 async_by_num_missing.push_back(make_pair(nm, p));
12825 } else {
12826 replicas_by_num_missing.push_back(make_pair(nm, p));
12827 }
12828 }
12829 }
12830 // sort by number of missing objects, in ascending order.
12831 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
12832 const std::pair<unsigned int, pg_shard_t> &rhs) {
12833 return lhs.first < rhs.first;
12834 };
12835 // acting goes first
12836 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
12837 // then async_recovery_targets
12838 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
12839 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
12840 async_by_num_missing.begin(), async_by_num_missing.end());
12841 for (auto &replica: replicas_by_num_missing) {
12842 pg_shard_t &peer = replica.second;
12843 ceph_assert(peer != get_primary());
9f95a23c
TL
12844 auto pm = recovery_state.get_peer_missing().find(peer);
12845 ceph_assert(pm != recovery_state.get_peer_missing().end());
7c673cae
FG
12846 size_t m_sz = pm->second.num_missing();
12847
12848 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
12849 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
12850
12851 // oldest first!
12852 const pg_missing_t &m(pm->second);
12853 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
12854 p != m.get_rmissing().end() && started < max;
12855 ++p) {
12856 handle.reset_tp_timeout();
12857 const hobject_t soid(p->second);
12858
9f95a23c 12859 if (recovery_state.get_missing_loc().is_unfound(soid)) {
224ce89b
WB
12860 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
12861 continue;
12862 }
12863
9f95a23c
TL
12864 const pg_info_t &pi = recovery_state.get_peer_info(peer);
12865 if (soid > pi.last_backfill) {
7c673cae 12866 if (!recovering.count(soid)) {
9f95a23c
TL
12867 derr << __func__ << ": object " << soid << " last_backfill "
12868 << pi.last_backfill << dendl;
7c673cae
FG
12869 derr << __func__ << ": object added to missing set for backfill, but "
12870 << "is not in recovering, error!" << dendl;
12871 ceph_abort();
12872 }
12873 continue;
12874 }
12875
12876 if (recovering.count(soid)) {
12877 dout(10) << __func__ << ": already recovering " << soid << dendl;
12878 continue;
12879 }
12880
9f95a23c 12881 if (recovery_state.get_missing_loc().is_deleted(soid)) {
c07f9fc5
FG
12882 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
12883 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11fdf7f2 12884 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
c07f9fc5
FG
12885 continue;
12886 }
12887
9f95a23c
TL
12888 if (soid.is_snap() &&
12889 recovery_state.get_pg_log().get_missing().is_missing(
12890 soid.get_head())) {
7c673cae
FG
12891 dout(10) << __func__ << ": " << soid.get_head()
12892 << " still missing on primary" << dendl;
12893 continue;
12894 }
12895
9f95a23c 12896 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
7c673cae
FG
12897 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
12898 continue;
12899 }
12900
12901 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
12902 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11fdf7f2 12903 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
7c673cae
FG
12904 }
12905 }
12906
12907 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12908 return started;
12909}
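/*
 * Standalone illustration (made-up counts) of the peer ordering used
 * above: acting replicas drain before async recovery targets, and each
 * group goes in ascending order of missing objects:
 *
 *   // replicas_by_num_missing = {(3,osd.1), (1,osd.2)}, async = {(2,osd.3)}
 *   // after the two sorts and the insert():
 *   //   -> osd.2 (1 missing), osd.1 (3 missing), osd.3 (2 missing, async)
 */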
12910
12911hobject_t PrimaryLogPG::earliest_peer_backfill() const
12912{
12913 hobject_t e = hobject_t::get_max();
9f95a23c
TL
12914 for (const pg_shard_t& peer : get_backfill_targets()) {
12915 const auto iter = peer_backfill_info.find(peer);
11fdf7f2 12916 ceph_assert(iter != peer_backfill_info.end());
9f95a23c 12917 e = std::min(e, iter->second.begin);
7c673cae
FG
12918 }
12919 return e;
12920}
12921
12922bool PrimaryLogPG::all_peer_done() const
12923{
12924 // Primary hasn't got any more objects
11fdf7f2 12925 ceph_assert(backfill_info.empty());
7c673cae 12926
9f95a23c
TL
12927 for (const pg_shard_t& bt : get_backfill_targets()) {
12928 const auto piter = peer_backfill_info.find(bt);
11fdf7f2 12929 ceph_assert(piter != peer_backfill_info.end());
7c673cae
FG
12930 const BackfillInterval& pbi = piter->second;
12931 // See if peer has more to process
12932 if (!pbi.extends_to_end() || !pbi.empty())
12933 return false;
12934 }
12935 return true;
12936}
12937
12938/**
12939 * recover_backfill
12940 *
12941 * Invariants:
12942 *
12943 * backfilled: fully pushed to replica or present in replica's missing set (both
12944 * our copy and theirs).
12945 *
12946 * All objects on a backfill_target in
12947 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12948 * objects have been actually deleted and all logically-valid objects are replicated.
12949 * There may be PG objects in this interval yet to be backfilled.
12950 *
12951 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12952 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12953 *
11fdf7f2 12954 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
7c673cae
FG
12955 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12956 * interval remain on the backfill target.
12957 *
12958 * For a backfill target, all objects <= peer_info[target].last_backfill
12959 * have been backfilled to target
12960 *
12961 * There *MAY* be missing/outdated objects between last_backfill_started and
11fdf7f2 12962 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
7c673cae
FG
12963 * io created objects since the last scan. For this reason, we call
12964 * update_range() again before continuing backfill.
12965 */
12966uint64_t PrimaryLogPG::recover_backfill(
12967 uint64_t max,
12968 ThreadPool::TPHandle &handle, bool *work_started)
12969{
11fdf7f2 12970 dout(10) << __func__ << " (" << max << ")"
9f95a23c 12971 << " bft=" << get_backfill_targets()
7c673cae
FG
12972 << " last_backfill_started " << last_backfill_started
12973 << (new_backfill ? " new_backfill":"")
12974 << dendl;
9f95a23c 12975 ceph_assert(!get_backfill_targets().empty());
7c673cae
FG
12976
12977 // Initialize from prior backfill state
12978 if (new_backfill) {
12979 // on_activate() was called prior to getting here
11fdf7f2 12980 ceph_assert(last_backfill_started == earliest_backfill());
7c673cae
FG
12981 new_backfill = false;
12982
12983 // initialize BackfillIntervals
9f95a23c
TL
12984 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12985 i != get_backfill_targets().end();
7c673cae 12986 ++i) {
9f95a23c
TL
12987 peer_backfill_info[*i].reset(
12988 recovery_state.get_peer_info(*i).last_backfill);
7c673cae
FG
12989 }
12990 backfill_info.reset(last_backfill_started);
12991
12992 backfills_in_flight.clear();
12993 pending_backfill_updates.clear();
12994 }
12995
9f95a23c
TL
12996 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12997 i != get_backfill_targets().end();
7c673cae
FG
12998 ++i) {
12999 dout(10) << "peer osd." << *i
9f95a23c 13000 << " info " << recovery_state.get_peer_info(*i)
7c673cae
FG
13001 << " interval " << peer_backfill_info[*i].begin
13002 << "-" << peer_backfill_info[*i].end
13003 << " " << peer_backfill_info[*i].objects.size() << " objects"
13004 << dendl;
13005 }
13006
13007 // update our local interval to cope with recent changes
13008 backfill_info.begin = last_backfill_started;
13009 update_range(&backfill_info, handle);
13010
13011 unsigned ops = 0;
7c673cae
FG
13012 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13013 set<hobject_t> add_to_stat;
13014
9f95a23c
TL
13015 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13016 i != get_backfill_targets().end();
7c673cae
FG
13017 ++i) {
13018 peer_backfill_info[*i].trim_to(
9f95a23c
TL
13019 std::max(
13020 recovery_state.get_peer_info(*i).last_backfill,
13021 last_backfill_started));
7c673cae
FG
13022 }
13023 backfill_info.trim_to(last_backfill_started);
13024
224ce89b 13025 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
7c673cae
FG
13026 while (ops < max) {
13027 if (backfill_info.begin <= earliest_peer_backfill() &&
13028 !backfill_info.extends_to_end() && backfill_info.empty()) {
13029 hobject_t next = backfill_info.end;
13030 backfill_info.reset(next);
13031 backfill_info.end = hobject_t::get_max();
13032 update_range(&backfill_info, handle);
13033 backfill_info.trim();
13034 }
13035
13036 dout(20) << " my backfill interval " << backfill_info << dendl;
13037
13038 bool sent_scan = false;
9f95a23c
TL
13039 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13040 i != get_backfill_targets().end();
7c673cae
FG
13041 ++i) {
13042 pg_shard_t bt = *i;
13043 BackfillInterval& pbi = peer_backfill_info[bt];
13044
13045 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13046 if (pbi.begin <= backfill_info.begin &&
13047 !pbi.extends_to_end() && pbi.empty()) {
13048 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11fdf7f2 13049 epoch_t e = get_osdmap_epoch();
7c673cae 13050 MOSDPGScan *m = new MOSDPGScan(
9f95a23c 13051 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
7c673cae
FG
13052 spg_t(info.pgid.pgid, bt.shard),
13053 pbi.end, hobject_t());
11fdf7f2
TL
13054 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13055 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
7c673cae
FG
13056 waiting_on_backfill.insert(bt);
13057 sent_scan = true;
13058 }
13059 }
13060
13061 // Count simultaneous scans as a single op and let those complete
13062 if (sent_scan) {
13063 ops++;
13064 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13065 break;
13066 }
13067
13068 if (backfill_info.empty() && all_peer_done()) {
13069 dout(10) << " reached end for both local and all peers" << dendl;
13070 break;
13071 }
13072
13073 // Get the object within the set of peers to operate on and
13074 // the set of targets to which that object applies.
13075 hobject_t check = earliest_peer_backfill();
13076
13077 if (check < backfill_info.begin) {
13078
13079 set<pg_shard_t> check_targets;
9f95a23c
TL
13080 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13081 i != get_backfill_targets().end();
7c673cae
FG
13082 ++i) {
13083 pg_shard_t bt = *i;
13084 BackfillInterval& pbi = peer_backfill_info[bt];
13085 if (pbi.begin == check)
13086 check_targets.insert(bt);
13087 }
11fdf7f2 13088 ceph_assert(!check_targets.empty());
7c673cae
FG
13089
13090 dout(20) << " BACKFILL removing " << check
13091 << " from peers " << check_targets << dendl;
13092 for (set<pg_shard_t>::iterator i = check_targets.begin();
13093 i != check_targets.end();
13094 ++i) {
13095 pg_shard_t bt = *i;
13096 BackfillInterval& pbi = peer_backfill_info[bt];
11fdf7f2 13097 ceph_assert(pbi.begin == check);
7c673cae
FG
13098
13099 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13100 pbi.pop_front();
13101 }
13102
11fdf7f2 13103 last_backfill_started = check;
7c673cae
FG
13104
13105 // Don't increment ops here because deletions
13106 // are cheap and, unlike real recovery_ops, not replied to,
13107 // and we can't increment ops without requeueing ourselves
13108 // for recovery.
13109 } else {
13110 eversion_t& obj_v = backfill_info.objects.begin()->second;
13111
13112 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
9f95a23c
TL
13113 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13114 i != get_backfill_targets().end();
7c673cae
FG
13115 ++i) {
13116 pg_shard_t bt = *i;
13117 BackfillInterval& pbi = peer_backfill_info[bt];
13118 // Find all check peers that have the wrong version
13119 if (check == backfill_info.begin && check == pbi.begin) {
13120 if (pbi.objects.begin()->second != obj_v) {
13121 need_ver_targs.push_back(bt);
13122 } else {
13123 keep_ver_targs.push_back(bt);
13124 }
13125 } else {
9f95a23c 13126 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
7c673cae
FG
13127
13128 // Only include peers whose backfill line we've caught up to;
13129 // otherwise, they only appear to be missing this object
13130 // because their pbi.begin > backfill_info.begin.
13131 if (backfill_info.begin > pinfo.last_backfill)
13132 missing_targs.push_back(bt);
13133 else
13134 skip_targs.push_back(bt);
13135 }
13136 }
13137
13138 if (!keep_ver_targs.empty()) {
13139 // These peers have version obj_v
13140 dout(20) << " BACKFILL keeping " << check
13141 << " with ver " << obj_v
13142 << " on peers " << keep_ver_targs << dendl;
13143 //assert(!waiting_for_degraded_object.count(check));
13144 }
13145 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13146 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
11fdf7f2 13147 ceph_assert(obc);
7c673cae
FG
13148 if (obc->get_recovery_read()) {
13149 if (!need_ver_targs.empty()) {
13150 dout(20) << " BACKFILL replacing " << check
13151 << " with ver " << obj_v
13152 << " to peers " << need_ver_targs << dendl;
13153 }
13154 if (!missing_targs.empty()) {
13155 dout(20) << " BACKFILL pushing " << backfill_info.begin
13156 << " with ver " << obj_v
13157 << " to peers " << missing_targs << dendl;
13158 }
13159 vector<pg_shard_t> all_push = need_ver_targs;
13160 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13161
224ce89b
WB
13162 handle.reset_tp_timeout();
13163 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13164 if (r < 0) {
13165 *work_started = true;
13166 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13167 break;
13168 }
7c673cae
FG
13169 ops++;
13170 } else {
13171 *work_started = true;
13172 dout(20) << "backfill blocking on " << backfill_info.begin
13173 << "; could not get rw_manager lock" << dendl;
13174 break;
13175 }
13176 }
13177 dout(20) << "need_ver_targs=" << need_ver_targs
13178 << " keep_ver_targs=" << keep_ver_targs << dendl;
9f95a23c 13179 dout(20) << "backfill_targets=" << get_backfill_targets()
7c673cae
FG
13180 << " missing_targs=" << missing_targs
13181 << " skip_targs=" << skip_targs << dendl;
13182
13183 last_backfill_started = backfill_info.begin;
13184 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13185 backfill_info.pop_front();
13186 vector<pg_shard_t> check_targets = need_ver_targs;
13187 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13188 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13189 i != check_targets.end();
13190 ++i) {
13191 pg_shard_t bt = *i;
13192 BackfillInterval& pbi = peer_backfill_info[bt];
13193 pbi.pop_front();
13194 }
13195 }
13196 }
13197
13198 hobject_t backfill_pos =
13199 std::min(backfill_info.begin, earliest_peer_backfill());
13200
13201 for (set<hobject_t>::iterator i = add_to_stat.begin();
13202 i != add_to_stat.end();
13203 ++i) {
13204 ObjectContextRef obc = get_object_context(*i, false);
11fdf7f2 13205 ceph_assert(obc);
7c673cae
FG
13206 pg_stat_t stat;
13207 add_object_context_to_pg_stat(obc, &stat);
13208 pending_backfill_updates[*i] = stat;
13209 }
11fdf7f2
TL
13210 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13211 for (unsigned i = 0; i < to_remove.size(); ++i) {
13212 handle.reset_tp_timeout();
13213 const hobject_t& oid = to_remove[i].get<0>();
13214 eversion_t v = to_remove[i].get<1>();
13215 pg_shard_t peer = to_remove[i].get<2>();
13216 MOSDPGBackfillRemove *m;
13217 auto it = reqs.find(peer);
13218 if (it != reqs.end()) {
13219 m = it->second;
13220 } else {
13221 m = reqs[peer] = new MOSDPGBackfillRemove(
13222 spg_t(info.pgid.pgid, peer.shard),
13223 get_osdmap_epoch());
7c673cae 13224 }
11fdf7f2 13225 m->ls.push_back(make_pair(oid, v));
7c673cae 13226
11fdf7f2
TL
13227 if (oid <= last_backfill_started)
13228 pending_backfill_updates[oid]; // add empty stat!
13229 }
13230 for (auto p : reqs) {
13231 osd->send_message_osd_cluster(p.first.osd, p.second,
13232 get_osdmap_epoch());
7c673cae
FG
13233 }
13234
7c673cae
FG
13235 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13236
13237 dout(5) << "backfill_pos is " << backfill_pos << dendl;
13238 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13239 i != backfills_in_flight.end();
13240 ++i) {
13241 dout(20) << *i << " is still in flight" << dendl;
13242 }
13243
13244 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13245 backfill_pos : *(backfills_in_flight.begin());
13246 hobject_t new_last_backfill = earliest_backfill();
13247 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13248 for (map<hobject_t, pg_stat_t>::iterator i =
13249 pending_backfill_updates.begin();
13250 i != pending_backfill_updates.end() &&
13251 i->first < next_backfill_to_complete;
13252 pending_backfill_updates.erase(i++)) {
13253 dout(20) << " pending_backfill_update " << i->first << dendl;
11fdf7f2 13254 ceph_assert(i->first > new_last_backfill);
9f95a23c
TL
13255 recovery_state.update_complete_backfill_object_stats(
13256 i->first,
13257 i->second);
7c673cae
FG
13258 new_last_backfill = i->first;
13259 }
13260 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13261
11fdf7f2 13262 ceph_assert(!pending_backfill_updates.empty() ||
7c673cae
FG
13263 new_last_backfill == last_backfill_started);
13264 if (pending_backfill_updates.empty() &&
13265 backfill_pos.is_max()) {
11fdf7f2 13266 ceph_assert(backfills_in_flight.empty());
7c673cae
FG
13267 new_last_backfill = backfill_pos;
13268 last_backfill_started = backfill_pos;
13269 }
13270 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13271
13272 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13273 // all the backfill targets. Otherwise, we will move last_backfill up on
13274 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
9f95a23c
TL
13275 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13276 i != get_backfill_targets().end();
7c673cae
FG
13277 ++i) {
13278 pg_shard_t bt = *i;
9f95a23c 13279 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
7c673cae
FG
13280
13281 if (new_last_backfill > pinfo.last_backfill) {
9f95a23c 13282 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
11fdf7f2 13283 epoch_t e = get_osdmap_epoch();
7c673cae
FG
13284 MOSDPGBackfill *m = NULL;
13285 if (pinfo.last_backfill.is_max()) {
13286 m = new MOSDPGBackfill(
13287 MOSDPGBackfill::OP_BACKFILL_FINISH,
13288 e,
9f95a23c 13289 get_last_peering_reset(),
7c673cae
FG
13290 spg_t(info.pgid.pgid, bt.shard));
13291 // Use default priority here, must match sub_op priority
7c673cae
FG
13292 start_recovery_op(hobject_t::get_max());
13293 } else {
13294 m = new MOSDPGBackfill(
13295 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13296 e,
9f95a23c 13297 get_last_peering_reset(),
7c673cae
FG
13298 spg_t(info.pgid.pgid, bt.shard));
13299 // Use default priority here, must match sub_op priority
13300 }
13301 m->last_backfill = pinfo.last_backfill;
13302 m->stats = pinfo.stats;
11fdf7f2 13303 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
7c673cae
FG
13304 dout(10) << " peer " << bt
13305 << " num_objects now " << pinfo.stats.stats.sum.num_objects
13306 << " / " << info.stats.stats.sum.num_objects << dendl;
13307 }
13308 }
13309
13310 if (ops)
13311 *work_started = true;
13312 return ops;
13313}
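
// The per-peer batching above in a nutshell: removals bound for the
// same backfill peer are coalesced into a single MOSDPGBackfillRemove
// rather than sent one message per object. A minimal sketch with
// hypothetical stand-in types (int peer ids, string oids):
#if 0 // illustrative sketch only
#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

struct RemoveBatch {
  std::vector<std::pair<std::string, uint64_t>> ls;  // (oid, version)
};

std::map<int, RemoveBatch> batch_removals(
    const std::vector<std::tuple<std::string, uint64_t, int>> &to_remove)
{
  std::map<int, RemoveBatch> reqs;            // one batch per peer
  for (const auto &[oid, version, peer] : to_remove)
    reqs[peer].ls.emplace_back(oid, version); // append to that peer's batch
  return reqs;                                // caller sends one msg per peer
}
#endif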
13314
224ce89b 13315int PrimaryLogPG::prep_backfill_object_push(
7c673cae
FG
13316 hobject_t oid, eversion_t v,
13317 ObjectContextRef obc,
13318 vector<pg_shard_t> peers,
13319 PGBackend::RecoveryHandle *h)
13320{
224ce89b 13321 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
11fdf7f2 13322 ceph_assert(!peers.empty());
7c673cae
FG
13323
13324 backfills_in_flight.insert(oid);
9f95a23c 13325 recovery_state.prepare_backfill_for_missing(oid, v, peers);
7c673cae 13326
11fdf7f2 13327 ceph_assert(!recovering.count(oid));
7c673cae
FG
13328
13329 start_recovery_op(oid);
13330 recovering.insert(make_pair(oid, obc));
13331
224ce89b 13332 int r = pgbackend->recover_object(
7c673cae
FG
13333 oid,
13334 v,
13335 ObjectContextRef(),
13336 obc,
13337 h);
224ce89b
WB
13338 if (r < 0) {
13339 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
9f95a23c 13340 on_failed_pull({ pg_whoami }, oid, v);
224ce89b
WB
13341 }
13342 return r;
7c673cae
FG
13343}
13344
13345void PrimaryLogPG::update_range(
13346 BackfillInterval *bi,
13347 ThreadPool::TPHandle &handle)
13348{
13349 int local_min = cct->_conf->osd_backfill_scan_min;
13350 int local_max = cct->_conf->osd_backfill_scan_max;
13351
13352 if (bi->version < info.log_tail) {
13353 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13354 << dendl;
11fdf7f2 13355 bi->version = info.last_update;
7c673cae
FG
13356 scan_range(local_min, local_max, bi, handle);
13357 }
13358
13359 if (bi->version >= projected_last_update) {
13360 dout(10) << __func__<< ": bi is current " << dendl;
11fdf7f2 13361 ceph_assert(bi->version == projected_last_update);
7c673cae 13362 } else if (bi->version >= info.log_tail) {
9f95a23c 13363 if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
7c673cae
FG
13364 /* Because we don't move log_tail on split, the log might be
13365 * empty even if log_tail != last_update. However, the only
13366 * way to get here with an empty log is if log_tail is actually
13367 * eversion_t(), because otherwise the entry which changed
13368 * last_update since the last scan would have to be present.
13369 */
11fdf7f2 13370 ceph_assert(bi->version == eversion_t());
7c673cae
FG
13371 return;
13372 }
13373
13374 dout(10) << __func__<< ": bi is old, (" << bi->version
13375 << ") can be updated with log to projected_last_update "
13376 << projected_last_update << dendl;
13377
13378 auto func = [&](const pg_log_entry_t &e) {
13379 dout(10) << __func__ << ": updating from version " << e.version
13380 << dendl;
13381 const hobject_t &soid = e.soid;
13382 if (soid >= bi->begin &&
13383 soid < bi->end) {
13384 if (e.is_update()) {
13385 dout(10) << __func__ << ": " << e.soid << " updated to version "
13386 << e.version << dendl;
13387 bi->objects.erase(e.soid);
13388 bi->objects.insert(
13389 make_pair(
13390 e.soid,
13391 e.version));
13392 } else if (e.is_delete()) {
13393 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13394 bi->objects.erase(e.soid);
13395 }
13396 }
13397 };
13398 dout(10) << "scanning pg log first" << dendl;
9f95a23c 13399 recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
7c673cae
FG
13400 dout(10) << "scanning projected log" << dendl;
13401 projected_log.scan_log_after(bi->version, func);
13402 bi->version = projected_last_update;
13403 } else {
11fdf7f2 13404 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
7c673cae
FG
13405 }
13406}
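
// A minimal sketch of the log-replay step above, using hypothetical
// stand-in types (std::string object ids, uint64_t versions) in place
// of hobject_t/eversion_t: entries newer than the interval's version
// are folded in, updates overwrite the recorded version, and deletes
// drop the object from the interval.
#if 0 // illustrative sketch only
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct LogEntry {          // stand-in for pg_log_entry_t
  std::string soid;
  uint64_t version;
  bool is_delete;
};

void replay_onto_interval(std::map<std::string, uint64_t> &objects,
                          const std::string &begin, const std::string &end,
                          const std::vector<LogEntry> &entries_after_version)
{
  for (const auto &e : entries_after_version) {
    if (e.soid < begin || e.soid >= end)
      continue;                     // outside [begin, end), not ours
    if (e.is_delete)
      objects.erase(e.soid);        // object no longer exists
    else
      objects[e.soid] = e.version;  // record (or refresh) latest version
  }
}
#endif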
13407
13408void PrimaryLogPG::scan_range(
13409 int min, int max, BackfillInterval *bi,
13410 ThreadPool::TPHandle &handle)
13411{
11fdf7f2 13412 ceph_assert(is_locked());
7c673cae
FG
13413 dout(10) << "scan_range from " << bi->begin << dendl;
13414 bi->clear_objects();
13415
13416 vector<hobject_t> ls;
13417 ls.reserve(max);
13418 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
11fdf7f2 13419 ceph_assert(r >= 0);
7c673cae
FG
13420 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13421 dout(20) << ls << dendl;
13422
13423 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13424 handle.reset_tp_timeout();
13425 ObjectContextRef obc;
13426 if (is_primary())
13427 obc = object_contexts.lookup(*p);
13428 if (obc) {
92f5a8d4
TL
13429 if (!obc->obs.exists) {
13430 /* If the object does not exist here, it must have been removed
13431 * between the collection_list_partial and here. This can happen
13432 * for the first item in the range, which is usually last_backfill.
13433 */
13434 continue;
13435 }
7c673cae
FG
13436 bi->objects[*p] = obc->obs.oi.version;
13437 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
13438 } else {
13439 bufferlist bl;
13440 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
7c673cae 13441 /* If the object does not exist here, it must have been removed
92f5a8d4
TL
13442 * between the collection_list_partial and here. This can happen
13443 * for the first item in the range, which is usually last_backfill.
13444 */
7c673cae
FG
13445 if (r == -ENOENT)
13446 continue;
13447
11fdf7f2 13448 ceph_assert(r >= 0);
7c673cae
FG
13449 object_info_t oi(bl);
13450 bi->objects[*p] = oi.version;
13451 dout(20) << " " << *p << " " << oi.version << dendl;
13452 }
13453 }
13454}
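
// The version-resolution order above, sketched with hypothetical
// stand-ins: consult the in-memory object context first, fall back to
// the on-disk object_info attribute, and treat ENOENT as the benign
// race with a concurrent delete that the comments describe.
#if 0 // illustrative sketch only
#include <cerrno>
#include <cstdint>
#include <functional>
#include <optional>
#include <string>

// Returns the version to record for oid, or std::nullopt if the object
// vanished between the listing and this lookup.
std::optional<uint64_t> resolve_version(
    const std::string &oid,
    std::function<std::optional<uint64_t>(const std::string&)> cache_lookup,
    std::function<int(const std::string&, uint64_t*)> read_oi_attr)
{
  if (auto v = cache_lookup(oid))
    return v;                 // fresh in-memory state wins
  uint64_t v = 0;
  int r = read_oi_attr(oid, &v);
  if (r == -ENOENT)
    return std::nullopt;      // deleted after the listing; skip it
  return v;
}
#endif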
13455
13456
13457/** check_local
13458 *
13459 * verifies that stray objects have been deleted
13460 */
13461void PrimaryLogPG::check_local()
13462{
13463 dout(10) << __func__ << dendl;
13464
9f95a23c
TL
13465 ceph_assert(
13466 info.last_update >=
13467 recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
7c673cae
FG
13468
13469 if (!cct->_conf->osd_debug_verify_stray_on_activate)
13470 return;
13471
13472 // just scan the log.
13473 set<hobject_t> did;
9f95a23c
TL
13474 for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
13475 p != recovery_state.get_pg_log().get_log().log.rend();
7c673cae
FG
13476 ++p) {
13477 if (did.count(p->soid))
13478 continue;
13479 did.insert(p->soid);
13480
c07f9fc5 13481 if (p->is_delete() && !is_missing_object(p->soid)) {
7c673cae
FG
13482 dout(10) << " checking " << p->soid
13483 << " at " << p->version << dendl;
13484 struct stat st;
13485 int r = osd->store->stat(
13486 ch,
13487 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
13488 &st);
13489 if (r != -ENOENT) {
13490 derr << __func__ << " " << p->soid << " exists, but should have been "
13491 << "deleted" << dendl;
11fdf7f2 13492 ceph_abort_msg("erroneously present object");
7c673cae
FG
13493 }
13494 } else {
13495 // ignore old(+missing) objects
13496 }
13497 }
13498}
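
// check_local in miniature: walk the log newest to oldest, consider
// only the most recent entry per object (the `did` set), and for a
// delete verify the object really is gone from the store. Stand-in
// types are hypothetical.
#if 0 // illustrative sketch only
#include <functional>
#include <set>
#include <string>
#include <vector>

struct Entry { std::string soid; bool is_delete; };

bool strays_all_deleted(const std::vector<Entry> &log,  // oldest..newest
                        std::function<bool(const std::string&)> exists_on_disk)
{
  std::set<std::string> did;
  for (auto p = log.rbegin(); p != log.rend(); ++p) {
    if (!did.insert(p->soid).second)
      continue;                    // a newer entry already decided this oid
    if (p->is_delete && exists_on_disk(p->soid))
      return false;                // erroneously present object
  }
  return true;
}
#endif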
13499
13500
13501
13502// ===========================
13503// hit sets
13504
13505hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13506{
13507 ostringstream ss;
13508 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13509 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13510 info.pgid.ps(), info.pgid.pool(),
13511 cct->_conf->osd_hit_set_namespace);
13512 dout(20) << __func__ << " " << hoid << dendl;
13513 return hoid;
13514}
13515
13516hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13517 utime_t end,
13518 bool using_gmt)
13519{
13520 ostringstream ss;
13521 ss << "hit_set_" << info.pgid.pgid << "_archive_";
13522 if (using_gmt) {
9f95a23c
TL
13523 start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
13524 end.gmtime(ss, true /* legacy pre-octopus form */);
7c673cae 13525 } else {
9f95a23c
TL
13526 start.localtime(ss, true /* legacy pre-octopus form */) << "_";
13527 end.localtime(ss, true /* legacy pre-octopus form */);
7c673cae
FG
13528 }
13529 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13530 info.pgid.ps(), info.pgid.pool(),
13531 cct->_conf->osd_hit_set_namespace);
13532 dout(20) << __func__ << " " << hoid << dendl;
13533 return hoid;
13534}
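
// For example, with PG 1.0 the archive object for an interval
// [start, end) is named roughly "hit_set_1.0_archive_<start>_<end>"
// (the pgid and timestamps here are illustrative; the exact timestamp
// text is whatever the legacy gmtime/localtime rendering above emits),
// and it lives in osd_hit_set_namespace so it can never collide with
// user objects.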
13535
13536void PrimaryLogPG::hit_set_clear()
13537{
13538 dout(20) << __func__ << dendl;
13539 hit_set.reset();
13540 hit_set_start_stamp = utime_t();
13541}
13542
13543void PrimaryLogPG::hit_set_setup()
13544{
13545 if (!is_active() ||
13546 !is_primary()) {
13547 hit_set_clear();
13548 return;
13549 }
13550
13551 if (is_active() && is_primary() &&
13552 (!pool.info.hit_set_count ||
13553 !pool.info.hit_set_period ||
13554 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
13555 hit_set_clear();
13556
13557 // only primary is allowed to remove all the hit set objects
13558 hit_set_remove_all();
13559 return;
13560 }
13561
13562 // FIXME: discard any previous data for now
13563 hit_set_create();
13564
13565 // include any writes we know about from the pg log. this doesn't
13566 // capture reads, but it is better than nothing!
13567 hit_set_apply_log();
13568}
13569
13570void PrimaryLogPG::hit_set_remove_all()
13571{
13572 // If any archives are degraded we skip this
9f95a23c 13573 for (auto p = info.hit_set.history.begin();
7c673cae
FG
13574 p != info.hit_set.history.end();
13575 ++p) {
13576 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13577
13578 // Once we hit a degraded object just skip
13579 if (is_degraded_or_backfilling_object(aoid))
13580 return;
28e407b8 13581 if (write_blocked_by_scrub(aoid))
7c673cae
FG
13582 return;
13583 }
13584
13585 if (!info.hit_set.history.empty()) {
9f95a23c 13586 auto p = info.hit_set.history.rbegin();
11fdf7f2 13587 ceph_assert(p != info.hit_set.history.rend());
7c673cae 13588 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
11fdf7f2 13589 ceph_assert(!is_degraded_or_backfilling_object(oid));
7c673cae 13590 ObjectContextRef obc = get_object_context(oid, false);
11fdf7f2 13591 ceph_assert(obc);
7c673cae
FG
13592
13593 OpContextUPtr ctx = simple_opc_create(obc);
13594 ctx->at_version = get_next_version();
13595 ctx->updated_hset_history = info.hit_set;
13596 utime_t now = ceph_clock_now();
13597 ctx->mtime = now;
13598 hit_set_trim(ctx, 0);
13599 simple_opc_submit(std::move(ctx));
13600 }
13601
9f95a23c 13602 recovery_state.update_hset(pg_hit_set_history_t());
7c673cae
FG
13603 if (agent_state) {
13604 agent_state->discard_hit_sets();
13605 }
13606}
13607
13608void PrimaryLogPG::hit_set_create()
13609{
13610 utime_t now = ceph_clock_now();
13611 // make a copy of the params to modify
13612 HitSet::Params params(pool.info.hit_set_params);
13613
13614 dout(20) << __func__ << " " << params << dendl;
13615 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
13616 BloomHitSet::Params *p =
13617 static_cast<BloomHitSet::Params*>(params.impl.get());
13618
13619 // convert false positive rate so it holds up across the full period
13620 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
13621 if (p->get_fpp() <= 0.0)
13622 p->set_fpp(.01); // fpp cannot be zero!
13623
13624 // if we don't have specified size, estimate target size based on the
13625 // previous bin!
13626 if (p->target_size == 0 && hit_set) {
13627 utime_t dur = now - hit_set_start_stamp;
13628 unsigned unique = hit_set->approx_unique_insert_count();
13629 dout(20) << __func__ << " previous set had approx " << unique
13630 << " unique items over " << dur << " seconds" << dendl;
13631 p->target_size = (double)unique * (double)pool.info.hit_set_period
13632 / (double)dur;
13633 }
13634 if (p->target_size <
13635 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
13636 p->target_size = cct->_conf->osd_hit_set_min_size;
13637
13638 if (p->target_size
13639 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
13640 p->target_size = cct->_conf->osd_hit_set_max_size;
13641
13642 p->seed = now.sec();
13643
13644 dout(10) << __func__ << " target_size " << p->target_size
13645 << " fpp " << p->get_fpp() << dendl;
13646 }
13647 hit_set.reset(new HitSet(params));
13648 hit_set_start_stamp = now;
13649}
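
// Worked numbers for the bloom parameter tuning above (values are
// hypothetical). With hit_set_count N filters per period, a false
// positive occurs if any one filter misfires, so for small rates the
// union fpp is roughly N times the per-filter fpp; dividing by N keeps
// the configured rate meaningful across the whole period:
//
//   pool fpp P = 0.05, N = 4  ->  per-bin fpp = 0.05 / 4 = 0.0125
//
// The size estimate scales the previous bin's unique-insert count up
// to a full period:
//
//   2000 uniques over 600 s, hit_set_period 1200 s
//   target_size = 2000 * 1200 / 600 = 4000 entries
//
// before clamping to [osd_hit_set_min_size, osd_hit_set_max_size].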
13650
13651/**
13652 * apply log entries to set
13653 *
13654 * this would only happen after peering, to at least capture writes
13655 * during an interval that was potentially lost.
13656 */
13657bool PrimaryLogPG::hit_set_apply_log()
13658{
13659 if (!hit_set)
13660 return false;
13661
13662 eversion_t to = info.last_update;
13663 eversion_t from = info.hit_set.current_last_update;
13664 if (to <= from) {
13665 dout(20) << __func__ << " no update" << dendl;
13666 return false;
13667 }
13668
13669 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
9f95a23c
TL
13670 list<pg_log_entry_t>::const_reverse_iterator p =
13671 recovery_state.get_pg_log().get_log().log.rbegin();
13672 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
7c673cae 13673 ++p;
9f95a23c 13674 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
7c673cae
FG
13675 hit_set->insert(p->soid);
13676 ++p;
13677 }
13678
13679 return true;
13680}
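
// The replay window above in miniature: every object written in
// (current_last_update, last_update] is inserted, so writes from an
// interval the hit set may have missed are still captured. Stand-in
// types are hypothetical.
#if 0 // illustrative sketch only
#include <cstdint>
#include <set>
#include <string>
#include <vector>

struct Entry { std::string soid; uint64_t version; };

void apply_window(std::set<std::string> &hit_set,
                  const std::vector<Entry> &log,  // oldest..newest
                  uint64_t from, uint64_t to)     // replay (from, to]
{
  for (auto p = log.rbegin(); p != log.rend(); ++p) {
    if (p->version > to)
      continue;               // newer than the window, keep scanning back
    if (p->version <= from)
      break;                  // everything older was already captured
    hit_set.insert(p->soid);
  }
}
#endif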
13681
13682void PrimaryLogPG::hit_set_persist()
13683{
13684 dout(10) << __func__ << dendl;
13685 bufferlist bl;
13686 unsigned max = pool.info.hit_set_count;
13687
13688 utime_t now = ceph_clock_now();
13689 hobject_t oid;
13690
13691 // If any archives are degraded we skip this persist request
13692 // account for the additional entry being added below
9f95a23c 13693 for (auto p = info.hit_set.history.begin();
7c673cae
FG
13694 p != info.hit_set.history.end();
13695 ++p) {
13696 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13697
13698 // Once we hit a degraded object just skip further trim
13699 if (is_degraded_or_backfilling_object(aoid))
13700 return;
28e407b8 13701 if (write_blocked_by_scrub(aoid))
7c673cae
FG
13702 return;
13703 }
13704
13705 // If backfill is in progress and we could possibly overlap with the
13706 // hit_set_* objects, back off. Since these all have
13707 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13708 // look just at that. This is necessary because our transactions
13709 // may include a modify of the new hit_set *and* a delete of the
13710 // old one, and this may span the backfill boundary.
9f95a23c
TL
13711 for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
13712 p != get_backfill_targets().end();
7c673cae 13713 ++p) {
9f95a23c 13714 const pg_info_t& pi = recovery_state.get_peer_info(*p);
7c673cae
FG
13715 if (pi.last_backfill == hobject_t() ||
13716 pi.last_backfill.get_hash() == info.pgid.ps()) {
13717 dout(10) << __func__ << " backfill target osd." << *p
13718 << " last_backfill has not progressed past pgid ps"
13719 << dendl;
13720 return;
13721 }
13722 }
13723
13724
13725 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
13726 new_hset.begin = hit_set_start_stamp;
13727 new_hset.end = now;
13728 oid = get_hit_set_archive_object(
13729 new_hset.begin,
13730 new_hset.end,
13731 new_hset.using_gmt);
13732
13733 // If the current object is degraded we skip this persist request
28e407b8 13734 if (write_blocked_by_scrub(oid))
7c673cae
FG
13735 return;
13736
13737 hit_set->seal();
11fdf7f2 13738 encode(*hit_set, bl);
7c673cae
FG
13739 dout(20) << __func__ << " archive " << oid << dendl;
13740
13741 if (agent_state) {
13742 agent_state->add_hit_set(new_hset.begin, hit_set);
13743 uint32_t size = agent_state->hit_set_map.size();
13744 if (size >= pool.info.hit_set_count) {
13745 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
13746 }
13747 hit_set_in_memory_trim(size);
13748 }
13749
13750 ObjectContextRef obc = get_object_context(oid, true);
13751 OpContextUPtr ctx = simple_opc_create(obc);
13752
13753 ctx->at_version = get_next_version();
13754 ctx->updated_hset_history = info.hit_set;
13755 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
13756
13757 updated_hit_set_hist.current_last_update = info.last_update;
13758 new_hset.version = ctx->at_version;
13759
13760 updated_hit_set_hist.history.push_back(new_hset);
13761 hit_set_create();
13762
13763 // fabricate an object_info_t and SnapSet
13764 obc->obs.oi.version = ctx->at_version;
13765 obc->obs.oi.mtime = now;
13766 obc->obs.oi.size = bl.length();
13767 obc->obs.exists = true;
13768 obc->obs.oi.set_data_digest(bl.crc32c(-1));
13769
13770 ctx->new_obs = obc->obs;
13771
7c673cae
FG
13772 ctx->new_snapset = obc->ssc->snapset;
13773
13774 ctx->delta_stats.num_objects++;
13775 ctx->delta_stats.num_objects_hit_set_archive++;
11fdf7f2 13776
7c673cae
FG
13777 ctx->delta_stats.num_bytes += bl.length();
13778 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
13779
13780 bufferlist bss;
11fdf7f2 13781 encode(ctx->new_snapset, bss);
7c673cae 13782 bufferlist boi(sizeof(ctx->new_obs.oi));
11fdf7f2 13783 encode(ctx->new_obs.oi, boi,
7c673cae
FG
13784 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13785
13786 ctx->op_t->create(oid);
13787 if (bl.length()) {
13788 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
9f95a23c
TL
13789 write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
13790 0, bl.length());
13791 ctx->clean_regions.mark_data_region_dirty(0, bl.length());
7c673cae
FG
13792 }
13793 map <string, bufferlist> attrs;
13794 attrs[OI_ATTR].claim(boi);
13795 attrs[SS_ATTR].claim(bss);
11fdf7f2 13796 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
7c673cae
FG
13797 ctx->log.push_back(
13798 pg_log_entry_t(
13799 pg_log_entry_t::MODIFY,
13800 oid,
13801 ctx->at_version,
13802 eversion_t(),
13803 0,
13804 osd_reqid_t(),
13805 ctx->mtime,
13806 0)
13807 );
9f95a23c 13808 ctx->log.back().clean_regions = ctx->clean_regions;
7c673cae
FG
13809
13810 hit_set_trim(ctx, max);
13811
13812 simple_opc_submit(std::move(ctx));
13813}
13814
13815void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
13816{
11fdf7f2 13817 ceph_assert(ctx->updated_hset_history);
7c673cae
FG
13818 pg_hit_set_history_t &updated_hit_set_hist =
13819 *(ctx->updated_hset_history);
13820 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
13821 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
11fdf7f2 13822 ceph_assert(p != updated_hit_set_hist.history.end());
7c673cae
FG
13823 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13824
11fdf7f2 13825 ceph_assert(!is_degraded_or_backfilling_object(oid));
7c673cae
FG
13826
13827 dout(20) << __func__ << " removing " << oid << dendl;
13828 ++ctx->at_version.version;
13829 ctx->log.push_back(
13830 pg_log_entry_t(pg_log_entry_t::DELETE,
13831 oid,
13832 ctx->at_version,
13833 p->version,
13834 0,
13835 osd_reqid_t(),
13836 ctx->mtime,
13837 0));
13838
13839 ctx->op_t->remove(oid);
13840 updated_hit_set_hist.history.pop_front();
13841
13842 ObjectContextRef obc = get_object_context(oid, false);
11fdf7f2 13843 ceph_assert(obc);
7c673cae
FG
13844 --ctx->delta_stats.num_objects;
13845 --ctx->delta_stats.num_objects_hit_set_archive;
13846 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
13847 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
13848 }
13849}
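
// hit_set_trim is a bounded FIFO: the history list is ordered
// oldest-first, so trimming to `max` always deletes the oldest archives
// together with their on-disk objects. A minimal stand-in:
#if 0 // illustrative sketch only
#include <deque>
#include <functional>
#include <string>

void trim_history(std::deque<std::string> &history, unsigned max,
                  std::function<void(const std::string&)> remove_archive)
{
  while (history.size() > max) {
    remove_archive(history.front());  // queue the on-disk delete
    history.pop_front();              // drop the oldest record
  }
}
#endif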
13850
13851void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
13852{
13853 while (agent_state->hit_set_map.size() > max_in_memory) {
13854 agent_state->remove_oldest_hit_set();
13855 }
13856}
13857
13858
13859// =======================================
13860// cache agent
13861
13862void PrimaryLogPG::agent_setup()
13863{
11fdf7f2 13864 ceph_assert(is_locked());
7c673cae
FG
13865 if (!is_active() ||
13866 !is_primary() ||
11fdf7f2 13867 state_test(PG_STATE_PREMERGE) ||
7c673cae
FG
13868 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13869 pool.info.tier_of < 0 ||
13870 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13871 agent_clear();
13872 return;
13873 }
13874 if (!agent_state) {
13875 agent_state.reset(new TierAgentState);
13876
13877 // choose random starting position
13878 agent_state->position = hobject_t();
13879 agent_state->position.pool = info.pgid.pool();
13880 agent_state->position.set_hash(pool.info.get_random_pg_position(
13881 info.pgid.pgid,
13882 rand()));
13883 agent_state->start = agent_state->position;
13884
13885 dout(10) << __func__ << " allocated new state, position "
13886 << agent_state->position << dendl;
13887 } else {
13888 dout(10) << __func__ << " keeping existing state" << dendl;
13889 }
13890
13891 if (info.stats.stats_invalid) {
13892 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13893 }
13894
13895 agent_choose_mode();
13896}
13897
13898void PrimaryLogPG::agent_clear()
13899{
13900 agent_stop();
13901 agent_state.reset(NULL);
13902}
13903
13904 // Return false if no objects were operated on since the start of the object hash space
13905bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13906{
9f95a23c 13907 std::scoped_lock locker{*this};
7c673cae
FG
13908 if (!agent_state) {
13909 dout(10) << __func__ << " no agent state, stopping" << dendl;
7c673cae
FG
13910 return true;
13911 }
13912
9f95a23c 13913 ceph_assert(!recovery_state.is_deleting());
7c673cae
FG
13914
13915 if (agent_state->is_idle()) {
13916 dout(10) << __func__ << " idle, stopping" << dendl;
7c673cae
FG
13917 return true;
13918 }
13919
13920 osd->logger->inc(l_osd_agent_wake);
13921
13922 dout(10) << __func__
13923 << " max " << start_max
13924 << ", flush " << agent_state->get_flush_mode_name()
13925 << ", evict " << agent_state->get_evict_mode_name()
13926 << ", pos " << agent_state->position
13927 << dendl;
11fdf7f2
TL
13928 ceph_assert(is_primary());
13929 ceph_assert(is_active());
7c673cae
FG
13930
13931 agent_load_hit_sets();
13932
13933 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
11fdf7f2 13934 ceph_assert(base_pool);
7c673cae
FG
13935
13936 int ls_min = 1;
13937 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13938
13939 // list some objects. this conveniently lists clones (oldest to
13940 // newest) before heads... the same order we want to flush in.
13941 //
13942 // NOTE: do not flush the Sequencer. we will assume that the
13943 // listing we get back is imprecise.
13944 vector<hobject_t> ls;
13945 hobject_t next;
13946 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13947 &ls, &next);
11fdf7f2 13948 ceph_assert(r >= 0);
7c673cae
FG
13949 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13950 int started = 0;
13951 for (vector<hobject_t>::iterator p = ls.begin();
13952 p != ls.end();
13953 ++p) {
13954 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13955 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13956 osd->logger->inc(l_osd_agent_skip);
13957 continue;
13958 }
13959 if (is_degraded_or_backfilling_object(*p)) {
13960 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13961 osd->logger->inc(l_osd_agent_skip);
13962 continue;
13963 }
13964 if (is_missing_object(p->get_head())) {
13965 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13966 osd->logger->inc(l_osd_agent_skip);
13967 continue;
13968 }
13969 ObjectContextRef obc = get_object_context(*p, false, NULL);
13970 if (!obc) {
13971 // we didn't flush; we may miss something here.
13972 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13973 osd->logger->inc(l_osd_agent_skip);
13974 continue;
13975 }
13976 if (!obc->obs.exists) {
13977 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13978 osd->logger->inc(l_osd_agent_skip);
13979 continue;
13980 }
28e407b8
AA
13981 if (range_intersects_scrub(obc->obs.oi.soid,
13982 obc->obs.oi.soid.get_head())) {
7c673cae
FG
13983 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13984 osd->logger->inc(l_osd_agent_skip);
13985 continue;
13986 }
13987 if (obc->is_blocked()) {
13988 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13989 osd->logger->inc(l_osd_agent_skip);
13990 continue;
13991 }
13992 if (obc->is_request_pending()) {
13993 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13994 osd->logger->inc(l_osd_agent_skip);
13995 continue;
13996 }
13997
13998 // be careful flushing omap to an EC pool.
13999 if (!base_pool->supports_omap() &&
14000 obc->obs.oi.is_omap()) {
14001 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14002 osd->logger->inc(l_osd_agent_skip);
14003 continue;
14004 }
14005
14006 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14007 agent_maybe_evict(obc, false))
14008 ++started;
14009 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14010 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14011 ++started;
14012 --agent_flush_quota;
14013 }
14014 if (started >= start_max) {
14015 // If finishing early, set "next" to the next object
14016 if (++p != ls.end())
14017 next = *p;
14018 break;
14019 }
14020 }
14021
14022 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14023 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14024 agent_state->hist_age = 0;
14025 agent_state->temp_hist.decay();
14026 }
14027
14028 // Total objects operated on so far
14029 int total_started = agent_state->started + started;
14030 bool need_delay = false;
14031
14032 dout(20) << __func__ << " start pos " << agent_state->position
14033 << " next start pos " << next
14034 << " started " << total_started << dendl;
14035
14036 // See if we've made a full pass over the object hash space
14037 // This might check at most ls_max objects a second time to notice that
14038 // we've checked every object at least once.
14039 if (agent_state->position < agent_state->start &&
14040 next >= agent_state->start) {
14041 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14042 if (total_started == 0)
14043 need_delay = true;
14044 else
14045 total_started = 0;
14046 agent_state->start = next;
14047 }
14048 agent_state->started = total_started;
14049
14050 // See if we are starting from the beginning
14051 if (next.is_max())
14052 agent_state->position = hobject_t();
14053 else
14054 agent_state->position = next;
14055
14056 // Discard old in memory HitSets
14057 hit_set_in_memory_trim(pool.info.hit_set_count);
14058
14059 if (need_delay) {
11fdf7f2 14060 ceph_assert(agent_state->delaying == false);
7c673cae 14061 agent_delay();
7c673cae
FG
14062 return false;
14063 }
14064 agent_choose_mode();
7c673cae
FG
14065 return true;
14066}
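
// The full-pass bookkeeping above, reduced to its core: the agent walks
// a circular hash space from `start`, and a pass is complete once the
// cursor wraps back past `start`. If an entire pass started no work the
// caller backs off via agent_delay() instead of spinning. Stand-in
// types are hypothetical.
#if 0 // illustrative sketch only
#include <cstdint>

struct Cursor { uint64_t start, position; };

// True when the step from `position` to `next` crosses `start`, i.e.
// every object has been considered at least once this pass.
bool completed_pass(const Cursor &c, uint64_t next)
{
  return c.position < c.start && next >= c.start;
}
#endif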
14067
14068void PrimaryLogPG::agent_load_hit_sets()
14069{
14070 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14071 return;
14072 }
14073
14074 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14075 dout(10) << __func__ << dendl;
9f95a23c 14076 for (auto p = info.hit_set.history.begin();
7c673cae
FG
14077 p != info.hit_set.history.end(); ++p) {
14078 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14079 dout(10) << __func__ << " loading " << p->begin << "-"
14080 << p->end << dendl;
14081 if (!pool.info.is_replicated()) {
14082 // FIXME: EC not supported here yet
14083 derr << __func__ << " on non-replicated pool" << dendl;
14084 break;
14085 }
14086
14087 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14088 if (is_unreadable_object(oid)) {
14089 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14090 break;
14091 }
14092
14093 ObjectContextRef obc = get_object_context(oid, false);
14094 if (!obc) {
14095 derr << __func__ << ": could not load hitset " << oid << dendl;
14096 break;
14097 }
14098
14099 bufferlist bl;
14100 {
7c673cae 14101 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
11fdf7f2 14102 ceph_assert(r >= 0);
7c673cae
FG
14103 }
14104 HitSetRef hs(new HitSet);
11fdf7f2
TL
14105 bufferlist::const_iterator pbl = bl.begin();
14106 decode(*hs, pbl);
7c673cae
FG
14107 agent_state->add_hit_set(p->begin.sec(), hs);
14108 }
14109 }
14110 }
14111}
14112
14113bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14114{
14115 if (!obc->obs.oi.is_dirty()) {
14116 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14117 osd->logger->inc(l_osd_agent_skip);
14118 return false;
14119 }
14120 if (obc->obs.oi.is_cache_pinned()) {
14121 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14122 osd->logger->inc(l_osd_agent_skip);
14123 return false;
14124 }
14125
14126 utime_t now = ceph_clock_now();
14127 utime_t ob_local_mtime;
14128 if (obc->obs.oi.local_mtime != utime_t()) {
14129 ob_local_mtime = obc->obs.oi.local_mtime;
14130 } else {
14131 ob_local_mtime = obc->obs.oi.mtime;
14132 }
14133 bool evict_mode_full =
14134 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14135 if (!evict_mode_full &&
14136 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14137 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14138 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14139 osd->logger->inc(l_osd_agent_skip);
14140 return false;
14141 }
14142
14143 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14144 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14145 osd->logger->inc(l_osd_agent_skip);
14146 return false;
14147 }
14148
14149 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14150
14151 // FIXME: flush anything dirty, regardless of what distribution of
14152 // ages we expect.
14153
14154 hobject_t oid = obc->obs.oi.soid;
14155 osd->agent_start_op(oid);
14156 // no need to capture a pg ref, can't outlive fop or ctx
14157 std::function<void()> on_flush = [this, oid]() {
14158 osd->agent_finish_op(oid);
14159 };
14160
14161 int result = start_flush(
14162 OpRequestRef(), obc, false, NULL,
14163 on_flush);
14164 if (result != -EINPROGRESS) {
14165 on_flush();
14166 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14167 << " with " << result << dendl;
14168 osd->logger->inc(l_osd_agent_skip);
14169 return false;
14170 }
14171
14172 osd->logger->inc(l_osd_agent_flush);
14173 return true;
14174}
14175
14176bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14177{
14178 const hobject_t& soid = obc->obs.oi.soid;
14179 if (!after_flush && obc->obs.oi.is_dirty()) {
14180 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14181 return false;
14182 }
81eedcae
TL
14183 // This is already checked by agent_work() which passes after_flush = false
14184 if (after_flush && range_intersects_scrub(soid, soid.get_head())) {
14185 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14186 return false;
14187 }
7c673cae
FG
14188 if (!obc->obs.oi.watchers.empty()) {
14189 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14190 return false;
14191 }
14192 if (obc->is_blocked()) {
14193 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14194 return false;
14195 }
14196 if (obc->obs.oi.is_cache_pinned()) {
14197 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14198 return false;
14199 }
14200
14201 if (soid.snap == CEPH_NOSNAP) {
14202 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
14203 if (result < 0) {
14204 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
14205 return false;
14206 }
14207 }
14208
14209 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
14210 // is this object older than cache_min_evict_age?
14211 utime_t now = ceph_clock_now();
14212 utime_t ob_local_mtime;
14213 if (obc->obs.oi.local_mtime != utime_t()) {
14214 ob_local_mtime = obc->obs.oi.local_mtime;
14215 } else {
14216 ob_local_mtime = obc->obs.oi.mtime;
14217 }
14218 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
14219 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14220 osd->logger->inc(l_osd_agent_skip);
14221 return false;
14222 }
14223 // is this object old and/or cold enough?
14224 int temp = 0;
14225 uint64_t temp_upper = 0, temp_lower = 0;
14226 if (hit_set)
14227 agent_estimate_temp(soid, &temp);
14228 agent_state->temp_hist.add(temp);
14229 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
14230
14231 dout(20) << __func__
14232 << " temp " << temp
14233 << " pos " << temp_lower << "-" << temp_upper
14234 << ", evict_effort " << agent_state->evict_effort
14235 << dendl;
14236 dout(30) << "agent_state:\n";
14237 Formatter *f = Formatter::create("");
14238 f->open_object_section("agent_state");
14239 agent_state->dump(f);
14240 f->close_section();
14241 f->flush(*_dout);
14242 delete f;
14243 *_dout << dendl;
14244
14245 if (1000000 - temp_upper >= agent_state->evict_effort)
14246 return false;
14247 }
14248
14249 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
14250 OpContextUPtr ctx = simple_opc_create(obc);
14251
11fdf7f2 14252 auto null_op_req = OpRequestRef();
7c673cae 14253 if (!ctx->lock_manager.get_lock_type(
9f95a23c 14254 RWState::RWWRITE,
7c673cae
FG
14255 obc->obs.oi.soid,
14256 obc,
11fdf7f2 14257 null_op_req)) {
7c673cae
FG
14258 close_op_ctx(ctx.release());
14259 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
14260 return false;
14261 }
14262
14263 osd->agent_start_evict_op();
14264 ctx->register_on_finish(
14265 [this]() {
14266 osd->agent_finish_evict_op();
14267 });
14268
14269 ctx->at_version = get_next_version();
11fdf7f2 14270 ceph_assert(ctx->new_obs.exists);
7c673cae
FG
14271 int r = _delete_oid(ctx.get(), true, false);
14272 if (obc->obs.oi.is_omap())
14273 ctx->delta_stats.num_objects_omap--;
14274 ctx->delta_stats.num_evict++;
11fdf7f2 14275 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
7c673cae
FG
14276 if (obc->obs.oi.is_dirty())
14277 --ctx->delta_stats.num_objects_dirty;
11fdf7f2
TL
14278 ceph_assert(r == 0);
14279 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
7c673cae
FG
14280 simple_opc_submit(std::move(ctx));
14281 osd->logger->inc(l_osd_tier_evict);
14282 osd->logger->inc(l_osd_agent_evict);
14283 return true;
14284}
14285
14286void PrimaryLogPG::agent_stop()
14287{
14288 dout(20) << __func__ << dendl;
14289 if (agent_state && !agent_state->is_idle()) {
14290 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14291 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14292 osd->agent_disable_pg(this, agent_state->evict_effort);
14293 }
14294}
14295
14296void PrimaryLogPG::agent_delay()
14297{
14298 dout(20) << __func__ << dendl;
14299 if (agent_state && !agent_state->is_idle()) {
11fdf7f2 14300 ceph_assert(agent_state->delaying == false);
7c673cae
FG
14301 agent_state->delaying = true;
14302 osd->agent_disable_pg(this, agent_state->evict_effort);
14303 }
14304}
14305
14306void PrimaryLogPG::agent_choose_mode_restart()
14307{
14308 dout(20) << __func__ << dendl;
9f95a23c 14309 std::scoped_lock locker{*this};
7c673cae
FG
14310 if (agent_state && agent_state->delaying) {
14311 agent_state->delaying = false;
14312 agent_choose_mode(true);
14313 }
7c673cae
FG
14314}
14315
14316bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14317{
14318 bool requeued = false;
14319 // Let delay play out
14320 if (agent_state->delaying) {
11fdf7f2 14321 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
7c673cae
FG
14322 return requeued;
14323 }
14324
14325 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14326 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14327 unsigned evict_effort = 0;
14328
14329 if (info.stats.stats_invalid) {
14330 // idle; stats can't be trusted until we scrub.
14331 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14332 goto skip_calc;
14333 }
14334
14335 {
14336 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
11fdf7f2 14337 ceph_assert(divisor > 0);
7c673cae
FG
14338
14339 // adjust (effective) user objects down based on the number
14340 // of HitSet objects, which should not count toward our total since
14341 // they cannot be flushed.
14342 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14343
14344 // also exclude omap objects if ec backing pool
14345 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
11fdf7f2 14346 ceph_assert(base_pool);
7c673cae
FG
14347 if (!base_pool->supports_omap())
14348 unflushable += info.stats.stats.sum.num_objects_omap;
14349
14350 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14351 if (num_user_objects > unflushable)
14352 num_user_objects -= unflushable;
14353 else
14354 num_user_objects = 0;
14355
14356 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14357 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14358 num_user_bytes -= unflushable_bytes;
14359 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14360 num_user_bytes += num_overhead_bytes;
14361
14362 // also reduce the num_dirty by num_objects_omap
14363 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14364 if (!base_pool->supports_omap()) {
14365 if (num_dirty > info.stats.stats.sum.num_objects_omap)
14366 num_dirty -= info.stats.stats.sum.num_objects_omap;
14367 else
14368 num_dirty = 0;
14369 }
14370
14371 dout(10) << __func__
14372 << " flush_mode: "
14373 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14374 << " evict_mode: "
14375 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14376 << " num_objects: " << info.stats.stats.sum.num_objects
14377 << " num_bytes: " << info.stats.stats.sum.num_bytes
14378 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14379 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14380 << " num_dirty: " << num_dirty
14381 << " num_user_objects: " << num_user_objects
14382 << " num_user_bytes: " << num_user_bytes
14383 << " num_overhead_bytes: " << num_overhead_bytes
14384 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14385 << " pool.info.target_max_objects: " << pool.info.target_max_objects
14386 << dendl;
14387
14388 // get dirty, full ratios
14389 uint64_t dirty_micro = 0;
14390 uint64_t full_micro = 0;
14391 if (pool.info.target_max_bytes && num_user_objects > 0) {
14392 uint64_t avg_size = num_user_bytes / num_user_objects;
14393 dirty_micro =
14394 num_dirty * avg_size * 1000000 /
11fdf7f2 14395 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
7c673cae
FG
14396 full_micro =
14397 num_user_objects * avg_size * 1000000 /
11fdf7f2 14398 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
7c673cae
FG
14399 }
14400 if (pool.info.target_max_objects > 0) {
14401 uint64_t dirty_objects_micro =
14402 num_dirty * 1000000 /
11fdf7f2 14403 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
7c673cae
FG
14404 if (dirty_objects_micro > dirty_micro)
14405 dirty_micro = dirty_objects_micro;
14406 uint64_t full_objects_micro =
14407 num_user_objects * 1000000 /
11fdf7f2 14408 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
7c673cae
FG
14409 if (full_objects_micro > full_micro)
14410 full_micro = full_objects_micro;
14411 }
14412 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14413 << " full " << ((float)full_micro / 1000000.0)
14414 << dendl;
14415
14416 // flush mode
14417 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14418 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14419 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14420 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14421 flush_target += flush_slop;
14422 flush_high_target += flush_slop;
14423 } else {
11fdf7f2
TL
14424 flush_target -= std::min(flush_target, flush_slop);
14425 flush_high_target -= std::min(flush_high_target, flush_slop);
7c673cae
FG
14426 }
14427
14428 if (dirty_micro > flush_high_target) {
14429 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
11fdf7f2 14430 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
7c673cae
FG
14431 flush_mode = TierAgentState::FLUSH_MODE_LOW;
14432 }
14433
14434 // evict mode
14435 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14436 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14437 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14438 evict_target += evict_slop;
14439 else
11fdf7f2 14440 evict_target -= std::min(evict_target, evict_slop);
7c673cae
FG
14441
14442 if (full_micro > 1000000) {
14443 // evict anything clean
14444 evict_mode = TierAgentState::EVICT_MODE_FULL;
14445 evict_effort = 1000000;
14446 } else if (full_micro > evict_target) {
14447 // set effort in [0..1] range based on where we are between evict_target and 100% full
14448 evict_mode = TierAgentState::EVICT_MODE_SOME;
14449 uint64_t over = full_micro - evict_target;
14450 uint64_t span = 1000000 - evict_target;
11fdf7f2
TL
14451 evict_effort = std::max(over * 1000000 / span,
14452 uint64_t(1000000.0 *
14453 cct->_conf->osd_agent_min_evict_effort));
7c673cae
FG
14454
14455 // quantize effort to avoid too much reordering in the agent_queue.
14456 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
11fdf7f2 14457 ceph_assert(inc > 0);
7c673cae
FG
14458 uint64_t was = evict_effort;
14459 evict_effort -= evict_effort % inc;
14460 if (evict_effort < inc)
14461 evict_effort = inc;
11fdf7f2 14462 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
7c673cae
FG
14463 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
14464 }
14465 }
14466
14467 skip_calc:
14468 bool old_idle = agent_state->is_idle();
14469 if (flush_mode != agent_state->flush_mode) {
14470 dout(5) << __func__ << " flush_mode "
14471 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14472 << " -> "
14473 << TierAgentState::get_flush_mode_name(flush_mode)
14474 << dendl;
9f95a23c
TL
14475 recovery_state.update_stats(
14476 [=](auto &history, auto &stats) {
14477 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14478 osd->agent_inc_high_count();
14479 stats.stats.sum.num_flush_mode_high = 1;
14480 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14481 stats.stats.sum.num_flush_mode_low = 1;
14482 }
14483 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14484 osd->agent_dec_high_count();
14485 stats.stats.sum.num_flush_mode_high = 0;
14486 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14487 stats.stats.sum.num_flush_mode_low = 0;
14488 }
14489 return false;
14490 });
7c673cae
FG
14491 agent_state->flush_mode = flush_mode;
14492 }
14493 if (evict_mode != agent_state->evict_mode) {
14494 dout(5) << __func__ << " evict_mode "
14495 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14496 << " -> "
14497 << TierAgentState::get_evict_mode_name(evict_mode)
14498 << dendl;
14499 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14500 is_active()) {
14501 if (op)
14502 requeue_op(op);
b32b8144 14503 requeue_ops(waiting_for_flush);
7c673cae 14504 requeue_ops(waiting_for_active);
9f95a23c 14505 requeue_ops(waiting_for_readable);
7c673cae
FG
14506 requeue_ops(waiting_for_scrub);
14507 requeue_ops(waiting_for_cache_not_full);
14508 objects_blocked_on_cache_full.clear();
14509 requeued = true;
14510 }
9f95a23c
TL
14511 recovery_state.update_stats(
14512 [=](auto &history, auto &stats) {
14513 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14514 stats.stats.sum.num_evict_mode_some = 1;
14515 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14516 stats.stats.sum.num_evict_mode_full = 1;
14517 }
14518 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14519 stats.stats.sum.num_evict_mode_some = 0;
14520 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14521 stats.stats.sum.num_evict_mode_full = 0;
14522 }
14523 return false;
14524 });
7c673cae
FG
14525 agent_state->evict_mode = evict_mode;
14526 }
14527 uint64_t old_effort = agent_state->evict_effort;
14528 if (evict_effort != agent_state->evict_effort) {
14529 dout(5) << __func__ << " evict_effort "
14530 << ((float)agent_state->evict_effort / 1000000.0)
14531 << " -> "
14532 << ((float)evict_effort / 1000000.0)
14533 << dendl;
14534 agent_state->evict_effort = evict_effort;
14535 }
14536
14537 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14538 // (including flush). This is probably fine (they should be
14539 // correlated) but it is not precisely correct.
14540 if (agent_state->is_idle()) {
14541 if (!restart && !old_idle) {
14542 osd->agent_disable_pg(this, old_effort);
14543 }
14544 } else {
14545 if (restart || old_idle) {
14546 osd->agent_enable_pg(this, agent_state->evict_effort);
14547 } else if (old_effort != agent_state->evict_effort) {
14548 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
14549 }
14550 }
14551 return requeued;
14552}
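
// A worked pass through the evict-mode arithmetic above, with
// hypothetical numbers and the slop adjustments omitted. Say this PG's
// share of target_max_bytes is 1,000,000,000 bytes and user data
// consumes 850,000,000 bytes:
//
//   full_micro   = 850000   (85% full, in millionths)
//   evict_target = 800000   (cache_target_full_ratio 0.8)
//   over = 850000 - 800000 =  50000
//   span = 1000000 - 800000 = 200000
//   evict_effort = over * 1000000 / span = 250000   (25% effort)
//
// With osd_agent_quantize_effort = 0.02 the effort is then rounded
// down to a multiple of 20000:
//
//   250000 - (250000 % 20000) = 240000
//
// so small load fluctuations do not constantly reorder the agent_queue.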
14553
14554void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
14555{
11fdf7f2
TL
14556 ceph_assert(hit_set);
14557 ceph_assert(temp);
7c673cae
FG
14558 *temp = 0;
14559 if (hit_set->contains(oid))
14560 *temp = 1000000;
14561 unsigned i = 0;
14562 int last_n = pool.info.hit_set_search_last_n;
14563 for (map<time_t,HitSetRef>::reverse_iterator p =
14564 agent_state->hit_set_map.rbegin(); last_n > 0 &&
14565 p != agent_state->hit_set_map.rend(); ++p, ++i) {
14566 if (p->second->contains(oid)) {
14567 *temp += pool.info.get_grade(i);
14568 --last_n;
14569 }
14570 }
14571}
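
// Worked example of the grading above, under hypothetical pool settings
// where get_grade(0) = 1000000, get_grade(1) = 500000 and
// get_grade(2) = 250000 (a 50% decay per archived set), with
// hit_set_search_last_n capping how many archived sets may contribute.
// An object in the current hit set and the two newest archives scores
//
//   temp = 1000000            (current set)
//        + 1000000 + 500000   (archived sets 0 and 1)
//        = 2500000
//
// while one last seen three sets ago scores only 250000, so the
// temperature histogram separates recently hot objects from cooling
// ones.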
14572
14573// Dup op detection
14574
14575bool PrimaryLogPG::already_complete(eversion_t v)
14576{
14577 dout(20) << __func__ << ": " << v << dendl;
14578 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14579 !i.end();
14580 ++i) {
14581 dout(20) << __func__ << ": " << **i << dendl;
14582 // skip copy from temp object ops
14583 if ((*i)->v == eversion_t()) {
14584 dout(20) << __func__ << ": " << **i
14585 << " version is empty" << dendl;
14586 continue;
14587 }
14588 if ((*i)->v > v) {
14589 dout(20) << __func__ << ": " << **i
14590 << " (*i)->v past v" << dendl;
14591 break;
14592 }
14593 if (!(*i)->all_committed) {
14594 dout(20) << __func__ << ": " << **i
14595 << " not committed, returning false"
14596 << dendl;
14597 return false;
14598 }
14599 }
14600 dout(20) << __func__ << ": returning true" << dendl;
14601 return true;
14602}
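
// already_complete leans on the version-ordered repop queue: scanning
// in ascending version order, the first uncommitted repop at or below
// v disproves completion, and walking past v without finding one proves
// it. A stand-in with hypothetical types:
#if 0 // illustrative sketch only
#include <cstdint>
#include <vector>

struct Repop { uint64_t v; bool all_committed; };

bool version_complete(const std::vector<Repop> &queue,  // ascending v
                      uint64_t v)
{
  for (const auto &r : queue) {
    if (r.v == 0)
      continue;           // copy-from-temp ops carry no version
    if (r.v > v)
      break;              // everything at or below v has committed
    if (!r.all_committed)
      return false;       // an earlier-or-equal write is still in flight
  }
  return true;
}
#endif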
14603
7c673cae
FG
14604
14605// ==========================================================================================
14606// SCRUB
14607
14608
14609bool PrimaryLogPG::_range_available_for_scrub(
14610 const hobject_t &begin, const hobject_t &end)
14611{
14612 pair<hobject_t, ObjectContextRef> next;
14613 next.second = object_contexts.lookup(begin);
14614 next.first = begin;
14615 bool more = true;
14616 while (more && next.first < end) {
14617 if (next.second && next.second->is_blocked()) {
14618 next.second->requeue_scrub_on_unblock = true;
14619 dout(10) << __func__ << ": scrub delayed, "
14620 << next.first << " is blocked"
14621 << dendl;
14622 return false;
14623 }
14624 more = object_contexts.get_next(next.first, &next);
14625 }
14626 return true;
14627}
14628
9f95a23c 14629static bool doing_clones(const std::optional<SnapSet> &snapset,
7c673cae 14630 const vector<snapid_t>::reverse_iterator &curclone) {
9f95a23c 14631 return snapset && curclone != snapset->clones.rend();
7c673cae
FG
14632}
14633
14634void PrimaryLogPG::log_missing(unsigned missing,
9f95a23c 14635 const std::optional<hobject_t> &head,
7c673cae
FG
14636 LogChannelRef clog,
14637 const spg_t &pgid,
14638 const char *func,
14639 const char *mode,
14640 bool allow_incomplete_clones)
14641{
11fdf7f2 14642 ceph_assert(head);
7c673cae 14643 if (allow_incomplete_clones) {
9f95a23c
TL
14644 dout(20) << func << " " << mode << " " << pgid << " " << *head
14645 << " skipped " << missing << " clone(s) in cache tier" << dendl;
7c673cae 14646 } else {
9f95a23c
TL
14647 clog->info() << mode << " " << pgid << " " << *head
14648 << " : " << missing << " missing clone(s)";
7c673cae
FG
14649 }
14650}
14651
9f95a23c
TL
14652unsigned PrimaryLogPG::process_clones_to(const std::optional<hobject_t> &head,
14653 const std::optional<SnapSet> &snapset,
7c673cae
FG
14654 LogChannelRef clog,
14655 const spg_t &pgid,
14656 const char *mode,
14657 bool allow_incomplete_clones,
9f95a23c 14658 std::optional<snapid_t> target,
7c673cae
FG
14659 vector<snapid_t>::reverse_iterator *curclone,
14660 inconsistent_snapset_wrapper &e)
14661{
11fdf7f2
TL
14662 ceph_assert(head);
14663 ceph_assert(snapset);
7c673cae
FG
14664 unsigned missing = 0;
14665
14666 // NOTE: clones are in descending order, hence the **curclone > *target test here
9f95a23c 14667 hobject_t next_clone(*head);
7c673cae
FG
14668 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
14669 ++missing;
14670 // it is okay to be missing one or more clones in a cache tier.
14671 // skip higher-numbered clones in the list.
14672 if (!allow_incomplete_clones) {
14673 next_clone.snap = **curclone;
9f95a23c 14674 clog->error() << mode << " " << pgid << " " << *head
91327a77 14675 << " : expected clone " << next_clone << " " << missing
c07f9fc5 14676 << " missing";
7c673cae
FG
14677 ++scrubber.shallow_errors;
14678 e.set_clone_missing(next_clone.snap);
14679 }
14680 // Clones are descending
14681 ++(*curclone);
14682 }
14683 return missing;
14684}
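
// The clone-accounting loop above in miniature: expected clones are
// visited in descending snap order, so advancing while the iterator is
// above the target snap counts exactly the clones the scrub never saw.
// Stand-in types are hypothetical.
#if 0 // illustrative sketch only
#include <optional>
#include <vector>

// Counts expected clones above `target` that were not seen; a
// std::nullopt target means "count everything that is left".
unsigned missing_clones_to(std::vector<int>::const_iterator &cur,
                           std::vector<int>::const_iterator end,  // descending snaps
                           std::optional<int> target)
{
  unsigned missing = 0;
  while (cur != end && (!target || *cur > *target)) {
    ++missing;  // expected clone absent from the scrub map
    ++cur;      // clones descend; move to the next-lower snap
  }
  return missing;
}
#endif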
14685
14686/*
14687 * Validate consistency of the object info and snap sets.
14688 *
14689 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
14690 * the comparison of the objects is against multiple snapset.clones. There are
11fdf7f2 14691 * multiple clone lists and in between lists we expect head.
7c673cae
FG
14692 *
14693 * Example
14694 *
14695 * objects expected
14696 * ======= =======
11fdf7f2
TL
14697 * obj1 snap 1 head, unexpected obj1 snap 1
14698 * obj2 head head, match
7c673cae
FG
14699 * [SnapSet clones 6 4 2 1]
14700 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14701 * obj2 snap 6 obj2 snap 6, match
14702 * obj2 snap 4 obj2 snap 4, match
11fdf7f2 14703 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
7c673cae
FG
14704 * [Snapset clones 3 1]
14705 * obj3 snap 3 obj3 snap 3 match
14706 * obj3 snap 1 obj3 snap 1 match
11fdf7f2 14707 * obj4 head head, match
7c673cae
FG
14708 * [Snapset clones 4]
14709 * EOL obj4 snap 4, (expected)
14710 */
14711void PrimaryLogPG::scrub_snapshot_metadata(
14712 ScrubMap &scrubmap,
28e407b8 14713 const map<hobject_t,
9f95a23c
TL
14714 pair<std::optional<uint32_t>,
14715 std::optional<uint32_t>>> &missing_digest)
7c673cae
FG
14716{
14717 dout(10) << __func__ << dendl;
14718
7c673cae
FG
14719 bool repair = state_test(PG_STATE_REPAIR);
14720 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14721 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
9f95a23c 14722 std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
7c673cae 14723
7c673cae 14724 // traverse in reverse order.
9f95a23c
TL
14725 std::optional<hobject_t> head;
14726 std::optional<SnapSet> snapset; // If initialized, head (above) will be too
7c673cae
FG
14727 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
14728 unsigned missing = 0;
14729 inconsistent_snapset_wrapper soid_error, head_error;
94b18763 14730 unsigned soid_error_count = 0;
7c673cae 14731
7c673cae
FG
14732 for (map<hobject_t,ScrubMap::object>::reverse_iterator
14733 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
14734 const hobject_t& soid = p->first;
11fdf7f2 14735 ceph_assert(!soid.is_snapdir());
7c673cae
FG
14736 soid_error = inconsistent_snapset_wrapper{soid};
14737 object_stat_sum_t stat;
9f95a23c 14738 std::optional<object_info_t> oi;
7c673cae 14739
11fdf7f2 14740 stat.num_objects++;
7c673cae
FG
14741
14742 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14743 stat.num_objects_hit_set_archive++;
14744
14745 if (soid.is_snap()) {
14746 // it's a clone
14747 stat.num_object_clones++;
14748 }
14749
14750 // basic checks.
14751 if (p->second.attrs.count(OI_ATTR) == 0) {
9f95a23c 14752 oi = std::nullopt;
7c673cae 14753 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14754 << " : no '" << OI_ATTR << "' attr";
7c673cae 14755 ++scrubber.shallow_errors;
94b18763 14756 soid_error.set_info_missing();
7c673cae
FG
14757 } else {
14758 bufferlist bv;
14759 bv.push_back(p->second.attrs[OI_ATTR]);
14760 try {
14761 oi = object_info_t(); // Initialize optional<> before decode into it
9f95a23c 14762 oi->decode(bv);
7c673cae 14763 } catch (buffer::error& e) {
9f95a23c 14764 oi = std::nullopt;
7c673cae 14765 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14766 << " : can't decode '" << OI_ATTR << "' attr " << e.what();
7c673cae 14767 ++scrubber.shallow_errors;
94b18763
FG
14768 soid_error.set_info_corrupted();
14769 soid_error.set_info_missing(); // Not available too
7c673cae
FG
14770 }
14771 }
14772
14773 if (oi) {
14774 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
14775 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14776 << " : on disk size (" << p->second.size
7c673cae
FG
14777 << ") does not match object info size ("
14778 << oi->size << ") adjusted for ondisk to ("
14779 << pgbackend->be_get_ondisk_size(oi->size)
14780 << ")";
14781 soid_error.set_size_mismatch();
14782 ++scrubber.shallow_errors;
14783 }
14784
9f95a23c 14785 dout(20) << mode << " " << soid << " " << *oi << dendl;
7c673cae
FG
14786
14787 // A clone num_bytes will be added later when we have snapset
14788 if (!soid.is_snap()) {
11fdf7f2 14789 stat.num_bytes += oi->size;
7c673cae
FG
14790 }
14791 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14792 stat.num_bytes_hit_set_archive += oi->size;
14793
11fdf7f2
TL
14794 if (oi->is_dirty())
14795 ++stat.num_objects_dirty;
14796 if (oi->is_whiteout())
14797 ++stat.num_whiteouts;
14798 if (oi->is_omap())
14799 ++stat.num_objects_omap;
14800 if (oi->is_cache_pinned())
14801 ++stat.num_objects_pinned;
14802 if (oi->has_manifest())
14803 ++stat.num_objects_manifest;
7c673cae
FG
14804 }
14805
14806 // Check for any problems while processing clones
14807 if (doing_clones(snapset, curclone)) {
9f95a23c 14808 std::optional<snapid_t> target;
7c673cae
FG
14809 // Expecting an object with snap for current head
14810 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
14811
14812 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
9f95a23c 14813 << soid << " while processing " << *head << dendl;
7c673cae
FG
14814
14815 target = all_clones;
14816 } else {
11fdf7f2 14817 ceph_assert(soid.is_snap());
7c673cae
FG
14818 target = soid.snap;
14819 }
14820
14821 // Log any clones we were expecting to be there up to target
14822 // This will set missing, but will be a no-op if snap.soid == *curclone.
14823 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14824 pool.info.allow_incomplete_clones(), target, &curclone,
14825 head_error);
14826 }
14827 bool expected;
14828 // Check doing_clones() again in case we ran process_clones_to()
14829 if (doing_clones(snapset, curclone)) {
11fdf7f2 14830 // A head would have processed all clones above
7c673cae 14831 // or all greater than *curclone.
11fdf7f2 14832 ceph_assert(soid.is_snap() && *curclone <= soid.snap);
7c673cae
FG
14833
14834 // After processing above clone snap should match the expected curclone
14835 expected = (*curclone == soid.snap);
14836 } else {
11fdf7f2 14837 // If we aren't doing clones any longer, then we expect a head
7c673cae
FG
14838 expected = soid.has_snapset();
14839 }
14840 if (!expected) {
14841 // If we couldn't read the head's snapset, just ignore clones
14842 if (head && !snapset) {
14843 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14844 << " : clone ignored due to missing snapset";
7c673cae
FG
14845 } else {
14846 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14847 << " : is an unexpected clone";
7c673cae
FG
14848 }
14849 ++scrubber.shallow_errors;
14850 soid_error.set_headless();
14851 scrubber.store->add_snap_error(pool.id, soid_error);
94b18763 14852 ++soid_error_count;
7c673cae
FG
14853 if (head && soid.get_head() == head->get_head())
14854 head_error.set_clone(soid.snap);
14855 continue;
14856 }
14857
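// At this point soid is "expected": either a head that starts a new
// snapset, or the clone that *curclone said should come next. The head
// branch below also flushes any errors accumulated for the previous
// head before resetting the per-head state.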
14858 // new snapset?
14859 if (soid.has_snapset()) {
14860
14861 if (missing) {
14862 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14863 pool.info.allow_incomplete_clones());
14864 }
14865
14866 // Save previous head error information
94b18763 14867 if (head && (head_error.errors || soid_error_count))
7c673cae
FG
14868 scrubber.store->add_snap_error(pool.id, head_error);
14869 // Set this as a new head object
14870 head = soid;
14871 missing = 0;
14872 head_error = soid_error;
94b18763 14873 soid_error_count = 0;
7c673cae
FG
14874
14875 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14876
14877 if (p->second.attrs.count(SS_ATTR) == 0) {
14878 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14879 << " : no '" << SS_ATTR << "' attr";
7c673cae 14880 ++scrubber.shallow_errors;
9f95a23c 14881 snapset = std::nullopt;
94b18763 14882 head_error.set_snapset_missing();
7c673cae
FG
14883 } else {
14884 bufferlist bl;
14885 bl.push_back(p->second.attrs[SS_ATTR]);
11fdf7f2 14886 auto blp = bl.cbegin();
7c673cae
FG
14887 try {
14888 snapset = SnapSet(); // Initialize optional<> before decoding into it
9f95a23c 14889 decode(*snapset, blp);
94b18763 14890 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
7c673cae 14891 } catch (buffer::error& e) {
9f95a23c 14892 snapset = std::nullopt;
7c673cae 14893 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14894 << " : can't decode '" << SS_ATTR << "' attr " << e.what();
7c673cae 14895 ++scrubber.shallow_errors;
94b18763 14896 head_error.set_snapset_corrupted();
7c673cae
FG
14897 }
14898 }
14899
14900 if (snapset) {
14901 // what will be next?
14902 curclone = snapset->clones.rbegin();
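// clones[] is stored oldest->newest, but scrub visits objects in
// descending order (head first, then clones newest-first), so the
// expected-clone cursor walks the vector in reverse.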
14903
14904 if (!snapset->clones.empty()) {
9f95a23c 14905 dout(20) << " snapset " << *snapset << dendl;
7c673cae
FG
14906 if (snapset->seq == 0) {
14907 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14908 << " : snaps.seq not set";
7c673cae 14909 ++scrubber.shallow_errors;
94b18763 14910 head_error.set_snapset_error();
7c673cae
FG
14911 }
14912 }
7c673cae
FG
14913 }
14914 } else {
11fdf7f2
TL
14915 ceph_assert(soid.is_snap());
14916 ceph_assert(head);
14917 ceph_assert(snapset);
14918 ceph_assert(soid.snap == *curclone);
7c673cae
FG
14919
14920 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14921
14922 if (snapset->clone_size.count(soid.snap) == 0) {
14923 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14924 << " : is missing in clone_size";
7c673cae
FG
14925 ++scrubber.shallow_errors;
14926 soid_error.set_size_mismatch();
14927 } else {
14928 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14929 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14930 << " : size " << oi->size << " != clone_size "
7c673cae
FG
14931 << snapset->clone_size[*curclone];
14932 ++scrubber.shallow_errors;
14933 soid_error.set_size_mismatch();
14934 }
14935
14936 if (snapset->clone_overlap.count(soid.snap) == 0) {
14937 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14938 << " : is missing in clone_overlap";
7c673cae
FG
14939 ++scrubber.shallow_errors;
14940 soid_error.set_size_mismatch();
14941 } else {
14942 // This check mirrors get_clone_bytes(). Its first two asserts
14943 // can't fire because we already know we have a clone_size and
14944 // a clone_overlap; here we verify that the interval_set won't
14945 // trip the last assert.
14946 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14947 const interval_set<uint64_t> &overlap =
14948 snapset->clone_overlap.find(soid.snap)->second;
14949 bool bad_interval_set = false;
14950 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14951 i != overlap.end(); ++i) {
14952 if (size < i.get_len()) {
14953 bad_interval_set = true;
14954 break;
14955 }
14956 size -= i.get_len();
14957 }
14958
14959 if (bad_interval_set) {
14960 osd->clog->error() << mode << " " << info.pgid << " " << soid
91327a77 14961 << " : bad interval_set in clone_overlap";
7c673cae
FG
14962 ++scrubber.shallow_errors;
14963 soid_error.set_size_mismatch();
14964 } else {
14965 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14966 }
14967 }
14968 }
14969
7c673cae
FG
14970 // what's next?
14971 ++curclone;
94b18763 14972 if (soid_error.errors) {
7c673cae 14973 scrubber.store->add_snap_error(pool.id, soid_error);
94b18763
FG
14974 ++soid_error_count;
14975 }
7c673cae
FG
14976 }
14977
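// Fold this object's contribution into the running scrub totals; they
// are compared against (and, on repair, used to replace) the PG's
// stored stats in _scrub_finish() below.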
14978 scrub_cstat.add(stat);
14979 }
14980
14981 if (doing_clones(snapset, curclone)) {
14982 dout(10) << __func__ << " " << mode << " " << info.pgid
9f95a23c 14983 << " No more objects while processing " << *head << dendl;
7c673cae
FG
14984
14985 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14986 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14987 head_error);
14988 }
14989 // Missing clones may have been found by the check above, or earlier,
14990 // before dropping out of the loop on the last head.
14991 if (missing) {
14992 log_missing(missing, head, osd->clog, info.pgid, __func__,
14993 mode, pool.info.allow_incomplete_clones());
14994 }
94b18763 14995 if (head && (head_error.errors || soid_error_count))
7c673cae
FG
14996 scrubber.store->add_snap_error(pool.id, head_error);
14997
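// Deep scrub recorded objects whose stored data/omap digests should be
// (re)set; write the observed values back through ordinary repops so
// later scrubs can validate against them. Each missing_digest entry
// carries an optional data digest and an optional omap digest.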
28e407b8 14998 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
11fdf7f2 14999 ceph_assert(!p->first.is_snapdir());
7c673cae
FG
15000 dout(10) << __func__ << " recording digests for " << p->first << dendl;
15001 ObjectContextRef obc = get_object_context(p->first, false);
15002 if (!obc) {
15003 osd->clog->error() << info.pgid << " " << mode
c07f9fc5 15004 << " cannot get object context for object "
7c673cae
FG
15005 << p->first;
15006 continue;
15007 } else if (obc->obs.oi.soid != p->first) {
15008 osd->clog->error() << info.pgid << " " << mode
91327a77
AA
15009 << " " << p->first
15010 << " : object has a valid oi attr with a mismatched name, "
7c673cae
FG
15011 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
15012 continue;
15013 }
15014 OpContextUPtr ctx = simple_opc_create(obc);
15015 ctx->at_version = get_next_version();
15016 ctx->mtime = utime_t(); // do not update mtime
28e407b8
AA
15017 if (p->second.first) {
15018 ctx->new_obs.oi.set_data_digest(*p->second.first);
15019 } else {
15020 ctx->new_obs.oi.clear_data_digest();
15021 }
15022 if (p->second.second) {
15023 ctx->new_obs.oi.set_omap_digest(*p->second.second);
15024 } else {
15025 ctx->new_obs.oi.clear_omap_digest();
15026 }
7c673cae
FG
15027 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
15028
15029 ctx->register_on_success(
15030 [this]() {
15031 dout(20) << "updating scrub digest" << dendl;
15032 if (--scrubber.num_digest_updates_pending == 0) {
15033 requeue_scrub();
15034 }
15035 });
15036
15037 simple_opc_submit(std::move(ctx));
15038 ++scrubber.num_digest_updates_pending;
15039 }
7c673cae
FG
15040
15041 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
15042}
15043
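// Tearing down scrub state: just reset the per-scrub stat accumulator.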
15044void PrimaryLogPG::_scrub_clear_state()
15045{
15046 scrub_cstat = object_stat_collection_t();
15047}
15048
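// End-of-scrub bookkeeping: if the stored PG stats were flagged invalid,
// adopt the freshly counted ones outright; otherwise compare the two and
// report (and, under repair, fix) any mismatch.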
15049void PrimaryLogPG::_scrub_finish()
15050{
15051 bool repair = state_test(PG_STATE_REPAIR);
15052 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
15053 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
15054
15055 if (info.stats.stats_invalid) {
9f95a23c
TL
15056 recovery_state.update_stats(
15057 [=](auto &history, auto &stats) {
15058 stats.stats = scrub_cstat;
15059 stats.stats_invalid = false;
15060 return false;
15061 });
7c673cae
FG
15062
15063 if (agent_state)
15064 agent_choose_mode();
15065 }
15066
15067 dout(10) << mode << " got "
15068 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15069 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15070 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15071 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15072 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15073 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15074 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
11fdf7f2 15075 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
7c673cae
FG
15076 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
15077 << dendl;
15078
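// Compare each re-derived counter against the stored stats, but only
// where the matching *_stats_invalid flag is clear -- a set flag means
// the stored value is already known to be untrustworthy (typically
// because it predates the counter), so a mismatch there is expected.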
15079 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
15080 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
15081 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
15082 !info.stats.dirty_stats_invalid) ||
15083 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
15084 !info.stats.omap_stats_invalid) ||
15085 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
15086 !info.stats.pin_stats_invalid) ||
15087 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
15088 !info.stats.hitset_stats_invalid) ||
15089 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
15090 !info.stats.hitset_bytes_stats_invalid) ||
11fdf7f2
TL
15091 (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest &&
15092 !info.stats.manifest_stats_invalid) ||
7c673cae
FG
15093 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
15094 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
15095 osd->clog->error() << info.pgid << " " << mode
91327a77 15096 << " : stat mismatch, got "
7c673cae
FG
15097 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15098 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15099 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15100 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15101 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15102 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15103 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
15104 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
11fdf7f2 15105 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
7c673cae
FG
15106 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
15107 ++scrubber.shallow_errors;
15108
15109 if (repair) {
15110 ++scrubber.fixed;
9f95a23c
TL
15111 recovery_state.update_stats(
15112 [this](auto &history, auto &stats) {
15113 stats.stats = scrub_cstat;
15114 stats.dirty_stats_invalid = false;
15115 stats.omap_stats_invalid = false;
15116 stats.hitset_stats_invalid = false;
15117 stats.hitset_bytes_stats_invalid = false;
15118 stats.pin_stats_invalid = false;
15119 stats.manifest_stats_invalid = false;
15120 return false;
15121 });
7c673cae 15122 publish_stats_to_osd();
9f95a23c 15123 recovery_state.share_pg_info();
7c673cae 15124 }
7c673cae 15125 }
224ce89b
WB
15126 // Clear object context cache to get repair information
15127 if (repair)
15128 object_contexts.clear();
7c673cae
FG
15129}
15130
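// Called when the primary hits a read error (e.g. EIO) on an object
// while servicing a client op: record the primary's copy as bad, park
// the op until the object is readable again, flip the PG into
// repair+recovery, and return -EAGAIN so the caller requeues.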
11fdf7f2 15131int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
224ce89b 15132{
11fdf7f2 15133 OpRequestRef op = ctx->op;
224ce89b 15134 // Only supports replicated pools
11fdf7f2
TL
15135 ceph_assert(!pool.info.is_erasure());
15136 ceph_assert(is_primary());
224ce89b
WB
15137
15138 dout(10) << __func__ << " " << soid
9f95a23c 15139 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
224ce89b
WB
15140
15141 if (!is_clean()) {
15142 block_for_clean(soid, op);
15143 return -EAGAIN;
15144 }
15145
9f95a23c 15146 ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
11fdf7f2
TL
15147 auto& oi = ctx->new_obs.oi;
15148 eversion_t v = oi.version;
224ce89b 15149
224ce89b
WB
15150 if (primary_error(soid, v)) {
15151 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15152 // XXX: If we knew that no down OSD could hold a copy of this
15153 // object, it would be nice if we could return EIO here.
15154 // If a "never fail" flag were available, rbd could use it to
15155 // avoid returning EIO until the object is marked lost.
15156
15157 // Drop through to save this op in case an osd comes up with the object.
15158 }
15159
15160 // Restart the op after object becomes readable again
15161 waiting_for_unreadable_object[soid].push_back(op);
15162 op->mark_delayed("waiting for missing object");
15163
15164 if (!eio_errors_to_process) {
15165 eio_errors_to_process = true;
11fdf7f2
TL
15166 ceph_assert(is_clean());
15167 state_set(PG_STATE_REPAIR);
eafe8130 15168 state_clear(PG_STATE_CLEAN);
224ce89b 15169 queue_peering_event(
11fdf7f2
TL
15170 PGPeeringEventRef(
15171 std::make_shared<PGPeeringEvent>(
15172 get_osdmap_epoch(),
15173 get_osdmap_epoch(),
9f95a23c 15174 PeeringState::DoRecovery())));
224ce89b
WB
15175 } else {
15176 // A prior error must have already cleared clean state and queued recovery
15177 // or a map change has triggered re-peering.
15178 // Not inlining the recovery by calling maybe_kick_recovery(soid);
15179 dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
15180 }
15181
15182 return -EAGAIN;
15183}
15184
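// From here down: the snap-trim state machine (boost::statechart). The
// dout prefix is rebound below so messages carry the owning PG's
// prefix, and, for the states themselves, the current state name; state
// entry/exit is logged at debug level 20.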
7c673cae
FG
15185/*---SnapTrimmer Logging---*/
15186#undef dout_prefix
11fdf7f2 15187#define dout_prefix pg->gen_prefix(*_dout)
7c673cae
FG
15188
15189void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15190{
15191 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15192}
15193
15194void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15195{
15196 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15197}
15198
15199/*---SnapTrimmer states---*/
15200#undef dout_prefix
11fdf7f2 15201#define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
7c673cae
FG
15202 << "SnapTrimmer state<" << get_state_name() << ">: ")
15203
15204/* NotTrimming */
15205PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15206 : my_base(ctx),
9f95a23c 15207 NamedState(nullptr, "NotTrimming")
7c673cae
FG
15208{
15209 context< SnapTrimmer >().log_enter(state_name);
15210}
15211
15212void PrimaryLogPG::NotTrimming::exit()
15213{
15214 context< SnapTrimmer >().log_exit(state_name, enter_time);
15215}
15216
15217boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15218{
15219 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15220 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15221
15222 if (!(pg->is_primary() && pg->is_active())) {
15223 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15224 return discard_event();
15225 }
15226 if (!pg->is_clean() ||
15227 pg->snap_trimq.empty()) {
15228 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15229 return discard_event();
15230 }
15231 if (pg->scrubber.active) {
15232 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
7c673cae
FG
15233 return transit< WaitScrub >();
15234 } else {
15235 return transit< Trimming >();
15236 }
15237}
15238
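// The trim reservation has been granted (snap trims are throttled by an
// OSD-wide reserver): pick the lowest snapid still in snap_trimq and
// move to the state that queues the actual trim work.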
15239boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15240{
15241 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15242 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15243
15244 pending = nullptr;
15245 if (!context< SnapTrimmer >().can_trim()) {
15246 post_event(KickTrim());
15247 return transit< NotTrimming >();
15248 }
15249
15250 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15251 ldout(pg->cct, 10) << "WaitReservation: trimming "
15252 << pg->snap_trimq.range_start()
15253 << dendl;
15254 return transit< AwaitAsyncWork >();
15255}
15256
15257/* AwaitAsyncWork */
15258PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15259 : my_base(ctx),
9f95a23c 15260 NamedState(nullptr, "Trimming/AwaitAsyncWork")
7c673cae
FG
15261{
15262 auto *pg = context< SnapTrimmer >().pg;
15263 context< SnapTrimmer >().log_enter(state_name);
15264 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15265 pg->state_set(PG_STATE_SNAPTRIM);
224ce89b 15266 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
7c673cae
FG
15267 pg->publish_stats_to_osd();
15268}
15269
15270boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15271{
15272 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15273 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15274 auto &in_flight = context<Trimming>().in_flight;
11fdf7f2 15275 ceph_assert(in_flight.empty());
7c673cae 15276
11fdf7f2 15277 ceph_assert(pg->is_primary() && pg->is_active());
7c673cae
FG
15278 if (!context< SnapTrimmer >().can_trim()) {
15279 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15280 post_event(KickTrim());
15281 return transit< NotTrimming >();
15282 }
15283
15284 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15285
15286 vector<hobject_t> to_trim;
15287 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15288 to_trim.reserve(max);
15289 int r = pg->snap_mapper.get_next_objects_to_trim(
15290 snap_to_trim,
15291 max,
15292 &to_trim);
15293 if (r != 0 && r != -ENOENT) {
15294 lderr(pg->cct) << "get_next_objects_to_trim returned "
15295 << cpp_strerror(r) << dendl;
11fdf7f2 15296 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
7c673cae
FG
15297 } else if (r == -ENOENT) {
15298 // Done!
15299 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15300
7c673cae 15301 pg->snap_trimq.erase(snap_to_trim);
7c673cae 15302
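// No objects remain for this snap. If it was queued for a repeat pass,
// just drop the repeat marker; otherwise the snap is fully trimmed, so
// record it in purged_snaps, persist that, and share the updated info
// with peers.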
9f95a23c
TL
15303 if (pg->snap_trimq_repeat.count(snap_to_trim)) {
15304 ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
15305 pg->snap_trimq_repeat.erase(snap_to_trim);
15306 } else {
15307 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15308 << " to purged_snaps"
15309 << dendl;
15310 ObjectStore::Transaction t;
15311 pg->recovery_state.adjust_purged_snaps(
15312 [snap_to_trim](auto &purged_snaps) {
15313 purged_snaps.insert(snap_to_trim);
15314 });
15315 pg->write_if_dirty(t);
15316
15317 ldout(pg->cct, 10) << "purged_snaps now "
15318 << pg->info.purged_snaps << ", snap_trimq now "
15319 << pg->snap_trimq << dendl;
15320
15321 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15322 ceph_assert(tr == 0);
7c673cae 15323
9f95a23c
TL
15324 pg->recovery_state.share_pg_info();
15325 }
7c673cae
FG
15326 post_event(KickTrim());
15327 return transit< NotTrimming >();
15328 }
11fdf7f2 15329 ceph_assert(!to_trim.empty());
7c673cae
FG
15330
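// Submit one trim repop per object. A write-lock conflict (-ENOLCK) is
// benign -- we wait for the lock in WaitRWLock; any other error sets
// PG_STATE_SNAPTRIM_ERROR. Either way, repops already in flight are
// allowed to drain first via WaitRepops.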
15331 for (auto &&object: to_trim) {
15332 // Get next
15333 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
224ce89b 15334 OpContextUPtr ctx;
9f95a23c 15335 int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
224ce89b
WB
15336 if (error) {
15337 if (error == -ENOLCK) {
15338 ldout(pg->cct, 10) << "could not get write lock on obj "
15339 << object << dendl;
15340 } else {
15341 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15342 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15343 }
15344 if (!in_flight.empty()) {
15345 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15346 return transit< WaitRepops >();
15347 }
15348 if (error == -ENOLCK) {
7c673cae
FG
15349 ldout(pg->cct, 10) << "waiting for it to clear"
15350 << dendl;
15351 return transit< WaitRWLock >();
7c673cae 15352 } else {
224ce89b 15353 return transit< NotTrimming >();
7c673cae
FG
15354 }
15355 }
15356
15357 in_flight.insert(object);
15358 ctx->register_on_success(
15359 [pg, object, &in_flight]() {
11fdf7f2 15360 ceph_assert(in_flight.find(object) != in_flight.end());
7c673cae 15361 in_flight.erase(object);
224ce89b
WB
15362 if (in_flight.empty()) {
15363 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15364 pg->snap_trimmer_machine.process_event(Reset());
15365 } else {
15366 pg->snap_trimmer_machine.process_event(RepopsComplete());
15367 }
15368 }
7c673cae
FG
15369 });
15370
15371 pg->simple_opc_submit(std::move(ctx));
15372 }
15373
15374 return transit< WaitRepops >();
15375}
15376
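// The *_maybe_cache attr helpers below exist because erasure-coded
// pools cannot read attrs directly from an arbitrary shard: reads are
// served from obc->attr_cache instead, while writes simply go into the
// transaction (the cache itself is presumably refreshed when the repop
// applies).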
15377void PrimaryLogPG::setattr_maybe_cache(
15378 ObjectContextRef obc,
7c673cae
FG
15379 PGTransaction *t,
15380 const string &key,
15381 bufferlist &val)
15382{
15383 t->setattr(obc->obs.oi.soid, key, val);
15384}
15385
15386void PrimaryLogPG::setattrs_maybe_cache(
15387 ObjectContextRef obc,
7c673cae
FG
15388 PGTransaction *t,
15389 map<string, bufferlist> &attrs)
15390{
15391 t->setattrs(obc->obs.oi.soid, attrs);
15392}
15393
15394void PrimaryLogPG::rmattr_maybe_cache(
15395 ObjectContextRef obc,
7c673cae
FG
15396 PGTransaction *t,
15397 const string &key)
15398{
15399 t->rmattr(obc->obs.oi.soid, key);
15400}
15401
15402int PrimaryLogPG::getattr_maybe_cache(
15403 ObjectContextRef obc,
15404 const string &key,
15405 bufferlist *val)
15406{
11fdf7f2 15407 if (pool.info.is_erasure()) {
7c673cae
FG
15408 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15409 if (i != obc->attr_cache.end()) {
15410 if (val)
15411 *val = i->second;
15412 return 0;
15413 } else {
15414 return -ENODATA;
15415 }
15416 }
15417 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15418}
15419
15420int PrimaryLogPG::getattrs_maybe_cache(
15421 ObjectContextRef obc,
b32b8144 15422 map<string, bufferlist> *out)
7c673cae
FG
15423{
15424 int r = 0;
11fdf7f2
TL
15425 ceph_assert(out);
15426 if (pool.info.is_erasure()) {
b32b8144 15427 *out = obc->attr_cache;
7c673cae
FG
15428 } else {
15429 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15430 }
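// User xattrs are stored with a leading '_' to keep them clear of
// internal keys (OI_ATTR is the bare "_", which the size > 1 test
// skips; SS_ATTR has no underscore). Strip the prefix so callers see
// the plain names; substr()'s length argument is clamped, so passing
// the full size is safe.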
b32b8144
FG
15431 map<string, bufferlist> tmp;
15432 for (map<string, bufferlist>::iterator i = out->begin();
15433 i != out->end();
15434 ++i) {
15435 if (i->first.size() > 1 && i->first[0] == '_')
15436 tmp[i->first.substr(1, i->first.size())].claim(i->second);
7c673cae 15437 }
b32b8144 15438 tmp.swap(*out);
7c673cae
FG
15439 return r;
15440}
15441
11fdf7f2
TL
15442bool PrimaryLogPG::check_failsafe_full() {
15443 return osd->check_failsafe_full(get_dpp());
7c673cae
FG
15444}
15445
15446void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15447void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15448
15449#ifdef PG_DEBUG_REFS
15450uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15451void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15452#endif
15453
15454void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15455void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }