1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17 #include "PrimaryLogPG.h"
18
19 #include <errno.h>
20
21 #include <charconv>
22 #include <sstream>
23 #include <utility>
24
25 #include <boost/intrusive_ptr.hpp>
26 #include <boost/tuple/tuple.hpp>
27
28 #include "PrimaryLogPG.h"
29
30 #include "cls/cas/cls_cas_ops.h"
31 #include "common/CDC.h"
32 #include "common/EventTrace.h"
33 #include "common/ceph_crypto.h"
34 #include "common/config.h"
35 #include "common/errno.h"
36 #include "common/perf_counters.h"
37 #include "common/scrub_types.h"
38 #include "include/compat.h"
39 #include "json_spirit/json_spirit_reader.h"
40 #include "json_spirit/json_spirit_value.h"
41 #include "messages/MCommandReply.h"
42 #include "messages/MOSDBackoff.h"
43 #include "messages/MOSDOp.h"
44 #include "messages/MOSDPGBackfill.h"
45 #include "messages/MOSDPGBackfillRemove.h"
46 #include "messages/MOSDPGLog.h"
47 #include "messages/MOSDPGScan.h"
48 #include "messages/MOSDPGTrim.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDRepScrub.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "mon/MonClient.h"
54 #include "objclass/objclass.h"
55 #include "osd/ClassHandler.h"
56 #include "osdc/Objecter.h"
57 #include "osd/scrubber/PrimaryLogScrub.h"
58 #include "osd/scrubber/ScrubStore.h"
59 #include "osd/scrubber/pg_scrubber.h"
60
61 #include "OSD.h"
62 #include "OpRequest.h"
63 #include "PG.h"
64 #include "Session.h"
65
66 // required includes order:
67 #include "json_spirit/json_spirit_value.h"
68 #include "json_spirit/json_spirit_reader.h"
69 #include "include/ceph_assert.h" // json_spirit clobbers it
70 #include "include/rados/rados_types.hpp"
71
72 #ifdef WITH_LTTNG
73 #include "tracing/osd.h"
74 #else
75 #define tracepoint(...)
76 #endif
77
78 #define dout_context cct
79 #define dout_subsys ceph_subsys_osd
80 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
81 #undef dout_prefix
82 #define dout_prefix _prefix(_dout, this)
83
84 #include "osd_tracer.h"
85
86 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
87
88 using std::less;
89 using std::list;
90 using std::ostream;
91 using std::pair;
92 using std::make_pair;
93 using std::make_unique;
94 using std::map;
95 using std::ostringstream;
96 using std::set;
97 using std::string;
98 using std::string_view;
99 using std::stringstream;
100 using std::unique_ptr;
101 using std::vector;
102
103 using ceph::bufferlist;
104 using ceph::bufferptr;
105 using ceph::Formatter;
106 using ceph::decode;
107 using ceph::decode_noclear;
108 using ceph::encode;
109 using ceph::encode_destructively;
110
111 using namespace ceph::osd::scheduler;
112 using TOPNSPC::common::cmd_getval;
113 using TOPNSPC::common::cmd_getval_or;
114
115 template <typename T>
116 static ostream& _prefix(std::ostream *_dout, T *pg) {
117 return pg->gen_prefix(*_dout);
118 }
119
120 /**
121 * The CopyCallback class defines an interface for completions to the
122 * copy_start code. Users of the copy infrastructure must implement
123 * one and give an instance of the class to start_copy.
124 *
125 * The implementer is responsible for making sure that the CopyCallback
126 * can associate itself with the correct copy operation.
127 */
128 class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
129 protected:
130 CopyCallback() {}
131 /**
132 * results.get<0>() is the return code: 0 for success; -ECANCELED if
133 * the operation was cancelled by the local OSD; -errno for other issues.
134 * results.get<1>() is a pointer to a CopyResults object, which you are
135 * responsible for deleting.
136 */
137 void finish(CopyCallbackResults results_) override = 0;
138
139 public:
140 /// Virtual destructor: implementations are destroyed via this base class
141 ~CopyCallback() override {}
142 };
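// A minimal sketch of a CopyCallback implementation, assuming only the
// contract documented above: finish() receives (return code, CopyResults*).
// The class name and body are hypothetical; compare CopyFromCallback below
// for the real in-tree user.
class ExampleCopyCallback : public PrimaryLogPG::CopyCallback {
  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    int r = results_.get<0>();
    PrimaryLogPG::CopyResults *res = results_.get<1>();
    if (r == 0) {
      // copy finished; consume fields of *res (e.g. object_size)
    } else if (r == -ECANCELED) {
      // cancelled by the local OSD; callers typically requeue or drop
    }
    (void)res; // silence unused-variable warnings in this sketch
  }
};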
143
144 template <typename T>
145 class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
146 PrimaryLogPGRef pg;
147 unique_ptr<GenContext<T>> c;
148 epoch_t e;
149 public:
150 BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
151 : pg(pg), c(c), e(e) {}
152 void finish(T t) override {
153 std::scoped_lock locker{*pg};
154 if (pg->pg_has_reset_since(e))
155 c.reset();
156 else
157 c.release()->complete(t);
158 }
159 bool sync_finish(T t) {
160 // we assume here all blessed/wrapped Contexts can complete synchronously.
161 c.release()->complete(t);
162 return true;
163 }
164 };
165
166 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
167 GenContext<ThreadPool::TPHandle&> *c) {
168 return new BlessedGenContext<ThreadPool::TPHandle&>(
169 this, c, get_osdmap_epoch());
170 }
171
172 template <typename T>
173 class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
174 PrimaryLogPGRef pg;
175 unique_ptr<GenContext<T>> c;
176 epoch_t e;
177 public:
178 UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
179 : pg(pg), c(c), e(e) {}
180 void finish(T t) override {
181 if (pg->pg_has_reset_since(e))
182 c.reset();
183 else
184 c.release()->complete(t);
185 }
186 bool sync_finish(T t) {
187 // we assume here all blessed/wrapped Contexts can complete synchronously.
188 c.release()->complete(t);
189 return true;
190 }
191 };
192
193 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
194 GenContext<ThreadPool::TPHandle&> *c) {
195 return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
196 this, c, get_osdmap_epoch());
197 }
198
199 class PrimaryLogPG::BlessedContext : public Context {
200 PrimaryLogPGRef pg;
201 unique_ptr<Context> c;
202 epoch_t e;
203 public:
204 BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
205 : pg(pg), c(c), e(e) {}
206 void finish(int r) override {
207 std::scoped_lock locker{*pg};
208 if (pg->pg_has_reset_since(e))
209 c.reset();
210 else
211 c.release()->complete(r);
212 }
213 bool sync_finish(int r) override {
214 // we assume here all blessed/wrapped Contexts can complete synchronously.
215 c.release()->complete(r);
216 return true;
217 }
218 };
219
220 Context *PrimaryLogPG::bless_context(Context *c) {
221 return new BlessedContext(this, c, get_osdmap_epoch());
222 }
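// A minimal sketch of how the blessing wrappers are meant to be used:
// wrap any completion that could fire after a PG reset, so a stale
// callback is dropped (c.reset()) rather than run against stale state.
// The helper name and callback body are hypothetical. BlessedContext
// takes the PG lock in finish(); the "Unlocked" GenContext variant above
// is for completions that must not take it.
[[maybe_unused]] static void example_register_blessed_commit_cb(
  PrimaryLogPG *pg, ObjectStore::Transaction *t)
{
  Context *cb = new LambdaContext([](int /* result */) {
    // runs only if the PG has not been reset since blessing
  });
  t->register_on_commit(pg->bless_context(cb));
}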
223
224 class PrimaryLogPG::C_PG_ObjectContext : public Context {
225 PrimaryLogPGRef pg;
226 ObjectContext *obc;
227 public:
228 C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
229 pg(p), obc(o) {}
230 void finish(int r) override {
231 pg->object_context_destructor_callback(obc);
232 }
233 };
234
235 struct OnReadComplete : public Context {
236 PrimaryLogPG *pg;
237 PrimaryLogPG::OpContext *opcontext;
238 OnReadComplete(
239 PrimaryLogPG *pg,
240 PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
241 void finish(int r) override {
242 opcontext->finish_read(pg);
243 }
244 ~OnReadComplete() override {}
245 };
246
247 class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
248 PrimaryLogPGRef pg;
249 ObjectContextRef obc;
250 public:
251 C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
252 pg(p), obc(o) {}
253 bool sync_finish(int r) override {
254 pg->_applied_recovered_object(obc);
255 return true;
256 }
257 void finish(int r) override {
258 std::scoped_lock locker{*pg};
259 pg->_applied_recovered_object(obc);
260 }
261 };
262
263 class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
264 PrimaryLogPGRef pg;
265 epoch_t epoch;
266 eversion_t last_complete;
267 public:
268 C_OSD_CommittedPushedObject(
269 PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
270 pg(p), epoch(epoch), last_complete(lc) {
271 }
272 void finish(int r) override {
273 pg->_committed_pushed_object(epoch, last_complete);
274 }
275 };
276
277 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
278 PrimaryLogPGRef pg;
279 public:
280 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
281 pg(p) {}
282 bool sync_finish(int r) override {
283 pg->_applied_recovered_object_replica();
284 return true;
285 }
286 void finish(int r) override {
287 std::scoped_lock locker{*pg};
288 pg->_applied_recovered_object_replica();
289 }
290 };
291
292 // OpContext
293 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
294 {
295 inflightreads = 1;
296 list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
297 pair<bufferlist*, Context*> > > in;
298 in.swap(pending_async_reads);
299 pg->pgbackend->objects_read_async(
300 obc->obs.oi.soid,
301 in,
302 new OnReadComplete(pg, this), pg->get_pool().fast_read);
303 }
304 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
305 {
306 ceph_assert(inflightreads > 0);
307 --inflightreads;
308 if (async_reads_complete()) {
309 ceph_assert(pg->in_progress_async_reads.size());
310 ceph_assert(pg->in_progress_async_reads.front().second == this);
311 pg->in_progress_async_reads.pop_front();
312
313 // Restart the op context now that all reads have been
314 // completed. Read failures will be handled by the op finisher
315 pg->execute_ctx(this);
316 }
317 }
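// A minimal sketch of staging a read for the async path above, assuming a
// caller that already holds an OpContext: each pending_async_reads entry
// pairs an (offset, length, op_flags) extent with a destination bufferlist
// and a per-extent completion. The helper name and literal values are
// hypothetical; the types mirror start_async_reads().
[[maybe_unused]] static void example_stage_async_read(
  PrimaryLogPG::OpContext *ctx, bufferlist *out, Context *on_extent)
{
  ctx->pending_async_reads.push_back(
    make_pair(
      boost::tuple<uint64_t, uint64_t, unsigned>(0, 4096, 0), // off, len, flags
      make_pair(out, on_extent)));
}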
318
319 class CopyFromCallback : public PrimaryLogPG::CopyCallback {
320 public:
321 PrimaryLogPG::CopyResults *results = nullptr;
322 PrimaryLogPG::OpContext *ctx;
323 OSDOp &osd_op;
324 uint32_t truncate_seq;
325 uint64_t truncate_size;
326 bool have_truncate = false;
327
328 CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
329 : ctx(ctx), osd_op(osd_op) {
330 }
331 ~CopyFromCallback() override {}
332
333 void finish(PrimaryLogPG::CopyCallbackResults results_) override {
334 results = results_.get<1>();
335 int r = results_.get<0>();
336
337 // Only use truncate_{seq,size} from the original object if the client
338 // did not send us these parameters
339 if (!have_truncate) {
340 truncate_seq = results->truncate_seq;
341 truncate_size = results->truncate_size;
342 }
343
344 // for finish_copyfrom
345 ctx->user_at_version = results->user_version;
346
347 if (r >= 0) {
348 ctx->pg->execute_ctx(ctx);
349 } else {
350 if (r != -ECANCELED) { // on cancel just toss it out; client resends
351 if (ctx->op)
352 ctx->pg->osd->reply_op_error(ctx->op, r);
353 } else if (results->should_requeue) {
354 if (ctx->op)
355 ctx->pg->requeue_op(ctx->op);
356 }
357 ctx->pg->close_op_ctx(ctx);
358 }
359 }
360
361 bool is_temp_obj_used() {
362 return results->started_temp_obj;
363 }
364 uint64_t get_data_size() {
365 return results->object_size;
366 }
367 void set_truncate(uint32_t seq, uint64_t size) {
368 truncate_seq = seq;
369 truncate_size = size;
370 have_truncate = true;
371 }
372 };
373
374 struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
375 CopyFromCallback *copy_from_callback;
376
377 explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
378 : copy_from_callback(copy_from_callback) {
379 }
380
381 int execute() override {
382 // instance will be destructed after this method completes
383 copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
384 return 0;
385 }
386 };
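// Note on the two-phase flow above: CopyFromCallback::finish() runs once the
// copy data has arrived and re-enters execute_ctx(); the CopyFromFinisher
// registered for the op then runs execute(), which applies the buffered data
// via finish_copyfrom(). On -ECANCELED the client op is requeued or dropped
// instead, and the op context is closed.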
387
388 // ======================
389 // PGBackend::Listener
390
391 void PrimaryLogPG::on_local_recover(
392 const hobject_t &hoid,
393 const ObjectRecoveryInfo &_recovery_info,
394 ObjectContextRef obc,
395 bool is_delete,
396 ObjectStore::Transaction *t
397 )
398 {
399 dout(10) << __func__ << ": " << hoid << dendl;
400
401 ObjectRecoveryInfo recovery_info(_recovery_info);
402 clear_object_snap_mapping(t, hoid);
403 if (!is_delete && recovery_info.soid.is_snap()) {
404 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
405 set<snapid_t> snaps;
406 dout(20) << " snapset " << recovery_info.ss << dendl;
407 auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
408 if (p != recovery_info.ss.clone_snaps.end()) {
409 snaps.insert(p->second.begin(), p->second.end());
410 dout(20) << " snaps " << snaps << dendl;
411 snap_mapper.add_oid(
412 recovery_info.soid,
413 snaps,
414 &_t);
415 } else {
416 derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
417 }
418 }
419 if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
420 recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
421 ceph_assert(is_primary());
422 const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
423 if (latest->op == pg_log_entry_t::LOST_REVERT &&
424 latest->reverting_to == recovery_info.version) {
425 dout(10) << " got old revert version " << recovery_info.version
426 << " for " << *latest << dendl;
427 recovery_info.version = latest->version;
428 // update the attr to the revert event version
429 recovery_info.oi.prior_version = recovery_info.oi.version;
430 recovery_info.oi.version = latest->version;
431 bufferlist bl;
432 encode(recovery_info.oi, bl,
433 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
434 ceph_assert(!pool.info.is_erasure());
435 t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
436 if (obc)
437 obc->attr_cache[OI_ATTR] = bl;
438 }
439 }
440
441 // keep track of active pushes for scrub
442 ++active_pushes;
443
444 recovery_state.recover_got(
445 recovery_info.soid,
446 recovery_info.version,
447 is_delete,
448 *t);
449
450 if (is_primary()) {
451 if (!is_delete) {
452 obc->obs.exists = true;
453
454 bool got = obc->get_recovery_read();
455 ceph_assert(got);
456
457 ceph_assert(recovering.count(obc->obs.oi.soid));
458 recovering[obc->obs.oi.soid] = obc;
459 obc->obs.oi = recovery_info.oi; // may have been updated above
460 }
461
462 t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
463
464 publish_stats_to_osd();
465 release_backoffs(hoid);
466 if (!is_unreadable_object(hoid)) {
467 auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
468 if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
469 dout(20) << " kicking unreadable waiters on " << hoid << dendl;
470 requeue_ops(unreadable_object_entry->second);
471 waiting_for_unreadable_object.erase(unreadable_object_entry);
472 }
473 }
474 } else {
475 t->register_on_applied(
476 new C_OSD_AppliedRecoveredObjectReplica(this));
477
478 }
479
480 t->register_on_commit(
481 new C_OSD_CommittedPushedObject(
482 this,
483 get_osdmap_epoch(),
484 info.last_complete));
485 }
486
487 void PrimaryLogPG::on_global_recover(
488 const hobject_t &soid,
489 const object_stat_sum_t &stat_diff,
490 bool is_delete)
491 {
492 recovery_state.object_recovered(soid, stat_diff);
493 publish_stats_to_osd();
494 dout(10) << "pushed " << soid << " to all replicas" << dendl;
495 auto i = recovering.find(soid);
496 ceph_assert(i != recovering.end());
497
498 if (i->second && i->second->rwstate.recovery_read_marker) {
499 // a recover of a missing object won't have had an obc, but one gets
500 // filled in during on_local_recover
501 ceph_assert(i->second);
502 list<OpRequestRef> requeue_list;
503 i->second->drop_recovery_read(&requeue_list);
504 requeue_ops(requeue_list);
505 }
506
507 backfills_in_flight.erase(soid);
508
509 recovering.erase(i);
510 finish_recovery_op(soid);
511 release_backoffs(soid);
512 auto degraded_object_entry = waiting_for_degraded_object.find(soid);
513 if (degraded_object_entry != waiting_for_degraded_object.end()) {
514 dout(20) << " kicking degraded waiters on " << soid << dendl;
515 requeue_ops(degraded_object_entry->second);
516 waiting_for_degraded_object.erase(degraded_object_entry);
517 }
518 auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
519 if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
520 dout(20) << " kicking unreadable waiters on " << soid << dendl;
521 requeue_ops(unreadable_object_entry->second);
522 waiting_for_unreadable_object.erase(unreadable_object_entry);
523 }
524 finish_degraded_object(soid);
525 }
526
527 void PrimaryLogPG::schedule_recovery_work(
528 GenContext<ThreadPool::TPHandle&> *c)
529 {
530 osd->queue_recovery_context(this, c);
531 }
532
533 void PrimaryLogPG::replica_clear_repop_obc(
534 const vector<pg_log_entry_t> &logv,
535 ObjectStore::Transaction &t)
536 {
537 for (auto &&e: logv) {
538 /* Have to blast all clones, they share a snapset */
539 object_contexts.clear_range(
540 e.soid.get_object_boundary(), e.soid.get_head());
541 ceph_assert(
542 snapset_contexts.find(e.soid.get_head()) ==
543 snapset_contexts.end());
544 }
545 }
546
547 bool PrimaryLogPG::should_send_op(
548 pg_shard_t peer,
549 const hobject_t &hoid) {
550 if (peer == get_primary())
551 return true;
552 ceph_assert(recovery_state.has_peer_info(peer));
553 bool should_send =
554 hoid.pool != (int64_t)info.pgid.pool() ||
555 hoid <= last_backfill_started ||
556 hoid <= recovery_state.get_peer_info(peer).last_backfill;
557 if (!should_send) {
558 ceph_assert(is_backfill_target(peer));
559 dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
560 << ", object " << hoid
561 << " beyond std::max(last_backfill_started "
562 << ", peer_info[peer].last_backfill "
563 << recovery_state.get_peer_info(peer).last_backfill
564 << ")" << dendl;
565 return should_send;
566 }
567 if (is_async_recovery_target(peer) &&
568 recovery_state.get_peer_missing(peer).is_missing(hoid)) {
569 should_send = false;
570 dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
571 << ", object " << hoid
572 << " which is pending recovery in async_recovery_targets" << dendl;
573 }
574 return should_send;
575 }
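// Worked example with hypothetical values: let the peer's last_backfill be B
// and last_backfill_started be S. A repop on object H is sent in full iff
// H <= S, H <= B, or H belongs to a different pool; otherwise the peer
// cannot have H yet and the caller ships an empty repop. An object that is
// merely pending async recovery on the peer likewise suppresses the full op.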
576
577
578 ConnectionRef PrimaryLogPG::get_con_osd_cluster(
579 int peer, epoch_t from_epoch)
580 {
581 return osd->get_con_osd_cluster(peer, from_epoch);
582 }
583
584 PerfCounters *PrimaryLogPG::get_logger()
585 {
586 return osd->logger;
587 }
588
589
590 // ====================
591 // missing objects
592
593 bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
594 {
595 return recovery_state.get_pg_log().get_missing().get_items().count(soid);
596 }
597
598 void PrimaryLogPG::maybe_kick_recovery(
599 const hobject_t &soid)
600 {
601 eversion_t v;
602 bool work_started = false;
603 if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
604 return;
605
606 map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
607 if (p != recovering.end()) {
608 dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
609 } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
610 dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
611 } else {
612 dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
613 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
614 if (is_missing_object(soid)) {
615 recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
616 } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
617 prep_object_replica_deletes(soid, v, h, &work_started);
618 } else {
619 prep_object_replica_pushes(soid, v, h, &work_started);
620 }
621 pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
622 }
623 }
624
625 void PrimaryLogPG::wait_for_unreadable_object(
626 const hobject_t& soid, OpRequestRef op)
627 {
628 ceph_assert(is_unreadable_object(soid));
629 maybe_kick_recovery(soid);
630 waiting_for_unreadable_object[soid].push_back(op);
631 op->mark_delayed("waiting for missing object");
632 }
633
634 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
635 {
636 /* The conditions below may clear (on_local_recover, before we queue
637 * the transaction) before we actually requeue the degraded waiters
638 * in on_global_recover after the transaction completes.
639 */
640 if (waiting_for_degraded_object.count(soid))
641 return true;
642 if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
643 return true;
644 ceph_assert(!get_acting_recovery_backfill().empty());
645 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
646 i != get_acting_recovery_backfill().end();
647 ++i) {
648 if (*i == get_primary()) continue;
649 pg_shard_t peer = *i;
650 auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
651 // If an object is missing on an async_recovery_target, return false.
652 // This will not block the op and the object is async recovered later.
653 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
654 peer_missing_entry->second.get_items().count(soid)) {
655 if (is_async_recovery_target(peer))
656 continue;
657 else
658 return true;
659 }
660 // Object is degraded if after last_backfill AND
661 // we are backfilling it
662 if (is_backfill_target(peer) &&
663 recovery_state.get_peer_info(peer).last_backfill <= soid &&
664 last_backfill_started >= soid &&
665 backfills_in_flight.count(soid))
666 return true;
667 }
668 return false;
669 }
670
671 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
672 {
673 for (auto &i: get_async_recovery_targets()) {
674 auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
675 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
676 peer_missing_entry->second.get_items().count(soid)) {
677 dout(30) << __func__ << " " << soid << dendl;
678 return true;
679 }
680 }
681 return false;
682 }
683
684 void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
685 {
686 ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));
687
688 maybe_kick_recovery(soid);
689 waiting_for_degraded_object[soid].push_back(op);
690 op->mark_delayed("waiting for degraded object");
691 }
692
693 void PrimaryLogPG::block_write_on_full_cache(
694 const hobject_t& _oid, OpRequestRef op)
695 {
696 const hobject_t oid = _oid.get_head();
697 dout(20) << __func__ << ": blocking object " << oid
698 << " on full cache" << dendl;
699 objects_blocked_on_cache_full.insert(oid);
700 waiting_for_cache_not_full.push_back(op);
701 op->mark_delayed("waiting for cache not full");
702 }
703
704 void PrimaryLogPG::block_for_clean(
705 const hobject_t& oid, OpRequestRef op)
706 {
707 dout(20) << __func__ << ": blocking object " << oid
708 << " on primary repair" << dendl;
709 waiting_for_clean_to_primary_repair.push_back(op);
710 op->mark_delayed("waiting for clean to repair");
711 }
712
713 void PrimaryLogPG::block_write_on_snap_rollback(
714 const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
715 {
716 dout(20) << __func__ << ": blocking object " << oid.get_head()
717 << " on snap promotion " << obc->obs.oi.soid << dendl;
718 // otherwise, we'd have blocked in do_op
719 ceph_assert(oid.is_head());
720 ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
721 /*
722 * We block the head object here.
723 *
724 * Suppose a read races with a rollback of the head object.
725 * Because two different ops can trigger promote_object() with the same
726 * source, they can keep cancelling each other, producing an infinite loop.
727 * To avoid this, we block the head object during the rollback,
728 * so the racing read is blocked until the rollback completes.
729 * see also: https://tracker.ceph.com/issues/49726
730 */
731 ObjectContextRef head_obc = get_object_context(oid, false);
732 head_obc->start_block();
733 objects_blocked_on_snap_promotion[oid] = obc;
734 wait_for_blocked_object(obc->obs.oi.soid, op);
735 }
736
737 void PrimaryLogPG::block_write_on_degraded_snap(
738 const hobject_t& snap, OpRequestRef op)
739 {
740 dout(20) << __func__ << ": blocking object " << snap.get_head()
741 << " on degraded snap " << snap << dendl;
742 // otherwise, we'd have blocked in do_op
743 ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
744 objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
745 wait_for_degraded_object(snap, op);
746 }
747
748 bool PrimaryLogPG::maybe_await_blocked_head(
749 const hobject_t &hoid,
750 OpRequestRef op)
751 {
752 ObjectContextRef obc;
753 obc = object_contexts.lookup(hoid.get_head());
754 if (obc) {
755 if (obc->is_blocked()) {
756 wait_for_blocked_object(obc->obs.oi.soid, op);
757 return true;
758 } else {
759 return false;
760 }
761 }
762 return false;
763 }
764
765 void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
766 {
767 dout(10) << __func__ << " " << soid << " " << op << dendl;
768 waiting_for_blocked_object[soid].push_back(op);
769 op->mark_delayed("waiting for blocked object");
770 }
771
772 void PrimaryLogPG::maybe_force_recovery()
773 {
774 // no force if not in degraded/recovery/backfill states
775 if (!is_degraded() &&
776 !state_test(PG_STATE_RECOVERING |
777 PG_STATE_RECOVERY_WAIT |
778 PG_STATE_BACKFILLING |
779 PG_STATE_BACKFILL_WAIT |
780 PG_STATE_BACKFILL_TOOFULL))
781 return;
782
783 if (recovery_state.get_pg_log().get_log().approx_size() <
784 cct->_conf->osd_max_pg_log_entries *
785 cct->_conf->osd_force_recovery_pg_log_entries_factor)
786 return;
787
788 // find the oldest missing object
789 version_t min_version = recovery_state.get_pg_log().get_log().head.version;
790 hobject_t soid;
791 if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
792 min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
793 soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
794 }
795 ceph_assert(!get_acting_recovery_backfill().empty());
796 for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
797 it != get_acting_recovery_backfill().end();
798 ++it) {
799 if (*it == get_primary()) continue;
800 pg_shard_t peer = *it;
801 auto it_missing = recovery_state.get_peer_missing().find(peer);
802 if (it_missing != recovery_state.get_peer_missing().end() &&
803 !it_missing->second.get_rmissing().empty()) {
804 const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
805 dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
806 << " oid " << min_obj->second << dendl;
807 if (min_version > min_obj->first) {
808 min_version = min_obj->first;
809 soid = min_obj->second;
810 }
811 }
812 }
813
814 // recover it
815 if (soid != hobject_t())
816 maybe_kick_recovery(soid);
817 }
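// Worked example for the threshold above, assuming the common defaults of
// osd_max_pg_log_entries = 10000 and
// osd_force_recovery_pg_log_entries_factor = 1.3 (verify for your release):
// once the PG log grows past roughly 13000 entries, the oldest missing
// object across the primary and its peers is kicked into recovery.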
818
819 bool PrimaryLogPG::check_laggy(OpRequestRef& op)
820 {
821 assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(),
822 SERVER_OCTOPUS));
823 if (state_test(PG_STATE_WAIT)) {
824 dout(10) << __func__ << " PG is WAIT state" << dendl;
825 } else if (!state_test(PG_STATE_LAGGY)) {
826 auto mnow = osd->get_mnow();
827 auto ru = recovery_state.get_readable_until();
828 if (mnow <= ru) {
829 // not laggy
830 return true;
831 }
832 dout(10) << __func__
833 << " mnow " << mnow
834 << " > readable_until " << ru << dendl;
835
836 if (!is_primary()) {
837 osd->reply_op_error(op, -EAGAIN);
838 return false;
839 }
840
841 // go to laggy state
842 state_set(PG_STATE_LAGGY);
843 publish_stats_to_osd();
844 }
845 dout(10) << __func__ << " not readable" << dendl;
846 waiting_for_readable.push_back(op);
847 op->mark_delayed("waiting for readable");
848 return false;
849 }
850
851 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
852 {
853 assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(),
854 SERVER_OCTOPUS));
855 if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
856 return true; // not laggy
857 }
858 dout(10) << __func__ << " not readable" << dendl;
859 waiting_for_readable.push_front(op);
860 op->mark_delayed("waiting for readable");
861 return false;
862 }
863
864 void PrimaryLogPG::recheck_readable()
865 {
866 if (!is_wait() && !is_laggy()) {
867 dout(20) << __func__ << " wasn't wait or laggy" << dendl;
868 return;
869 }
870 auto mnow = osd->get_mnow();
871 bool pub = false;
872 if (is_wait()) {
873 auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
874 if (mnow < prior_readable_until_ub) {
875 dout(10) << __func__ << " still wait (mnow " << mnow
876 << " < prior_readable_until_ub " << prior_readable_until_ub
877 << ")" << dendl;
878 } else {
879 dout(10) << __func__ << " no longer wait (mnow " << mnow
880 << " >= prior_readable_until_ub " << prior_readable_until_ub
881 << ")" << dendl;
882 state_clear(PG_STATE_WAIT);
883 recovery_state.clear_prior_readable_until_ub();
884 pub = true;
885 }
886 }
887 if (is_laggy()) {
888 auto ru = recovery_state.get_readable_until();
889 if (ru == ceph::signedspan::zero()) {
890 dout(10) << __func__ << " still laggy (mnow " << mnow
891 << ", readable_until zero)" << dendl;
892 } else if (mnow >= ru) {
893 dout(10) << __func__ << " still laggy (mnow " << mnow
894 << " >= readable_until " << ru << ")" << dendl;
895 } else {
896 dout(10) << __func__ << " no longer laggy (mnow " << mnow
897 << " < readable_until " << ru << ")" << dendl;
898 state_clear(PG_STATE_LAGGY);
899 pub = true;
900 }
901 }
902 if (pub) {
903 publish_stats_to_osd();
904 }
905 if (!is_laggy() && !is_wait()) {
906 requeue_ops(waiting_for_readable);
907 }
908 }
909
910 bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
911 {
912 bufferlist bl;
913
914 // If filter has expressed an interest in an xattr, load it.
915 if (!filter.get_xattr().empty()) {
916 int ret = pgbackend->objects_get_attr(
917 sobj,
918 filter.get_xattr(),
919 &bl);
920 dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
921 if (ret < 0) {
922 if (ret != -ENODATA || filter.reject_empty_xattr()) {
923 return false;
924 }
925 }
926 }
927
928 return filter.filter(sobj, bl);
929 }
930
931 std::pair<int, std::unique_ptr<const PGLSFilter>>
932 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
933 {
934 string type;
935 // storing non-const PGLSFilter for the sake of ::init()
936 std::unique_ptr<PGLSFilter> filter;
937
938 try {
939 decode(type, iter);
940 }
941 catch (ceph::buffer::error& e) {
942 return { -EINVAL, nullptr };
943 }
944
945 if (type.compare("plain") == 0) {
946 filter = std::make_unique<PGLSPlainFilter>();
947 } else {
948 std::size_t dot = type.find('.');
949 if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
950 return { -EINVAL, nullptr };
951 }
952
953 const std::string class_name = type.substr(0, dot);
954 const std::string filter_name = type.substr(dot + 1);
955 ClassHandler::ClassData *cls = NULL;
956 int r = ClassHandler::get_instance().open_class(class_name, &cls);
957 if (r != 0) {
958 derr << "Error opening class '" << class_name << "': "
959 << cpp_strerror(r) << dendl;
960 if (r != -EPERM) // propagate permission error
961 r = -EINVAL;
962 return { r, nullptr };
963 } else {
964 ceph_assert(cls);
965 }
966
967 ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
968 if (class_filter == NULL) {
969 derr << "Error finding filter '" << filter_name << "' in class "
970 << class_name << dendl;
971 return { -EINVAL, nullptr };
972 }
973 filter.reset(class_filter->fn());
974 if (!filter) {
975 // Object classes are obliged to return us something, but let's
976 // give an error rather than asserting out.
977 derr << "Buggy class " << class_name << " failed to construct "
978 "filter " << filter_name << dendl;
979 return { -EINVAL, nullptr };
980 }
981 }
982
983 ceph_assert(filter);
984 int r = filter->init(iter);
985 if (r < 0) {
986 derr << "Error initializing filter " << type << ": "
987 << cpp_strerror(r) << dendl;
988 return { -EINVAL, nullptr };
989 } else {
990 // Successfully constructed and initialized, return it.
991 return std::make_pair(0, std::move(filter));
992 }
993 }
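// A minimal sketch of the client-side payload decoded above: a type string
// followed by parameters consumed by filter->init(). That the "plain" filter
// expects an xattr name plus a value to match is an assumption inferred from
// the PGLSFilter accessors used in pgls_filter(), not a documented wire
// format; the xattr name and value below are hypothetical.
[[maybe_unused]] static bufferlist example_build_pgls_filter_payload()
{
  bufferlist bl;
  encode(std::string("plain"), bl);      // filter type
  encode(std::string("_category"), bl);  // xattr to inspect (hypothetical)
  bufferlist val;
  val.append("gold");
  encode(val, bl);                       // value to match (hypothetical)
  return bl;
}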
994
995
996 // ==========================================================
997
998 void PrimaryLogPG::do_command(
999 const string_view& orig_prefix,
1000 const cmdmap_t& cmdmap,
1001 const bufferlist& idata,
1002 std::function<void(int,const std::string&,bufferlist&)> on_finish)
1003 {
1004 string format;
1005 cmd_getval(cmdmap, "format", format);
1006 std::unique_ptr<Formatter> f(Formatter::create(
1007 format, "json-pretty", "json-pretty"));
1008 int ret = 0;
1009 stringstream ss; // stderr error message stream
1010 bufferlist outbl; // if empty at end, we'll dump formatter as output
1011
1012 // get final prefix:
1013 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
1014 // - ceph tell <pgid> foo -> prefix=foo
1015 string prefix(orig_prefix);
1016 string command;
1017 cmd_getval(cmdmap, "cmd", command);
1018 if (command.size()) {
1019 prefix = command;
1020 }
1021
1022 if (prefix == "query") {
1023 f->open_object_section("pg");
1024 f->dump_stream("snap_trimq") << snap_trimq;
1025 f->dump_unsigned("snap_trimq_len", snap_trimq.size());
1026 recovery_state.dump_peering_state(f.get());
1027
1028 f->open_array_section("recovery_state");
1029 handle_query_state(f.get());
1030 f->close_section();
1031
1032 if (is_primary() && is_active() && m_scrubber) {
1033 m_scrubber->dump_scrubber(f.get(), m_planned_scrub);
1034 }
1035
1036 f->open_object_section("agent_state");
1037 if (agent_state)
1038 agent_state->dump(f.get());
1039 f->close_section();
1040
1041 f->close_section();
1042 }
1043
1044 else if (prefix == "mark_unfound_lost") {
1045 string mulcmd;
1046 cmd_getval(cmdmap, "mulcmd", mulcmd);
1047 int mode = -1;
1048 if (mulcmd == "revert") {
1049 if (pool.info.is_erasure()) {
1050 ss << "mode must be 'delete' for ec pool";
1051 ret = -EINVAL;
1052 goto out;
1053 }
1054 mode = pg_log_entry_t::LOST_REVERT;
1055 } else if (mulcmd == "delete") {
1056 mode = pg_log_entry_t::LOST_DELETE;
1057 } else {
1058 ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1059 ret = -EINVAL;
1060 goto out;
1061 }
1062 ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
1063 mode == pg_log_entry_t::LOST_DELETE);
1064
1065 if (!is_primary()) {
1066 ss << "not primary";
1067 ret = -EROFS;
1068 goto out;
1069 }
1070
1071 uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
1072 if (!unfound) {
1073 ss << "pg has no unfound objects";
1074 goto out; // make command idempotent
1075 }
1076
1077 if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
1078 ss << "pg has " << unfound
1079 << " unfound objects but we haven't probed all sources, not marking lost";
1080 ret = -EINVAL;
1081 goto out;
1082 }
1083
1084 mark_all_unfound_lost(mode, on_finish);
1085 return;
1086 }
1087
1088 else if (prefix == "list_unfound") {
1089 hobject_t offset;
1090 string offset_json;
1091 bool show_offset = false;
1092 if (cmd_getval(cmdmap, "offset", offset_json)) {
1093 json_spirit::Value v;
1094 try {
1095 if (!json_spirit::read(offset_json, v))
1096 throw std::runtime_error("bad json");
1097 offset.decode(v);
1098 } catch (std::runtime_error& e) {
1099 ss << "error parsing offset: " << e.what();
1100 ret = -EINVAL;
1101 goto out;
1102 }
1103 show_offset = true;
1104 }
1105 f->open_object_section("missing");
1106 if (show_offset) {
1107 f->open_object_section("offset");
1108 offset.dump(f.get());
1109 f->close_section();
1110 }
1111 auto &needs_recovery_map = recovery_state.get_missing_loc()
1112 .get_needs_recovery();
1113 f->dump_int("num_missing", needs_recovery_map.size());
1114 f->dump_int("num_unfound", get_num_unfound());
1115 map<hobject_t, pg_missing_item>::const_iterator p =
1116 needs_recovery_map.upper_bound(offset);
1117 {
1118 f->open_array_section("objects");
1119 int32_t num = 0;
1120 for (; p != needs_recovery_map.end() &&
1121 num < cct->_conf->osd_command_max_records;
1122 ++p) {
1123 if (recovery_state.get_missing_loc().is_unfound(p->first)) {
1124 f->open_object_section("object");
1125 {
1126 f->open_object_section("oid");
1127 p->first.dump(f.get());
1128 f->close_section();
1129 }
1130 p->second.dump(f.get()); // have, need keys
1131 {
1132 f->open_array_section("locations");
1133 for (auto &&r : recovery_state.get_missing_loc().get_locations(
1134 p->first)) {
1135 f->dump_stream("shard") << r;
1136 }
1137 f->close_section();
1138 }
1139 f->close_section();
1140 num++;
1141 }
1142 }
1143 f->close_section();
1144 }
1145 // Get possible locations of missing objects from pg information
1146 PeeringState::QueryUnfound q(f.get());
1147 recovery_state.handle_event(q, 0);
1148 f->dump_bool("more", p != needs_recovery_map.end());
1149 f->close_section();
1150 }
1151
1152 else if (prefix == "scrub" ||
1153 prefix == "deep_scrub") {
1154 bool deep = (prefix == "deep_scrub");
1155 int64_t time = cmd_getval_or<int64_t>(cmdmap, "time", 0);
1156
1157 if (is_primary()) {
1158 const pg_pool_t *p = &pool.info;
1159 double pool_scrub_max_interval = 0;
1160 double scrub_max_interval;
1161 if (deep) {
1162 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
1163 scrub_max_interval = pool_scrub_max_interval > 0 ?
1164 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
1165 } else {
1166 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
1167 scrub_max_interval = pool_scrub_max_interval > 0 ?
1168 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
1169 }
1170 // Instead of marking must_scrub, force a scheduled scrub
1171 utime_t stamp = ceph_clock_now();
1172 if (time == 0)
1173 stamp -= scrub_max_interval;
1174 else
1175 stamp -= (float)time;
1176 stamp -= 100.0; // push back last scrub more for good measure
1177 if (deep) {
1178 set_last_deep_scrub_stamp(stamp);
1179 }
1180 set_last_scrub_stamp(stamp); // for 'deep' as well, as we use this value to order scrubs
1181 f->open_object_section("result");
1182 f->dump_bool("deep", deep);
1183 f->dump_stream("stamp") << stamp;
1184 f->close_section();
1185 } else {
1186 ss << "Not primary";
1187 ret = -EPERM;
1188 }
1189 outbl.append(ss.str());
1190 }
1191
1192 else if (prefix == "block" || prefix == "unblock" || prefix == "set" ||
1193 prefix == "unset") {
1194 string value;
1195 cmd_getval(cmdmap, "value", value);
1196
1197 if (is_primary()) {
1198 ret = m_scrubber->asok_debug(prefix, value, f.get(), ss);
1199 f->open_object_section("result");
1200 f->dump_bool("success", true);
1201 f->close_section();
1202 } else {
1203 ss << "Not primary";
1204 ret = -EPERM;
1205 }
1206 outbl.append(ss.str());
1207 }
1208 else {
1209 ret = -ENOSYS;
1210 ss << "prefix '" << prefix << "' not implemented";
1211 }
1212
1213 out:
1214 if (ret >= 0 && outbl.length() == 0) {
1215 f->flush(outbl);
1216 }
1217 on_finish(ret, ss.str(), outbl);
1218 }
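// Example invocations that land in do_command() (the pgid 1.2f is
// illustrative):
//   ceph tell 1.2f query
//   ceph pg 1.2f list_unfound
//   ceph pg 1.2f mark_unfound_lost revert
//   ceph tell 1.2f deep_scrub
// The reply is the flushed formatter (JSON by default) unless the handler
// wrote plain text into outbl, as the scrub branches above do.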
1219
1220
1221 // ==========================================================
1222
1223 void PrimaryLogPG::do_pg_op(OpRequestRef op)
1224 {
1225 const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
1226 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1227 dout(10) << "do_pg_op " << *m << dendl;
1228
1229 op->mark_started();
1230
1231 int result = 0;
1232 string cname, mname;
1233
1234 snapid_t snapid = m->get_snapid();
1235
1236 vector<OSDOp> ops = m->ops;
1237
1238 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
1239 std::unique_ptr<const PGLSFilter> filter;
1240 OSDOp& osd_op = *p;
1241 auto bp = p->indata.cbegin();
1242 switch (p->op.op) {
1243 case CEPH_OSD_OP_PGNLS_FILTER:
1244 try {
1245 decode(cname, bp);
1246 decode(mname, bp);
1247 }
1248 catch (const ceph::buffer::error& e) {
1249 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1250 result = -EINVAL;
1251 break;
1252 }
1253 std::tie(result, filter) = get_pgls_filter(bp);
1254 if (result < 0)
1255 break;
1256
1257 ceph_assert(filter);
1258
1259 // fall through
1260
1261 case CEPH_OSD_OP_PGNLS:
1262 if (snapid != CEPH_NOSNAP) {
1263 result = -EINVAL;
1264 break;
1265 }
1266 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1267 dout(10) << " pgnls pg=" << m->get_pg()
1268 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1269 << " != " << info.pgid << dendl;
1270 result = 0; // hmm?
1271 } else {
1272 unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1273 p->op.pgls.count);
1274
1275 dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
1276 << dendl;
1277 // read into a buffer
1278 vector<hobject_t> sentries;
1279 pg_nls_response_t response;
1280 try {
1281 decode(response.handle, bp);
1282 }
1283 catch (const ceph::buffer::error& e) {
1284 dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
1285 result = -EINVAL;
1286 break;
1287 }
1288
1289 hobject_t next;
1290 hobject_t lower_bound = response.handle;
1291 hobject_t pg_start = info.pgid.pgid.get_hobj_start();
1292 hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1293 dout(10) << " pgnls lower_bound " << lower_bound
1294 << " pg_end " << pg_end << dendl;
1295 if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
1296 (lower_bound != hobject_t() && lower_bound < pg_start))) {
1297 // this should only happen with a buggy client.
1298 dout(10) << "outside of PG bounds " << pg_start << " .. "
1299 << pg_end << dendl;
1300 result = -EINVAL;
1301 break;
1302 }
1303
1304 hobject_t current = lower_bound;
1305 int r = pgbackend->objects_list_partial(
1306 current,
1307 list_size,
1308 list_size,
1309 &sentries,
1310 &next);
1311 if (r != 0) {
1312 result = -EINVAL;
1313 break;
1314 }
1315
1316 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1317 recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
1318 vector<hobject_t>::iterator ls_iter = sentries.begin();
1319 hobject_t _max = hobject_t::get_max();
1320 while (1) {
1321 const hobject_t &mcand =
1322 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
1323 _max :
1324 missing_iter->first;
1325 const hobject_t &lcand =
1326 ls_iter == sentries.end() ?
1327 _max :
1328 *ls_iter;
1329
1330 hobject_t candidate;
1331 if (mcand == lcand) {
1332 candidate = mcand;
1333 if (!mcand.is_max()) {
1334 ++ls_iter;
1335 ++missing_iter;
1336 }
1337 } else if (mcand < lcand) {
1338 candidate = mcand;
1339 ceph_assert(!mcand.is_max());
1340 ++missing_iter;
1341 } else {
1342 candidate = lcand;
1343 ceph_assert(!lcand.is_max());
1344 ++ls_iter;
1345 }
1346
1347 dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
1348 << " vs lower bound 0x" << lower_bound.get_hash()
1349 << std::dec << dendl;
1350
1351 if (candidate >= next) {
1352 break;
1353 }
1354
1355 if (response.entries.size() == list_size) {
1356 next = candidate;
1357 break;
1358 }
1359
1360 if (candidate.snap != CEPH_NOSNAP)
1361 continue;
1362
1363 // skip internal namespace
1364 if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
1365 continue;
1366
1367 if (recovery_state.get_missing_loc().is_deleted(candidate))
1368 continue;
1369
1370 // skip wrong namespace
1371 if (m->get_hobj().nspace != librados::all_nspaces &&
1372 candidate.get_namespace() != m->get_hobj().nspace)
1373 continue;
1374
1375 if (filter && !pgls_filter(*filter, candidate))
1376 continue;
1377
1378 dout(20) << "pgnls item 0x" << std::hex
1379 << candidate.get_hash()
1380 << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
1381 << std::dec << " "
1382 << candidate.oid.name << dendl;
1383
1384 librados::ListObjectImpl item;
1385 item.nspace = candidate.get_namespace();
1386 item.oid = candidate.oid.name;
1387 item.locator = candidate.get_key();
1388 response.entries.push_back(item);
1389 }
1390
1391 if (next.is_max() &&
1392 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
1393 ls_iter == sentries.end()) {
1394 result = 1;
1395
1396 // Set response.handle to the start of the next PG according
1397 // to the object sort order.
1398 response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1399 } else {
1400 response.handle = next;
1401 }
1402 dout(10) << "pgnls handle=" << response.handle << dendl;
1403 encode(response, osd_op.outdata);
1404 dout(10) << " pgnls result=" << result << " outdata.length()="
1405 << osd_op.outdata.length() << dendl;
1406 }
1407 break;
1408
1409 case CEPH_OSD_OP_PGLS_FILTER:
1410 try {
1411 decode(cname, bp);
1412 decode(mname, bp);
1413 }
1414 catch (const ceph::buffer::error& e) {
1415 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1416 result = -EINVAL;
1417 break;
1418 }
1419 std::tie(result, filter) = get_pgls_filter(bp);
1420 if (result < 0)
1421 break;
1422
1423 ceph_assert(filter);
1424
1425 // fall through
1426
1427 case CEPH_OSD_OP_PGLS:
1428 if (snapid != CEPH_NOSNAP) {
1429 result = -EINVAL;
1430 break;
1431 }
1432 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1433 dout(10) << " pgls pg=" << m->get_pg()
1434 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1435 << " != " << info.pgid << dendl;
1436 result = 0; // hmm?
1437 } else {
1438 unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1439 p->op.pgls.count);
1440
1441 dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
1442 // read into a buffer
1443 vector<hobject_t> sentries;
1444 pg_ls_response_t response;
1445 try {
1446 decode(response.handle, bp);
1447 }
1448 catch (const ceph::buffer::error& e) {
1449 dout(0) << "unable to decode PGLS handle in " << *m << dendl;
1450 result = -EINVAL;
1451 break;
1452 }
1453
1454 hobject_t next;
1455 hobject_t current = response.handle;
1456 int r = pgbackend->objects_list_partial(
1457 current,
1458 list_size,
1459 list_size,
1460 &sentries,
1461 &next);
1462 if (r != 0) {
1463 result = -EINVAL;
1464 break;
1465 }
1466
1467 ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());
1468
1469 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1470 recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
1471 vector<hobject_t>::iterator ls_iter = sentries.begin();
1472 hobject_t _max = hobject_t::get_max();
1473 while (1) {
1474 const hobject_t &mcand =
1475 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
1476 _max :
1477 missing_iter->first;
1478 const hobject_t &lcand =
1479 ls_iter == sentries.end() ?
1480 _max :
1481 *ls_iter;
1482
1483 hobject_t candidate;
1484 if (mcand == lcand) {
1485 candidate = mcand;
1486 if (!mcand.is_max()) {
1487 ++ls_iter;
1488 ++missing_iter;
1489 }
1490 } else if (mcand < lcand) {
1491 candidate = mcand;
1492 ceph_assert(!mcand.is_max());
1493 ++missing_iter;
1494 } else {
1495 candidate = lcand;
1496 ceph_assert(!lcand.is_max());
1497 ++ls_iter;
1498 }
1499
1500 if (candidate >= next) {
1501 break;
1502 }
1503
1504 if (response.entries.size() == list_size) {
1505 next = candidate;
1506 break;
1507 }
1508
1509 if (candidate.snap != CEPH_NOSNAP)
1510 continue;
1511
1512 // skip wrong namespace
1513 if (candidate.get_namespace() != m->get_hobj().nspace)
1514 continue;
1515
1516 if (recovery_state.get_missing_loc().is_deleted(candidate))
1517 continue;
1518
1519 if (filter && !pgls_filter(*filter, candidate))
1520 continue;
1521
1522 response.entries.push_back(make_pair(candidate.oid,
1523 candidate.get_key()));
1524 }
1525 if (next.is_max() &&
1526 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
1527 ls_iter == sentries.end()) {
1528 result = 1;
1529 }
1530 response.handle = next;
1531 encode(response, osd_op.outdata);
1532 dout(10) << " pgls result=" << result << " outdata.length()="
1533 << osd_op.outdata.length() << dendl;
1534 }
1535 break;
1536
1537 case CEPH_OSD_OP_PG_HITSET_LS:
1538 {
1539 list< pair<utime_t,utime_t> > ls;
1540 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1541 p != info.hit_set.history.end();
1542 ++p)
1543 ls.push_back(make_pair(p->begin, p->end));
1544 if (hit_set)
1545 ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
1546 encode(ls, osd_op.outdata);
1547 }
1548 break;
1549
1550 case CEPH_OSD_OP_PG_HITSET_GET:
1551 {
1552 utime_t stamp(osd_op.op.hit_set_get.stamp);
1553 if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
1554 // read the current in-memory HitSet, not the version we've
1555 // checkpointed.
1556 if (!hit_set) {
1557 result = -ENOENT;
1558 break;
1559 }
1560 encode(*hit_set, osd_op.outdata);
1561 result = osd_op.outdata.length();
1562 } else {
1563 // read an archived HitSet.
1564 hobject_t oid;
1565 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1566 p != info.hit_set.history.end();
1567 ++p) {
1568 if (stamp >= p->begin && stamp <= p->end) {
1569 oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
1570 break;
1571 }
1572 }
1573 if (oid == hobject_t()) {
1574 result = -ENOENT;
1575 break;
1576 }
1577 if (!pool.info.is_replicated()) {
1578 // FIXME: EC not supported yet
1579 result = -EOPNOTSUPP;
1580 break;
1581 }
1582 if (is_unreadable_object(oid)) {
1583 wait_for_unreadable_object(oid, op);
1584 return;
1585 }
1586 result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
1587 }
1588 }
1589 break;
1590
1591 case CEPH_OSD_OP_SCRUBLS:
1592 result = do_scrub_ls(m, &osd_op);
1593 break;
1594
1595 default:
1596 result = -EINVAL;
1597 break;
1598 }
1599
1600 if (result < 0)
1601 break;
1602 }
1603
1604 // reply
1605 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
1606 CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
1607 false);
1608 reply->claim_op_out_data(ops);
1609 reply->set_result(result);
1610 reply->set_reply_versions(info.last_update, info.last_user_version);
1611 osd->send_message_osd_client(reply, m->get_connection());
1612 }
1613
1614 int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
1615 {
1616 if (m->get_pg() != info.pgid.pgid) {
1617 dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1618 return -EINVAL; // hmm?
1619 }
1620 auto bp = osd_op->indata.cbegin();
1621 scrub_ls_arg_t arg;
1622 try {
1623 arg.decode(bp);
1624 } catch (ceph::buffer::error&) {
1625 dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1626 return -EINVAL;
1627 }
1628
1629 int r = 0;
1630 scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1631
1632 if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1633 r = -EAGAIN;
1634 } else {
1635 bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
1636 if (store_queried) {
1637 encode(result, osd_op->outdata);
1638 } else {
1639 // the scrubber's store is not initialized
1640 r = -ENOENT;
1641 }
1642 }
1643
1644 return r;
1645 }
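// do_scrub_ls() backs 'rados list-inconsistent-obj <pgid>' and
// 'rados list-inconsistent-snapset <pgid>'; the -EAGAIN above tells the
// client its cached interval is stale and the listing must be restarted.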
1646
1647 /**
1648 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
1649 *
1650 * @param ctx [in,out] ctx to get locks for
1651 * @return true on success, false if we are queued
1652 */
1653 bool PrimaryLogPG::get_rw_locks(bool write_ordered, OpContext *ctx)
1654 {
1655 /* If head_obc is set, then !obc->obs.exists, and we will always take the
1656 * snapdir lock *before* the head lock. Since all callers (read or write)
1657 * acquire the locks in this order, getting the first guarantees we will
1658 * get the second.
1659 */
1660 if (write_ordered && ctx->op->may_read()) {
1661 ctx->lock_type = RWState::RWEXCL;
1662 } else if (write_ordered) {
1663 ctx->lock_type = RWState::RWWRITE;
1664 } else {
1665 ceph_assert(ctx->op->may_read());
1666 ctx->lock_type = RWState::RWREAD;
1667 }
1668
1669 if (ctx->head_obc) {
1670 ceph_assert(!ctx->obc->obs.exists);
1671 if (!ctx->lock_manager.get_lock_type(
1672 ctx->lock_type,
1673 ctx->head_obc->obs.oi.soid,
1674 ctx->head_obc,
1675 ctx->op)) {
1676 ctx->lock_type = RWState::RWNONE;
1677 return false;
1678 }
1679 }
1680 if (ctx->lock_manager.get_lock_type(
1681 ctx->lock_type,
1682 ctx->obc->obs.oi.soid,
1683 ctx->obc,
1684 ctx->op)) {
1685 return true;
1686 } else {
1687 ceph_assert(!ctx->head_obc);
1688 ctx->lock_type = RWState::RWNONE;
1689 return false;
1690 }
1691 }
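// Lock type selected above, summarized:
//   write-ordered op that also reads -> RWState::RWEXCL (read-modify-write)
//   write-ordered only               -> RWState::RWWRITE
//   read-only                        -> RWState::RWREAD
// On failure the op has already been queued by the lock manager and will be
// requeued when the current holder releases (see release_object_locks()).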
1692
1693 /**
1694 * Releases locks
1695 *
1696 * @param manager [in] manager with locks to release
1697 */
1698 void PrimaryLogPG::release_object_locks(
1699 ObcLockManager &lock_manager) {
1700 std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
1701 bool requeue_recovery = false;
1702 bool requeue_snaptrim = false;
1703 lock_manager.put_locks(
1704 &to_req,
1705 &requeue_recovery,
1706 &requeue_snaptrim);
1707 if (requeue_recovery)
1708 queue_recovery();
1709 if (requeue_snaptrim)
1710 snap_trimmer_machine.process_event(TrimWriteUnblocked());
1711
1712 if (!to_req.empty()) {
1713 // requeue at front of scrub blocking queue if we are blocked by scrub
1714 for (auto &&p: to_req) {
1715 if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
1716 for (auto& op : p.second) {
1717 op->mark_delayed("waiting for scrub");
1718 }
1719
1720 waiting_for_scrub.splice(
1721 waiting_for_scrub.begin(),
1722 p.second,
1723 p.second.begin(),
1724 p.second.end());
1725 } else if (is_laggy()) {
1726 for (auto& op : p.second) {
1727 op->mark_delayed("waiting for readable");
1728 }
1729 waiting_for_readable.splice(
1730 waiting_for_readable.begin(),
1731 p.second,
1732 p.second.begin(),
1733 p.second.end());
1734 } else {
1735 requeue_ops(p.second);
1736 }
1737 }
1738 }
1739 }
1740
1741 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
1742 const PGPool &_pool,
1743 const map<string,string>& ec_profile, spg_t p) :
1744 PG(o, curmap, _pool, p),
1745 pgbackend(
1746 PGBackend::build_pg_backend(
1747 _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
1748 object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
1749 new_backfill(false),
1750 temp_seq(0),
1751 snap_trimmer_machine(this)
1752 {
1753 recovery_state.set_backend_predicates(
1754 pgbackend->get_is_readable_predicate(),
1755 pgbackend->get_is_recoverable_predicate());
1756 snap_trimmer_machine.initiate();
1757
1758 m_scrubber = make_unique<PrimaryLogScrub>(this);
1759 }
1760
1761 PrimaryLogPG::~PrimaryLogPG()
1762 {
1763 m_scrubber.reset();
1764 }
1765
1766 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1767 {
1768 src_oloc = oloc;
1769 if (oloc.key.empty())
1770 src_oloc.key = oid.name;
1771 }
1772
1773 void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1774 {
1775 auto m = op->get_req<MOSDBackoff>();
1776 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
1777 if (!session)
1778 return; // drop it.
1779 hobject_t begin = info.pgid.pgid.get_hobj_start();
1780 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1781 if (begin < m->begin) {
1782 begin = m->begin;
1783 }
1784 if (end > m->end) {
1785 end = m->end;
1786 }
1787 dout(10) << __func__ << " backoff ack id " << m->id
1788 << " [" << begin << "," << end << ")" << dendl;
1789 session->ack_backoff(cct, m->pgid, m->id, begin, end);
1790 }
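// Worked example with hypothetical ranges: if this PG now spans [P0, P1) and
// the client acks a backoff on [B0, B1), the ack is clamped to the
// intersection [max(P0, B0), min(P1, B1)), so after a PG split we only
// release the portion of the range this PG still owns.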
1791
1792 void PrimaryLogPG::do_request(
1793 OpRequestRef& op,
1794 ThreadPool::TPHandle &handle)
1795 {
1796 if (op->osd_trace) {
1797 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1798 op->pg_trace.event("do request");
1799 }
1800
1801 [[maybe_unused]] auto span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
1802
1803 // make sure we have a new enough map
1804 auto p = waiting_for_map.find(op->get_source());
1805 if (p != waiting_for_map.end()) {
1806 // preserve ordering
1807 dout(20) << __func__ << " waiting_for_map "
1808 << p->first << " not empty, queueing" << dendl;
1809 p->second.push_back(op);
1810 op->mark_delayed("waiting_for_map not empty");
1811 return;
1812 }
1813 if (!have_same_or_newer_map(op->min_epoch)) {
1814 dout(20) << __func__ << " min " << op->min_epoch
1815 << ", queue on waiting_for_map " << op->get_source() << dendl;
1816 waiting_for_map[op->get_source()].push_back(op);
1817 op->mark_delayed("op must wait for map");
1818 osd->request_osdmap_update(op->min_epoch);
1819 return;
1820 }
1821
1822 if (can_discard_request(op)) {
1823 return;
1824 }
1825
1826 // pg-wide backoffs
1827 const Message *m = op->get_req();
1828 int msg_type = m->get_type();
1829 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1830 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
1831 if (!session)
1832 return; // drop it.
1833 if (msg_type == CEPH_MSG_OSD_OP) {
1834 if (session->check_backoff(cct, info.pgid,
1835 info.pgid.pgid.get_hobj_start(), m)) {
1836 return;
1837 }
1838
1839 bool backoff =
1840 is_down() ||
1841 is_incomplete() ||
1842 (!is_active() && is_peered());
1843 if (g_conf()->osd_backoff_on_peering && !backoff) {
1844 if (is_peering()) {
1845 backoff = true;
1846 }
1847 }
1848 if (backoff) {
1849 add_pg_backoff(session);
1850 return;
1851 }
1852 }
1853 // pg backoff acks at pg-level
1854 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
1855 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1856 if (ba->begin != ba->end) {
1857 handle_backoff(op);
1858 return;
1859 }
1860 }
1861 }
1862
1863 if (!is_peered()) {
1864 // Delay unless PGBackend says it's ok
1865 if (pgbackend->can_handle_while_inactive(op)) {
1866 bool handled = pgbackend->handle_message(op);
1867 ceph_assert(handled);
1868 return;
1869 } else {
1870 waiting_for_peered.push_back(op);
1871 op->mark_delayed("waiting for peered");
1872 return;
1873 }
1874 }
1875
1876 if (recovery_state.needs_flush()) {
1877 dout(20) << "waiting for flush on " << op << dendl;
1878 waiting_for_flush.push_back(op);
1879 op->mark_delayed("waiting for flush");
1880 return;
1881 }
1882
1883 ceph_assert(is_peered() && !recovery_state.needs_flush());
1884 if (pgbackend->handle_message(op))
1885 return;
1886
1887 switch (msg_type) {
1888 case CEPH_MSG_OSD_OP:
1889 case CEPH_MSG_OSD_BACKOFF:
1890 if (!is_active()) {
1891 dout(20) << " peered, not active, waiting for active on " << op << dendl;
1892 waiting_for_active.push_back(op);
1893 op->mark_delayed("waiting for active");
1894 return;
1895 }
1896 switch (msg_type) {
1897 case CEPH_MSG_OSD_OP:
1898 // verify client features
1899 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1900 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1901 osd->reply_op_error(op, -EOPNOTSUPP);
1902 return;
1903 }
1904 do_op(op);
1905 break;
1906 case CEPH_MSG_OSD_BACKOFF:
1907 // object-level backoff acks handled in osdop context
1908 handle_backoff(op);
1909 break;
1910 }
1911 break;
1912
1913 case MSG_OSD_PG_SCAN:
1914 do_scan(op, handle);
1915 break;
1916
1917 case MSG_OSD_PG_BACKFILL:
1918 do_backfill(op);
1919 break;
1920
1921 case MSG_OSD_PG_BACKFILL_REMOVE:
1922 do_backfill_remove(op);
1923 break;
1924
1925 case MSG_OSD_SCRUB_RESERVE:
1926 {
1927 if (!m_scrubber) {
1928 osd->reply_op_error(op, -EAGAIN);
1929 return;
1930 }
1931 auto m = op->get_req<MOSDScrubReserve>();
1932 switch (m->type) {
1933 case MOSDScrubReserve::REQUEST:
1934 m_scrubber->handle_scrub_reserve_request(op);
1935 break;
1936 case MOSDScrubReserve::GRANT:
1937 m_scrubber->handle_scrub_reserve_grant(op, m->from);
1938 break;
1939 case MOSDScrubReserve::REJECT:
1940 m_scrubber->handle_scrub_reserve_reject(op, m->from);
1941 break;
1942 case MOSDScrubReserve::RELEASE:
1943 m_scrubber->handle_scrub_reserve_release(op);
1944 break;
1945 }
1946 }
1947 break;
1948
1949 case MSG_OSD_REP_SCRUB:
1950 replica_scrub(op, handle);
1951 break;
1952
1953 case MSG_OSD_REP_SCRUBMAP:
1954 do_replica_scrub_map(op);
1955 break;
1956
1957 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1958 do_update_log_missing(op);
1959 break;
1960
1961 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1962 do_update_log_missing_reply(op);
1963 break;
1964
1965 default:
1966 ceph_abort_msg("bad message type in do_request");
1967 }
1968 }
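
// Sketch of the gating rule do_request() applies before anything else: once
// one op from a source is parked waiting for a newer OSDMap, every later op
// from that same source queues behind it, even if it could be served with the
// current map. All names below are illustrative stand-ins, not OSD types.
#if 0
#include <deque>
#include <map>
#include <string>

struct Op { unsigned min_epoch; };

struct MapGate {
  unsigned cur_epoch = 0;
  std::map<std::string, std::deque<Op>> waiting;  // per-source FIFO

  bool admit(const std::string& source, const Op& op) {
    auto p = waiting.find(source);
    if (p != waiting.end()) {        // preserve per-source ordering
      p->second.push_back(op);
      return false;
    }
    if (op.min_epoch > cur_epoch) {  // need a newer map first
      waiting[source].push_back(op);
      return false;
    }
    return true;                     // safe to process now
  }
};
#endif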
1969
1970 /** do_op - do an op
1971 * pg lock will be held (if multithreaded)
1972 * osd_lock NOT held.
1973 */
1974 void PrimaryLogPG::do_op(OpRequestRef& op)
1975 {
1976 FUNCTRACE(cct);
1977 // NOTE: take a non-const pointer here; we must be careful not to
1978 // change anything that will break other reads on m (operator<<).
1979 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1980 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1981 if (m->finish_decode()) {
1982 op->reset_desc(); // for TrackedOp
1983 m->clear_payload();
1984 }
1985
1986 dout(20) << __func__ << ": op " << *m << dendl;
1987
1988 const hobject_t head = m->get_hobj().get_head();
1989
1990 if (!info.pgid.pgid.contains(
1991 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1992 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1993 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1994 << std::hex << head.get_hash() << std::dec << dendl;
1995 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1996 << " op " << *m;
1997 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
1998 return;
1999 }
2000
2001 bool can_backoff =
2002 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
2003 ceph::ref_t<Session> session;
2004 if (can_backoff) {
2005 session = static_cast<Session*>(m->get_connection()->get_priv().get());
2006 if (!session.get()) {
2007 dout(10) << __func__ << " no session" << dendl;
2008 return;
2009 }
2010
2011 if (session->check_backoff(cct, info.pgid, head, m)) {
2012 return;
2013 }
2014 }
2015
2016 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
2017 // not implemented.
2018 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
2019 osd->reply_op_error(op, -EINVAL);
2020 return;
2021 }
2022
2023 {
2024 int r = op->maybe_init_op_info(*get_osdmap());
2025 if (r) {
2026 osd->reply_op_error(op, r);
2027 return;
2028 }
2029 }
2030
2031 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
2032 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
2033 op->may_read() &&
2034 !(op->may_write() || op->may_cache())) {
2035 // balanced reads; any replica will do
2036 if (!(is_primary() || is_nonprimary())) {
2037 osd->handle_misdirected_op(this, op);
2038 return;
2039 }
2040 } else {
2041 // normal case; must be primary
2042 if (!is_primary()) {
2043 osd->handle_misdirected_op(this, op);
2044 return;
2045 }
2046 }
2047
2048 if (!check_laggy(op)) {
2049 return;
2050 }
2051
2052 if (!op_has_sufficient_caps(op)) {
2053 osd->reply_op_error(op, -EPERM);
2054 return;
2055 }
2056
2057 if (op->includes_pg_op()) {
2058 return do_pg_op(op);
2059 }
2060
2061 // object name too long?
2062 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
2063 dout(4) << "do_op name is longer than "
2064 << cct->_conf->osd_max_object_name_len
2065 << " bytes" << dendl;
2066 osd->reply_op_error(op, -ENAMETOOLONG);
2067 return;
2068 }
2069 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
2070 dout(4) << "do_op locator is longer than "
2071 << cct->_conf->osd_max_object_name_len
2072 << " bytes" << dendl;
2073 osd->reply_op_error(op, -ENAMETOOLONG);
2074 return;
2075 }
2076 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
2077 dout(4) << "do_op namespace is longer than "
2078 << cct->_conf->osd_max_object_namespace_len
2079 << " bytes" << dendl;
2080 osd->reply_op_error(op, -ENAMETOOLONG);
2081 return;
2082 }
2083 if (m->get_hobj().oid.name.empty()) {
2084 dout(4) << "do_op empty oid name is not allowed" << dendl;
2085 osd->reply_op_error(op, -EINVAL);
2086 return;
2087 }
2088
2089 if (int r = osd->store->validate_hobject_key(head)) {
2090 dout(4) << "do_op object " << head << " invalid for backing store: "
2091 << r << dendl;
2092 osd->reply_op_error(op, r);
2093 return;
2094 }
2095
2096 // blocklisted?
2097 if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
2098 dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
2099 osd->reply_op_error(op, -EBLOCKLISTED);
2100 return;
2101 }
2102
2103 // order this op as a write?
2104 bool write_ordered = op->rwordered();
2105
2106 // discard due to cluster full transition? (we discard any op that
2107 // originates before the cluster or pool is marked full; the client
2108 // will resend after the full flag is removed or if they expect the
2109 // op to succeed despite being full). The exception is FULL_FORCE and
2110 // FULL_TRY ops, which we have no reason to discard because they
2111 // bypass all full checks anyway. If this op isn't write-ordered,
2112 // we skip the check.
2113 // FIXME: we exclude mds writes for now.
2114 if (write_ordered && !(m->get_source().is_mds() ||
2115 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
2116 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
2117 info.history.last_epoch_marked_full > m->get_map_epoch()) {
2118 dout(10) << __func__ << " discarding op sent before full " << m << " "
2119 << *m << dendl;
2120 return;
2121 }
2122 // The MDS should have stopped writing before this point.
2123 // We can't allow the OSD to become non-startable even if the
2124 // MDS could still be writing as part of file removals.
2125 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
2126 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
2127 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
2128 return;
2129 }
2130 int64_t poolid = get_pgid().pool();
2131 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2132 if (!pi) {
2133 return;
2134 }
2135 if (pi->has_flag(pg_pool_t::FLAG_EIO)) {
2136 // drop op on the floor; the client will handle returning EIO
2137 if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) {
2138 dout(10) << __func__ << " discarding op due to pool EIO flag" << dendl;
2139 } else {
2140 dout(10) << __func__ << " replying EIO due to pool EIO flag" << dendl;
2141 osd->reply_op_error(op, -EIO);
2142 }
2143 return;
2144 }
2145 if (op->may_write()) {
2146
2147 // invalid?
2148 if (m->get_snapid() != CEPH_NOSNAP) {
2149 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2150 osd->reply_op_error(op, -EINVAL);
2151 return;
2152 }
2153
2154 // too big?
2155 if (cct->_conf->osd_max_write_size &&
2156 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2157 // journal can't hold commit!
2158 derr << "do_op msg data len " << m->get_data_len()
2159 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2160 << " on " << *m << dendl;
2161 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2162 return;
2163 }
2164 }
2165
2166 dout(10) << "do_op " << *m
2167 << (op->may_write() ? " may_write" : "")
2168 << (op->may_read() ? " may_read" : "")
2169 << (op->may_cache() ? " may_cache" : "")
2170 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2171 << " flags " << ceph_osd_flag_string(m->get_flags())
2172 << dendl;
2173
2174 [[maybe_unused]] auto span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
2175
2176 // missing object?
2177 if (is_unreadable_object(head)) {
2178 if (!is_primary()) {
2179 osd->reply_op_error(op, -EAGAIN);
2180 return;
2181 }
2182 if (can_backoff &&
2183 (g_conf()->osd_backoff_on_degraded ||
2184 (g_conf()->osd_backoff_on_unfound &&
2185 recovery_state.get_missing_loc().is_unfound(head)))) {
2186 add_backoff(session, head, head);
2187 maybe_kick_recovery(head);
2188 } else {
2189 wait_for_unreadable_object(head, op);
2190 }
2191 return;
2192 }
2193
2194 if (write_ordered) {
2195 // degraded object?
2196 if (is_degraded_or_backfilling_object(head)) {
2197 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2198 add_backoff(session, head, head);
2199 maybe_kick_recovery(head);
2200 } else {
2201 wait_for_degraded_object(head, op);
2202 }
2203 return;
2204 }
2205
2206 if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
2207 dout(20) << __func__ << ": waiting for scrub" << dendl;
2208 waiting_for_scrub.push_back(op);
2209 op->mark_delayed("waiting for scrub");
2210 return;
2211 }
2212 if (!check_laggy_requeue(op)) {
2213 return;
2214 }
2215
2216 // blocked on snap?
2217 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2218 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2219 hobject_t to_wait_on(head);
2220 to_wait_on.snap = blocked_iter->second;
2221 wait_for_degraded_object(to_wait_on, op);
2222 return;
2223 }
2224 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2225 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2226 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2227 return;
2228 }
2229 if (objects_blocked_on_cache_full.count(head)) {
2230 block_write_on_full_cache(head, op);
2231 return;
2232 }
2233 }
2234
2235 // dup/resent?
2236 if (op->may_write() || op->may_cache()) {
2237 // warning: we will get back *a* request for this reqid, but not
2238 // necessarily the most recent. This happens with flush and
2239 // promote ops, but we can't possibly have both in our log while
2240 // the original request is still not stable on disk, so for our
2241 // purposes here it doesn't matter which one we get.
2242 eversion_t version;
2243 version_t user_version;
2244 int return_code = 0;
2245 vector<pg_log_op_return_item_t> op_returns;
2246 bool got = check_in_progress_op(
2247 m->get_reqid(), &version, &user_version, &return_code, &op_returns);
2248 if (got) {
2249 dout(3) << __func__ << " dup " << m->get_reqid()
2250 << " version " << version << dendl;
2251 if (already_complete(version)) {
2252 osd->reply_op_error(op, return_code, version, user_version, op_returns);
2253 } else {
2254 dout(10) << " waiting for " << version << " to commit" << dendl;
2255 // always queue ondisk waiters, so that we can requeue if needed
2256 waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2257 op_returns);
2258 op->mark_delayed("waiting for ondisk");
2259 }
2260 return;
2261 }
2262 }
2263
2264 ObjectContextRef obc;
2265 bool can_create = op->may_write();
2266 hobject_t missing_oid;
2267
2268 // kludge: LIST_SNAPS arrives with snapid CEPH_SNAPDIR; map it to the head
2269 const hobject_t& oid =
2270 m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
2271
2272 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2273 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2274 OSDOp& osd_op = *p;
2275
2276 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2277 if (m->get_snapid() != CEPH_SNAPDIR) {
2278 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2279 osd->reply_op_error(op, -EINVAL);
2280 return;
2281 }
2282 } else {
2283 if (m->get_snapid() == CEPH_SNAPDIR) {
2284 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2285 osd->reply_op_error(op, -EINVAL);
2286 return;
2287 }
2288 }
2289 }
2290
2291 // io blocked on obc?
2292 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2293 maybe_await_blocked_head(oid, op)) {
2294 return;
2295 }
2296
2297 if (!is_primary()) {
2298 if (!recovery_state.can_serve_replica_read(oid)) {
2299 dout(20) << __func__
2300 << ": unstable write on replica, bouncing to primary "
2301 << *m << dendl;
2302 osd->reply_op_error(op, -EAGAIN);
2303 return;
2304 }
2305 dout(20) << __func__ << ": serving replica read on oid " << oid
2306 << dendl;
2307 }
2308
2309 int r = find_object_context(
2310 oid, &obc, can_create,
2311 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2312 &missing_oid);
2313
2314 // LIST_SNAPS needs the ssc too
2315 if (obc &&
2316 m->get_snapid() == CEPH_SNAPDIR &&
2317 !obc->ssc) {
2318 obc->ssc = get_snapset_context(oid, true);
2319 }
2320
2321 if (r == -EAGAIN) {
2322 // If we're not the primary for this PG, we just reply -EAGAIN (below).
2323 // Otherwise, we have to wait for the object.
2324 if (is_primary()) {
2325 // missing the specific snap we need; requeue and wait.
2326 ceph_assert(!op->may_write()); // only happens on a read/cache
2327 wait_for_unreadable_object(missing_oid, op);
2328 return;
2329 }
2330 } else if (r == 0) {
2331 if (is_unreadable_object(obc->obs.oi.soid)) {
2332 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2333 << " is unreadable, waiting" << dendl;
2334 wait_for_unreadable_object(obc->obs.oi.soid, op);
2335 return;
2336 }
2337
2338 // degraded object? (the check above was for head; this could be a clone)
2339 if (write_ordered &&
2340 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2341 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2342 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2343 << " is degraded, waiting" << dendl;
2344 wait_for_degraded_object(obc->obs.oi.soid, op);
2345 return;
2346 }
2347 }
2348
2349 bool in_hit_set = false;
2350 if (hit_set) {
2351 if (obc.get()) {
2352 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2353 in_hit_set = true;
2354 } else {
2355 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2356 in_hit_set = true;
2357 }
2358 if (!op->hitset_inserted) {
2359 hit_set->insert(oid);
2360 op->hitset_inserted = true;
2361 if (hit_set->is_full() ||
2362 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2363 hit_set_persist();
2364 }
2365 }
2366 }
2367
2368 if (agent_state) {
2369 if (agent_choose_mode(false, op))
2370 return;
2371 }
2372
2373 if (obc.get() && obc->obs.exists) {
2374 if (recover_adjacent_clones(obc, op)) {
2375 return;
2376 }
2377 if (maybe_handle_manifest(op,
2378 write_ordered,
2379 obc))
2380 return;
2381 }
2382
2383 if (maybe_handle_cache(op,
2384 write_ordered,
2385 obc,
2386 r,
2387 missing_oid,
2388 false,
2389 in_hit_set))
2390 return;
2391
2392 if (r && (r != -ENOENT || !obc)) {
2393 // copy the reqids for copy get on ENOENT
2394 if (r == -ENOENT &&
2395 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2396 fill_in_copy_get_noent(op, oid, m->ops[0]);
2397 return;
2398 }
2399 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2400 if (op->may_write() &&
2401 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2402 record_write_error(op, oid, nullptr, r);
2403 } else {
2404 osd->reply_op_error(op, r);
2405 }
2406 return;
2407 }
2408
2409 // make sure locator is consistent
2410 object_locator_t oloc(obc->obs.oi.soid);
2411 if (m->get_object_locator() != oloc) {
2412 dout(10) << " provided locator " << m->get_object_locator()
2413 << " != object's " << obc->obs.oi.soid << dendl;
2414 osd->clog->warn() << "bad locator " << m->get_object_locator()
2415 << " on object " << oloc
2416 << " op " << *m;
2417 }
2418
2419 // io blocked on obc?
2420 if (obc->is_blocked() &&
2421 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2422 wait_for_blocked_object(obc->obs.oi.soid, op);
2423 return;
2424 }
2425
2426 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2427
2428 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2429
2430 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2431 dout(20) << __func__ << ": skipping rw locks" << dendl;
2432 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2433 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2434
2435 // verify there is in fact a flush in progress
2436 // FIXME: we could make this a stronger test.
2437 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2438 if (p == flush_ops.end()) {
2439 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2440 reply_ctx(ctx, -EINVAL);
2441 return;
2442 }
2443 } else if (!get_rw_locks(write_ordered, ctx)) {
2444 dout(20) << __func__ << " waiting for rw locks " << dendl;
2445 op->mark_delayed("waiting for rw locks");
2446 close_op_ctx(ctx);
2447 return;
2448 }
2449 dout(20) << __func__ << " obc " << *obc << dendl;
2450
2451 if (r) {
2452 dout(20) << __func__ << " returned an error: " << r << dendl;
2453 if (op->may_write() &&
2454 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2455 record_write_error(op, oid, nullptr, r,
2456 ctx->op->allows_returnvec() ? ctx : nullptr);
2457 } else {
2458 osd->reply_op_error(op, r);
2459 }
2460 close_op_ctx(ctx);
2461 return;
2462 }
2463
2464 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2465 ctx->ignore_cache = true;
2466 }
2467
2468 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2469 // This object is lost. Reading from it returns an error.
2470 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2471 << " is lost" << dendl;
2472 reply_ctx(ctx, -ENFILE);
2473 return;
2474 }
2475 if (!op->may_write() &&
2476 !op->may_cache() &&
2477 (!obc->obs.exists ||
2478 ((m->get_snapid() != CEPH_SNAPDIR) &&
2479 obc->obs.oi.is_whiteout()))) {
2480 // copy the reqids for copy get on ENOENT
2481 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2482 fill_in_copy_get_noent(op, oid, m->ops[0]);
2483 close_op_ctx(ctx);
2484 return;
2485 }
2486 reply_ctx(ctx, -ENOENT);
2487 return;
2488 }
2489
2490 op->mark_started();
2491
2492 execute_ctx(ctx);
2493 utime_t prepare_latency = ceph_clock_now();
2494 prepare_latency -= op->get_dequeued_time();
2495 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2496 if (op->may_read() && op->may_write()) {
2497 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2498 } else if (op->may_read()) {
2499 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2500 } else if (op->may_write() || op->may_cache()) {
2501 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2502 }
2503
2504 // force recovery of the oldest missing object if there are too many log entries
2505 maybe_force_recovery();
2506 }
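
// Sketch of the dup/resent handling inside do_op() above: a resent write
// whose reqid is already known is either answered immediately (if its version
// has committed) or parked on a per-version waiter list until the commit
// lands. The types below are illustrative stand-ins.
#if 0
#include <map>
#include <vector>

using Version = unsigned;
struct Waiter { int client; };

struct DupTable {
  Version last_committed = 0;
  std::map<Version, std::vector<Waiter>> waiting_for_ondisk;

  // returns true if the caller should reply with the saved result now
  bool handle_dup(Version v, const Waiter& w) {
    if (v <= last_committed)
      return true;                       // already durable
    waiting_for_ondisk[v].push_back(w);  // reply later, when v commits
    return false;
  }
};
#endif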
2507
2508 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2509 OpRequestRef op,
2510 bool write_ordered,
2511 ObjectContextRef obc)
2512 {
2513 if (!obc) {
2514 dout(20) << __func__ << ": no obc " << dendl;
2515 return cache_result_t::NOOP;
2516 }
2517
2518 if (!obc->obs.oi.has_manifest()) {
2519 dout(20) << __func__ << ": " << obc->obs.oi.soid
2520 << " is not manifest object " << dendl;
2521 return cache_result_t::NOOP;
2522 }
2523 if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2524 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2525 return cache_result_t::NOOP;
2526 }
2527
2528 // if it is write-ordered and blocked, stop now
2529 if (obc->is_blocked() && write_ordered) {
2530 // we're already doing something with this object
2531 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2532 return cache_result_t::NOOP;
2533 }
2534
2535 vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
2536 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2537 OSDOp& osd_op = *p;
2538 ceph_osd_op& op = osd_op.op;
2539 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2540 op.op == CEPH_OSD_OP_SET_CHUNK ||
2541 op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
2542 op.op == CEPH_OSD_OP_TIER_PROMOTE ||
2543 op.op == CEPH_OSD_OP_TIER_FLUSH ||
2544 op.op == CEPH_OSD_OP_TIER_EVICT ||
2545 op.op == CEPH_OSD_OP_ISDIRTY) {
2546 return cache_result_t::NOOP;
2547 }
2548 }
2549
2550 switch (obc->obs.oi.manifest.type) {
2551 case object_manifest_t::TYPE_REDIRECT:
2552 if (op->may_write() || write_ordered) {
2553 do_proxy_write(op, obc);
2554 } else {
2555 // the object has already been promoted (it has local data); serve it here
2556 if (obc->obs.oi.size != 0) {
2557 return cache_result_t::NOOP;
2558 }
2559 do_proxy_read(op, obc);
2560 }
2561 return cache_result_t::HANDLED_PROXY;
2562 case object_manifest_t::TYPE_CHUNKED:
2563 {
2564 if (can_proxy_chunked_read(op, obc)) {
2565 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2566 if (p != flush_ops.end()) {
2567 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2568 return cache_result_t::HANDLED_PROXY;
2569 }
2570 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2571 return cache_result_t::HANDLED_PROXY;
2572 }
2573
2574 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2575 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2576 hobject_t head = m->get_hobj();
2577
2578 if (is_degraded_or_backfilling_object(head)) {
2579 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2580 wait_for_degraded_object(head, op);
2581 return cache_result_t::BLOCKED_RECOVERY;
2582 }
2583
2584 if (m_scrubber->write_blocked_by_scrub(head)) {
2585 dout(20) << __func__ << ": waiting for scrub" << dendl;
2586 waiting_for_scrub.push_back(op);
2587 op->mark_delayed("waiting for scrub");
2588 return cache_result_t::BLOCKED_RECOVERY;
2589 }
2590 if (!check_laggy_requeue(op)) {
2591 return cache_result_t::BLOCKED_RECOVERY;
2592 }
2593
2594 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2595 if (p.second.is_missing()) {
2596 auto m = op->get_req<MOSDOp>();
2597 const object_locator_t oloc = m->get_object_locator();
2598 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2599 return cache_result_t::BLOCKED_PROMOTE;
2600 }
2601 }
2602 return cache_result_t::NOOP;
2603 }
2604 default:
2605 ceph_abort_msg("unrecognized manifest type");
2606 }
2607
2608 return cache_result_t::NOOP;
2609 }
2610
2611 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2612 MOSDOpReply *orig_reply, int r,
2613 OpContext *ctx_for_op_returns)
2614 {
2615 dout(20) << __func__ << " r=" << r << dendl;
2616 ceph_assert(op->may_write());
2617 const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
2618 mempool::osd_pglog::list<pg_log_entry_t> entries;
2619 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2620 get_next_version(), eversion_t(), 0,
2621 reqid, utime_t(), r));
2622 if (ctx_for_op_returns) {
2623 entries.back().set_op_returns(*ctx_for_op_returns->ops);
2624 dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2625 }
2626
2627 struct OnComplete {
2628 PrimaryLogPG *pg;
2629 OpRequestRef op;
2630 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2631 int r;
2632 OnComplete(
2633 PrimaryLogPG *pg,
2634 OpRequestRef op,
2635 MOSDOpReply *orig_reply,
2636 int r)
2637 : pg(pg), op(op),
2638 orig_reply(orig_reply, false /* take over ref */), r(r)
2639 {}
2640 void operator()() {
2641 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2642 auto m = op->get_req<MOSDOp>();
2643 MOSDOpReply *reply = orig_reply.detach();
2644 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2645 pg->osd->send_message_osd_client(reply, m->get_connection());
2646 }
2647 };
2648
2649 ObcLockManager lock_manager;
2650 submit_log_entries(
2651 entries,
2652 std::move(lock_manager),
2653 std::optional<std::function<void(void)> >(
2654 OnComplete(this, op, orig_reply, r)),
2655 op,
2656 r);
2657 }
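
// Sketch of the ref-adoption idiom OnComplete uses above:
// boost::intrusive_ptr<T>(p, /*add_ref=*/false) takes over a reference the
// caller already holds instead of bumping the count again. Msg is a
// hypothetical refcounted type.
#if 0
#include <boost/intrusive_ptr.hpp>

struct Msg { int nref = 1; };  // a new Msg starts with one ref
inline void intrusive_ptr_add_ref(Msg* m) { ++m->nref; }
inline void intrusive_ptr_release(Msg* m) { if (--m->nref == 0) delete m; }

void adopt(Msg* raw) {
  boost::intrusive_ptr<Msg> p(raw, false);  // adopt: nref stays 1
  // when p goes out of scope, the adopted ref is dropped and raw is freed
}
#endif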
2658
2659 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2660 OpRequestRef op,
2661 bool write_ordered,
2662 ObjectContextRef obc,
2663 int r, hobject_t missing_oid,
2664 bool must_promote,
2665 bool in_hit_set,
2666 ObjectContextRef *promote_obc)
2667 {
2668 // return quickly if caching is not enabled
2669 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2670 return cache_result_t::NOOP;
2671
2672 if (op &&
2673 op->get_req() &&
2674 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2675 (op->get_req<MOSDOp>()->get_flags() &
2676 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2677 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2678 return cache_result_t::NOOP;
2679 }
2680
2681 must_promote = must_promote || op->need_promote();
2682
2683 if (obc)
2684 dout(25) << __func__ << " " << obc->obs.oi << " "
2685 << (obc->obs.exists ? "exists" : "DNE")
2686 << " missing_oid " << missing_oid
2687 << " must_promote " << (int)must_promote
2688 << " in_hit_set " << (int)in_hit_set
2689 << dendl;
2690 else
2691 dout(25) << __func__ << " (no obc)"
2692 << " missing_oid " << missing_oid
2693 << " must_promote " << (int)must_promote
2694 << " in_hit_set " << (int)in_hit_set
2695 << dendl;
2696
2697 // if it is write-ordered and blocked, stop now
2698 if (obc.get() && obc->is_blocked() && write_ordered) {
2699 // we're already doing something with this object
2700 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2701 return cache_result_t::NOOP;
2702 }
2703
2704 if (r == -ENOENT && missing_oid == hobject_t()) {
2705 // we know this object is logically absent (e.g., an undefined clone)
2706 return cache_result_t::NOOP;
2707 }
2708
2709 if (obc.get() && obc->obs.exists) {
2710 osd->logger->inc(l_osd_op_cache_hit);
2711 return cache_result_t::NOOP;
2712 }
2713 if (!is_primary()) {
2714 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2715 osd->reply_op_error(op, -EAGAIN);
2716 return cache_result_t::REPLIED_WITH_EAGAIN;
2717 }
2718
2719 if (missing_oid == hobject_t() && obc.get()) {
2720 missing_oid = obc->obs.oi.soid;
2721 }
2722
2723 auto m = op->get_req<MOSDOp>();
2724 const object_locator_t oloc = m->get_object_locator();
2725
2726 if (op->need_skip_handle_cache()) {
2727 return cache_result_t::NOOP;
2728 }
2729
2730 OpRequestRef promote_op;
2731
2732 switch (pool.info.cache_mode) {
2733 case pg_pool_t::CACHEMODE_WRITEBACK:
2734 if (agent_state &&
2735 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2736 if (!op->may_write() && !op->may_cache() &&
2737 !write_ordered && !must_promote) {
2738 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2739 do_proxy_read(op);
2740 return cache_result_t::HANDLED_PROXY;
2741 }
2742 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2743 block_write_on_full_cache(missing_oid, op);
2744 return cache_result_t::BLOCKED_FULL;
2745 }
2746
2747 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2748 promote_object(obc, missing_oid, oloc, op, promote_obc);
2749 return cache_result_t::BLOCKED_PROMOTE;
2750 }
2751
2752 if (op->may_write() || op->may_cache()) {
2753 do_proxy_write(op);
2754
2755 // Promote too?
2756 if (!op->need_skip_promote() &&
2757 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2758 pool.info.min_write_recency_for_promote,
2759 OpRequestRef(),
2760 promote_obc)) {
2761 return cache_result_t::BLOCKED_PROMOTE;
2762 }
2763 return cache_result_t::HANDLED_PROXY;
2764 } else {
2765 do_proxy_read(op);
2766
2767 // Avoid duplicate promotion
2768 if (obc.get() && obc->is_blocked()) {
2769 if (promote_obc)
2770 *promote_obc = obc;
2771 return cache_result_t::BLOCKED_PROMOTE;
2772 }
2773
2774 // Promote too?
2775 if (!op->need_skip_promote()) {
2776 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2777 pool.info.min_read_recency_for_promote,
2778 promote_op, promote_obc);
2779 }
2780
2781 return cache_result_t::HANDLED_PROXY;
2782 }
2783 ceph_abort_msg("unreachable");
2784 return cache_result_t::NOOP;
2785
2786 case pg_pool_t::CACHEMODE_READONLY:
2787 // TODO: clean this case up
2788 if (!obc.get() && r == -ENOENT) {
2789 // we don't have the object and op's a read
2790 promote_object(obc, missing_oid, oloc, op, promote_obc);
2791 return cache_result_t::BLOCKED_PROMOTE;
2792 }
2793 if (!r) { // it must be a write
2794 do_cache_redirect(op);
2795 return cache_result_t::HANDLED_REDIRECT;
2796 }
2797 // crap, there was a failure of some kind
2798 return cache_result_t::NOOP;
2799
2800 case pg_pool_t::CACHEMODE_FORWARD:
2801 // this mode is deprecated; proxy instead
2802 case pg_pool_t::CACHEMODE_PROXY:
2803 if (!must_promote) {
2804 if (op->may_write() || op->may_cache() || write_ordered) {
2805 do_proxy_write(op);
2806 return cache_result_t::HANDLED_PROXY;
2807 } else {
2808 do_proxy_read(op);
2809 return cache_result_t::HANDLED_PROXY;
2810 }
2811 }
2812 // ugh, we're forced to promote.
2813 if (agent_state &&
2814 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2815 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2816 block_write_on_full_cache(missing_oid, op);
2817 return cache_result_t::BLOCKED_FULL;
2818 }
2819 promote_object(obc, missing_oid, oloc, op, promote_obc);
2820 return cache_result_t::BLOCKED_PROMOTE;
2821
2822 case pg_pool_t::CACHEMODE_READFORWARD:
2823 // this mode is deprecated; proxy instead
2824 case pg_pool_t::CACHEMODE_READPROXY:
2825 // Do writeback to the cache tier for writes
2826 if (op->may_write() || write_ordered || must_promote) {
2827 if (agent_state &&
2828 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2829 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2830 block_write_on_full_cache(missing_oid, op);
2831 return cache_result_t::BLOCKED_FULL;
2832 }
2833 promote_object(obc, missing_oid, oloc, op, promote_obc);
2834 return cache_result_t::BLOCKED_PROMOTE;
2835 }
2836
2837 // If it is a read, we can serve it by proxying it to the base tier
2838 do_proxy_read(op);
2839 return cache_result_t::HANDLED_PROXY;
2840
2841 default:
2842 ceph_abort_msg("unrecognized cache_mode");
2843 }
2844 return cache_result_t::NOOP;
2845 }
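
// Condensed, approximate sketch of the decision table above. The real
// function also handles promotion recency, a full cache, throttling, and
// blocked objects; this only maps (mode, read/write, local hit) to the basic
// action, with hypothetical enum names.
#if 0
enum class Mode { WRITEBACK, READONLY, PROXY, READPROXY };
enum class Action { LOCAL, PROXY_READ, PROXY_WRITE, PROMOTE, REDIRECT };

Action decide(Mode mode, bool is_write, bool hit_locally) {
  if (hit_locally)
    return Action::LOCAL;  // object is in the cache tier: serve it here
  switch (mode) {
  case Mode::WRITEBACK:   // proxy now, possibly promote in the background
    return is_write ? Action::PROXY_WRITE : Action::PROXY_READ;
  case Mode::READONLY:    // writes go straight to the base pool
    return is_write ? Action::REDIRECT : Action::PROMOTE;
  case Mode::PROXY:       // promote only when forced
    return is_write ? Action::PROXY_WRITE : Action::PROXY_READ;
  case Mode::READPROXY:   // writes must be promoted into the cache tier
    return is_write ? Action::PROMOTE : Action::PROXY_READ;
  }
  return Action::LOCAL;   // not reached
}
#endif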
2846
2847 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2848 const hobject_t& missing_oid,
2849 const object_locator_t& oloc,
2850 bool in_hit_set,
2851 uint32_t recency,
2852 OpRequestRef promote_op,
2853 ObjectContextRef *promote_obc)
2854 {
2855 dout(20) << __func__ << " missing_oid " << missing_oid
2856 << " in_hit_set " << in_hit_set << dendl;
2857
2858 switch (recency) {
2859 case 0:
2860 break;
2861 case 1:
2862 // Check if in the current hit set
2863 if (in_hit_set) {
2864 break;
2865 } else {
2866 // not promoting
2867 return false;
2868 }
2869 break;
2870 default:
2871 {
2872 unsigned count = in_hit_set ? 1 : 0;
2873 if (count) {
2874 // Check if in other hit sets
2875 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2876 for (map<time_t,HitSetRef>::reverse_iterator itor =
2877 agent_state->hit_set_map.rbegin();
2878 itor != agent_state->hit_set_map.rend();
2879 ++itor) {
2880 if (!itor->second->contains(oid)) {
2881 break;
2882 }
2883 ++count;
2884 if (count >= recency) {
2885 break;
2886 }
2887 }
2888 }
2889 if (count >= recency) {
2890 break;
2891 }
2892 return false; // not promoting
2893 }
2894 break;
2895 }
2896
2897 if (osd->promote_throttle()) {
2898 dout(10) << __func__ << " promote throttled" << dendl;
2899 return false;
2900 }
2901 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2902 return true;
2903 }
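
// Self-contained sketch of the recency test above: promote only if the
// object is in the current hit set and in the next (recency - 1) most recent
// archived hit sets, scanned newest to oldest; one miss ends the streak.
// std::set stands in for the real HitSet here.
#if 0
#include <ctime>
#include <map>
#include <set>
#include <string>

bool recent_enough(const std::string& oid,
                   bool in_current_hit_set,
                   unsigned recency,
                   const std::map<time_t, std::set<std::string>>& archived) {
  if (recency == 0)
    return true;                     // recency 0: always promote
  unsigned count = in_current_hit_set ? 1 : 0;
  if (!count)
    return false;
  for (auto it = archived.rbegin();
       it != archived.rend() && count < recency; ++it) {
    if (!it->second.count(oid))
      break;                         // streak broken: stop counting
    ++count;
  }
  return count >= recency;
}
#endif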
2904
2905 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2906 {
2907 auto m = op->get_req<MOSDOp>();
2908 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2909 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2910 flags, false);
2911 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2912 reply->set_redirect(redir);
2913 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2914 << op << dendl;
2915 m->get_connection()->send_message(reply);
2916 return;
2917 }
2918
2919 struct C_ProxyRead : public Context {
2920 PrimaryLogPGRef pg;
2921 hobject_t oid;
2922 epoch_t last_peering_reset;
2923 ceph_tid_t tid;
2924 PrimaryLogPG::ProxyReadOpRef prdop;
2925 utime_t start;
2926 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2927 const PrimaryLogPG::ProxyReadOpRef& prd)
2928 : pg(p), oid(o), last_peering_reset(lpr),
2929 tid(0), prdop(prd), start(ceph_clock_now())
2930 {}
2931 void finish(int r) override {
2932 if (prdop->canceled)
2933 return;
2934 std::scoped_lock locker{*pg};
2935 if (prdop->canceled) {
2936 return;
2937 }
2938 if (last_peering_reset == pg->get_last_peering_reset()) {
2939 pg->finish_proxy_read(oid, tid, r);
2940 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2941 }
2942 }
2943 };
2944
2945 struct C_ProxyChunkRead : public Context {
2946 PrimaryLogPGRef pg;
2947 hobject_t oid;
2948 epoch_t last_peering_reset;
2949 ceph_tid_t tid;
2950 PrimaryLogPG::ProxyReadOpRef prdop;
2951 utime_t start;
2952 ObjectOperation *obj_op;
2953 int op_index = 0;
2954 uint64_t req_offset = 0;
2955 ObjectContextRef obc;
2956 uint64_t req_total_len = 0;
2957 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2958 const PrimaryLogPG::ProxyReadOpRef& prd)
2959 : pg(p), oid(o), last_peering_reset(lpr),
2960 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
2961 {}
2962 void finish(int r) override {
2963 if (prdop->canceled)
2964 return;
2965 std::scoped_lock locker{*pg};
2966 if (prdop->canceled) {
2967 return;
2968 }
2969 if (last_peering_reset == pg->get_last_peering_reset()) {
2970 if (r >= 0) {
2971 if (!prdop->ops[op_index].outdata.length()) {
2972 ceph_assert(req_total_len);
2973 bufferlist list;
2974 bufferptr bptr(req_total_len);
2975 list.push_back(std::move(bptr));
2976 prdop->ops[op_index].outdata.append(list);
2977 }
2978 ceph_assert(obj_op);
2979 uint64_t copy_offset;
2980 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
2981 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
2982 } else {
2983 copy_offset = 0;
2984 }
2985 prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
2986 obj_op->ops[0].outdata.length(),
2987 obj_op->ops[0].outdata.c_str());
2988 }
2989
2990 pg->finish_proxy_read(oid, tid, r);
2991 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2992 if (obj_op) {
2993 delete obj_op;
2994 }
2995 }
2996 }
2997 };
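
// Sketch of the completion guard both proxy-read contexts above follow:
// check the canceled flag once without the lock (cheap fast path), re-check
// it under the PG lock, and only deliver the result if no peering reset has
// happened since the op was issued. Names are illustrative.
#if 0
#include <mutex>

struct ProxyOp { bool canceled = false; };

struct Pg {
  std::mutex lock;
  unsigned last_peering_reset = 0;
  void deliver(int /*r*/) {}
};

void on_finish(Pg& pg, ProxyOp& op, unsigned issued_epoch, int r) {
  if (op.canceled)
    return;                            // racy fast path, no lock taken
  std::scoped_lock locker{pg.lock};
  if (op.canceled)
    return;                            // authoritative re-check under lock
  if (issued_epoch == pg.last_peering_reset)
    pg.deliver(r);                     // still in the same interval
}
#endif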
2998
2999 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
3000 {
3001 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3002 // stash the result in the request's OSDOp vector
3003 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3004 object_locator_t oloc;
3005 hobject_t soid;
3006 /* extensible tier */
3007 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3008 switch (obc->obs.oi.manifest.type) {
3009 case object_manifest_t::TYPE_REDIRECT:
3010 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3011 soid = obc->obs.oi.manifest.redirect_target;
3012 break;
3013 default:
3014 ceph_abort_msg("unrecognized manifest type");
3015 }
3016 } else {
3017 /* proxy */
3018 soid = m->get_hobj();
3019 oloc = object_locator_t(m->get_object_locator());
3020 oloc.pool = pool.info.tier_of;
3021 }
3022 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3023
3024 // pass through some original flags that make sense.
3025 // - leave out redirection and balancing flags since we are
3026 // already proxying through the primary
3027 // - leave off read/write/exec flags that are derived from the op
3028 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3029 CEPH_OSD_FLAG_ORDERSNAP |
3030 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3031 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3032
3033 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3034
3035 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3036
3037 ObjectOperation obj_op;
3038 obj_op.dup(prdop->ops);
3039
3040 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3041 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3042 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3043 ceph_osd_op& op = obj_op.ops[i].op; // take a reference so the flag update below sticks
3044 switch (op.op) {
3045 case CEPH_OSD_OP_READ:
3046 case CEPH_OSD_OP_SYNC_READ:
3047 case CEPH_OSD_OP_SPARSE_READ:
3048 case CEPH_OSD_OP_CHECKSUM:
3049 case CEPH_OSD_OP_CMPEXT:
3050 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3051 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3052 }
3053 }
3054 }
3055
3056 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3057 prdop);
3058 ceph_tid_t tid = osd->objecter->read(
3059 soid.oid, oloc, obj_op,
3060 m->get_snapid(), NULL,
3061 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3062 &prdop->user_version,
3063 &prdop->data_offset,
3064 m->get_features());
3065 fin->tid = tid;
3066 prdop->objecter_tid = tid;
3067 proxyread_ops[tid] = prdop;
3068 in_progress_proxy_ops[soid].push_back(op);
3069 }
3070
3071 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3072 {
3073 dout(10) << __func__ << " " << oid << " tid " << tid
3074 << " " << cpp_strerror(r) << dendl;
3075
3076 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3077 if (p == proxyread_ops.end()) {
3078 dout(10) << __func__ << " no proxyread_op found" << dendl;
3079 return;
3080 }
3081 ProxyReadOpRef prdop = p->second;
3082 if (tid != prdop->objecter_tid) {
3083 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3084 << " tid " << prdop->objecter_tid << dendl;
3085 return;
3086 }
3087 if (oid != prdop->soid) {
3088 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3089 << " soid " << prdop->soid << dendl;
3090 return;
3091 }
3092 proxyread_ops.erase(tid);
3093
3094 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3095 if (q == in_progress_proxy_ops.end()) {
3096 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3097 return;
3098 }
3099 ceph_assert(q->second.size());
3100 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3101 q->second.end(),
3102 prdop->op);
3103 ceph_assert(it != q->second.end());
3104 OpRequestRef op = *it;
3105 q->second.erase(it);
3106 if (q->second.size() == 0) {
3107 in_progress_proxy_ops.erase(oid);
3108 } else if (std::find(q->second.begin(),
3109 q->second.end(),
3110 prdop->op) != q->second.end()) {
3111 /* multiple read case */
3112 dout(20) << __func__ << " " << oid << " is not completed " << dendl;
3113 return;
3114 }
3115
3116 osd->logger->inc(l_osd_tier_proxy_read);
3117
3118 auto m = op->get_req<MOSDOp>();
3119 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3120 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3121 ctx->user_at_version = prdop->user_version;
3122 ctx->data_off = prdop->data_offset;
3123 ctx->ignore_log_op_stats = true;
3124 complete_read_ctx(r, ctx);
3125 }
3126
3127 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3128 {
3129 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3130 if (p == in_progress_proxy_ops.end())
3131 return;
3132
3133 list<OpRequestRef>& ls = p->second;
3134 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3135 requeue_ops(ls);
3136 in_progress_proxy_ops.erase(p);
3137 }
3138
3139 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3140 vector<ceph_tid_t> *tids)
3141 {
3142 dout(10) << __func__ << " " << prdop->soid << dendl;
3143 prdop->canceled = true;
3144
3145 // cancel objecter op, if we can
3146 if (prdop->objecter_tid) {
3147 tids->push_back(prdop->objecter_tid);
3148 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3149 prdop->ops[i].outdata.clear();
3150 }
3151 proxyread_ops.erase(prdop->objecter_tid);
3152 prdop->objecter_tid = 0;
3153 }
3154 }
3155
3156 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3157 {
3158 dout(10) << __func__ << dendl;
3159
3160 // cancel proxy reads
3161 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3162 while (p != proxyread_ops.end()) {
3163 cancel_proxy_read((p++)->second, tids);
3164 }
3165
3166 // cancel proxy writes
3167 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3168 while (q != proxywrite_ops.end()) {
3169 cancel_proxy_write((q++)->second, tids);
3170 }
3171
3172 if (requeue) {
3173 map<hobject_t, list<OpRequestRef>>::iterator p =
3174 in_progress_proxy_ops.begin();
3175 while (p != in_progress_proxy_ops.end()) {
3176 list<OpRequestRef>& ls = p->second;
3177 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3178 << " requests" << dendl;
3179 requeue_ops(ls);
3180 in_progress_proxy_ops.erase(p++);
3181 }
3182 } else {
3183 in_progress_proxy_ops.clear();
3184 }
3185 }
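
// Sketch of the erase-while-iterating idiom used above: post-increment the
// iterator inside erase() so it advances before its element disappears.
// Since C++11, `p = m.erase(p)` is an equivalent alternative.
#if 0
#include <map>

void drop_all(std::map<int, int>& m) {
  auto p = m.begin();
  while (p != m.end()) {
    // p++ hands the current position to erase() and steps forward first
    m.erase(p++);
  }
}
#endif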
3186
3187 struct C_ProxyWrite_Commit : public Context {
3188 PrimaryLogPGRef pg;
3189 hobject_t oid;
3190 epoch_t last_peering_reset;
3191 ceph_tid_t tid;
3192 PrimaryLogPG::ProxyWriteOpRef pwop;
3193 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3194 const PrimaryLogPG::ProxyWriteOpRef& pw)
3195 : pg(p), oid(o), last_peering_reset(lpr),
3196 tid(0), pwop(pw)
3197 {}
3198 void finish(int r) override {
3199 if (pwop->canceled)
3200 return;
3201 std::scoped_lock locker{*pg};
3202 if (pwop->canceled) {
3203 return;
3204 }
3205 if (last_peering_reset == pg->get_last_peering_reset()) {
3206 pg->finish_proxy_write(oid, tid, r);
3207 }
3208 }
3209 };
3210
3211 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3212 {
3213 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3214 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3215 object_locator_t oloc;
3216 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3217 hobject_t soid;
3218 /* extensible tier */
3219 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3220 switch (obc->obs.oi.manifest.type) {
3221 case object_manifest_t::TYPE_REDIRECT:
3222 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3223 soid = obc->obs.oi.manifest.redirect_target;
3224 break;
3225 default:
3226 ceph_abort_msg("unrecognized manifest type");
3227 }
3228 } else {
3229 /* proxy */
3230 soid = m->get_hobj();
3231 oloc = object_locator_t(m->get_object_locator());
3232 oloc.pool = pool.info.tier_of;
3233 }
3234
3235 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3236 if (!(op->may_write() || op->may_cache())) {
3237 flags |= CEPH_OSD_FLAG_RWORDERED;
3238 }
3239 if (op->allows_returnvec()) {
3240 flags |= CEPH_OSD_FLAG_RETURNVEC;
3241 }
3242
3243 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3244
3245 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3246 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3247 pwop->mtime = m->get_mtime();
3248
3249 ObjectOperation obj_op;
3250 obj_op.dup(pwop->ops);
3251
3252 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3253 this, soid, get_last_peering_reset(), pwop);
3254 ceph_tid_t tid = osd->objecter->mutate(
3255 soid.oid, oloc, obj_op, snapc,
3256 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3257 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3258 &pwop->user_version, pwop->reqid);
3259 fin->tid = tid;
3260 pwop->objecter_tid = tid;
3261 proxywrite_ops[tid] = pwop;
3262 in_progress_proxy_ops[soid].push_back(op);
3263 }
3264
3265 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3266 ObjectContextRef obc, bool write_ordered)
3267 {
3268 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3269 OSDOp *osd_op = NULL;
3270 for (unsigned int i = 0; i < m->ops.size(); i++) {
3271 osd_op = &m->ops[i];
3272 uint64_t cursor = osd_op->op.extent.offset;
3273 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3274 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3275 object_manifest_t *manifest = &obc->obs.oi.manifest;
3276 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3277
3278 while (cursor < op_length) {
3279 chunk_index = 0;
3280 chunk_length = 0;
3281 /* find the right chunk position for cursor */
3282 for (auto &p : manifest->chunk_map) {
3283 if (p.first <= cursor && p.first + p.second.length > cursor) {
3284 chunk_length = p.second.length;
3285 chunk_index = p.first;
3286 break;
3287 }
3288 }
3289 /* no index */
3290 if (!chunk_index && !chunk_length) {
3291 if (cursor == osd_op->op.extent.offset) {
3292 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3293 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3294 ctx->data_off = osd_op->op.extent.offset;
3295 ctx->ignore_log_op_stats = true;
3296 complete_read_ctx(0, ctx);
3297 }
3298 break;
3299 }
3300 uint64_t next_length = chunk_length;
3301 /* the size to read -> | op length | */
3302 /* | a chunk | */
3303 if (cursor + next_length > op_length) {
3304 next_length = op_length - cursor;
3305 }
3306 /* the size to read -> | op length | */
3307 /* | a chunk | */
3308 if (cursor + next_length > chunk_index + chunk_length) {
3309 next_length = chunk_index + chunk_length - cursor;
3310 }
3311
3312 chunk_read[cursor] = {{chunk_index, next_length}};
3313 cursor += next_length;
3314 }
3315
3316 req_len = cursor - osd_op->op.extent.offset;
3317 for (auto &p : chunk_read) {
3318 auto chunks = p.second.begin();
3319 dout(20) << __func__ << " chunk_index: " << chunks->first
3320 << " next_length: " << chunks->second << " cursor: "
3321 << p.first << dendl;
3322 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3323 }
3324 }
3325 }
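
// Self-contained sketch of the extent arithmetic above: walk a cursor across
// [offset, offset + length) and clip each sub-read to both the end of the
// request and the end of the chunk containing the cursor. Here chunk_map
// maps chunk start offset -> chunk length; the real code also issues a
// proxied read per extent.
#if 0
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

struct Extent { uint64_t chunk_start, read_off, read_len; };

std::vector<Extent> split(const std::map<uint64_t, uint64_t>& chunk_map,
                          uint64_t offset, uint64_t length) {
  std::vector<Extent> out;
  uint64_t cursor = offset, end = offset + length;
  while (cursor < end) {
    auto p = chunk_map.upper_bound(cursor);   // first chunk starting past cursor
    if (p == chunk_map.begin())
      break;                                  // no chunk covers the cursor
    --p;
    if (p->first + p->second <= cursor)
      break;                                  // gap in the chunk map
    // clip to the end of the request and the end of this chunk
    uint64_t next = std::min(end, p->first + p->second) - cursor;
    out.push_back({p->first, cursor, next});
    cursor += next;
  }
  return out;
}
#endif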
3326
3327 struct RefCountCallback : public Context {
3328 public:
3329 PrimaryLogPG::OpContext *ctx;
3330 OSDOp& osd_op;
3331 bool requeue = false;
3332
3333 RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3334 : ctx(ctx), osd_op(osd_op) {}
3335 void finish(int r) override {
3336 // NB: caller must already have pg->lock held
3337 ctx->obc->stop_block();
3338 ctx->pg->kick_object_context_blocked(ctx->obc);
3339 if (r >= 0) {
3340 osd_op.rval = 0;
3341 ctx->pg->execute_ctx(ctx);
3342 } else {
3343 // on cancel simply toss op out,
3344 // or requeue as requested
3345 if (r != -ECANCELED) {
3346 if (ctx->op)
3347 ctx->pg->osd->reply_op_error(ctx->op, r);
3348 } else if (requeue) {
3349 if (ctx->op)
3350 ctx->pg->requeue_op(ctx->op);
3351 }
3352 ctx->pg->close_op_ctx(ctx);
3353 }
3354 }
3355 void set_requeue(bool rq) {
3356 requeue = rq;
3357 }
3358 };
3359
3360 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3361 OSDOp& osd_op;
3362
3363 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3364 }
3365
3366 int execute() override {
3367 return osd_op.rval;
3368 }
3369 };
3370
3371 struct C_SetManifestRefCountDone : public Context {
3372 PrimaryLogPGRef pg;
3373 hobject_t soid;
3374 uint64_t offset;
3375 ceph_tid_t tid = 0;
3376 C_SetManifestRefCountDone(PrimaryLogPG *p,
3377 hobject_t soid, uint64_t offset) :
3378 pg(p), soid(soid), offset(offset) {}
3379 void finish(int r) override {
3380 if (r == -ECANCELED)
3381 return;
3382 std::scoped_lock locker{*pg};
3383 pg->finish_set_manifest_refcount(soid, r, tid, offset);
3384 }
3385 };
3386
3387 struct C_SetDedupChunks : public Context {
3388 PrimaryLogPGRef pg;
3389 hobject_t oid;
3390 epoch_t last_peering_reset;
3391 ceph_tid_t tid;
3392 uint64_t offset;
3393
3394 C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
3395 : pg(p), oid(o), last_peering_reset(lpr),
3396 tid(0), offset(offset)
3397 {}
3398 void finish(int r) override {
3399 if (r == -ECANCELED)
3400 return;
3401 std::scoped_lock locker{*pg};
3402 if (last_peering_reset != pg->get_last_peering_reset()) {
3403 return;
3404 }
3405 pg->finish_set_dedup(oid, r, tid, offset);
3406 }
3407 };
3408
3409 void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3410 {
3411 dout(10) << __func__ << dendl;
3412 auto p = manifest_ops.begin();
3413 while (p != manifest_ops.end()) {
3414 auto mop = p->second;
3415 // cancel objecter op, if we can
3416 if (mop->objecter_tid) {
3417 tids->push_back(mop->objecter_tid);
3418 mop->objecter_tid = 0;
3419 } else if (!mop->tids.empty()) {
3420 for (auto &p : mop->tids) {
3421 tids->push_back(p.second);
3422 }
3423 }
3424 if (mop->cb) {
3425 mop->cb->set_requeue(requeue);
3426 mop->cb->complete(-ECANCELED);
3427 }
3428 manifest_ops.erase(p++);
3429 }
3430 }
3431
3432 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op)
3433 {
3434 int cnt = 0;
3435 // head
3436 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3437 if (p.second.oid.oid.name == fp_oid) {
3438 cnt++;
3439 }
3440 }
3441 // snap
3442 SnapSet& ss = obc->ssc->snapset;
3443 const OSDMapRef& osdmap = get_osdmap();
3444 for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
3445 p != ss.clones.rend();
3446 ++p) {
3447 object_ref_delta_t refs;
3448 ObjectContextRef obc_l = nullptr;
3449 ObjectContextRef obc_g = nullptr;
3450 hobject_t clone_oid = obc->obs.oi.soid;
3451 clone_oid.snap = *p;
3452 if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
3453 return -EBUSY;
3454 }
3455 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
3456 if (!clone_obc) {
3457 break;
3458 }
3459 if (recover_adjacent_clones(clone_obc, op)) {
3460 return -EAGAIN;
3461 }
3462 get_adjacent_clones(clone_obc, obc_l, obc_g);
3463 clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
3464 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3465 nullptr,
3466 refs);
3467 for (auto p = refs.begin(); p != refs.end(); ++p) {
3468 if (p->first.oid.name == fp_oid && p->second > 0) {
3469 cnt += p->second;
3470 }
3471 }
3472 }
3473
3474 return cnt;
3475 }
3476
3477 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
3478 {
3479 if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
3480 return false;
3481 }
3482 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3483 bool has_manifest_op = std::any_of(
3484 begin(m->ops),
3485 end(m->ops),
3486 [](const auto& osd_op) {
3487 return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
3488 });
3489 if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
3490 return false;
3491 }
3492 ceph_assert(op);
3493
3494 const SnapSet& snapset = obc->ssc->snapset;
3495 auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
3496 auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
3497 hobject_t cid = obc->obs.oi.soid;
3498 cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3499 if (is_unreadable_object(cid)) {
3500 dout(10) << __func__ << ": clone " << cid
3501 << " is unreadable, waiting" << dendl;
3502 wait_for_unreadable_object(cid, op);
3503 return true;
3504 }
3505 return false;
3506 };
3507 if (s != snapset.clones.begin()) {
3508 if (is_unreadable_snap(s - 1)) {
3509 return true;
3510 }
3511 }
3512 if (s != snapset.clones.end()) {
3513 if (is_unreadable_snap(s + 1)) {
3514 return true;
3515 }
3516 }
3517 return false;
3518 }
3519
3520 ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
3521 {
3522 auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
3523 obc->obs.oi.soid.snap);
3524 if (s != obc->ssc->snapset.clones.begin()) {
3525 auto s_iter = s - 1;
3526 hobject_t cid = obc->obs.oi.soid;
3527 object_ref_delta_t refs;
3528 cid.snap = *s_iter;
3529 ObjectContextRef cobc = get_object_context(cid, false, NULL);
3530 ceph_assert(cobc);
3531 return cobc;
3532 }
3533 return nullptr;
3534 }
3535
3536 void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
3537 {
3538 for (auto p = refs.begin(); p != refs.end(); ++p) {
3539 int dec_ref_count = p->second;
3540 ceph_assert(dec_ref_count < 0);
3541 while (dec_ref_count < 0) {
3542 dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
3543 refcount_manifest(soid, p->first,
3544 refcount_t::DECREMENT_REF, NULL, std::nullopt);
3545 dec_ref_count++;
3546 }
3547 }
3548 }
3549
3550
3551 void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
3552 ObjectContextRef& _l, ObjectContextRef& _g)
3553 {
3554 const SnapSet& snapset = src_obc->ssc->snapset;
3555 const object_info_t& oi = src_obc->obs.oi;
3556
3557 auto get_context = [this, &oi, &snapset](auto iter)
3558 -> ObjectContextRef {
3559 hobject_t cid = oi.soid;
3560 cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3561 ObjectContextRef obc = get_object_context(cid, false, NULL);
3562 ceph_assert(obc);
3563 return obc;
3564 };
3565
3566 // check adjacent clones
3567 auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
3568
3569 // We *must* find the clone iff it's not head,
3570 // let s == snapset.clones.end() mean head
3571 ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
3572
3573 if (s != snapset.clones.begin()) {
3574 _l = get_context(s - 1);
3575 }
3576
3577 if (s != snapset.clones.end()) {
3578 _g = get_context(s + 1);
3579 }
3580 }
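
// Sketch of the neighbor lookup above: clones are kept sorted, the end()
// position stands for the head object, and the adjacent objects are simply
// the entries at s - 1 and s + 1 (with head one past the newest clone).
// Snap/HEAD are hypothetical stand-ins for snapid_t/CEPH_NOSNAP; assumes a
// non-empty clone list when called for head.
#if 0
#include <algorithm>
#include <optional>
#include <utility>
#include <vector>

using Snap = unsigned;
constexpr Snap HEAD = ~0u;

std::pair<std::optional<Snap>, std::optional<Snap>>
neighbors(const std::vector<Snap>& clones, Snap snap) {
  auto s = (snap == HEAD) ? clones.end()
                          : std::find(clones.begin(), clones.end(), snap);
  std::optional<Snap> older, newer;
  if (s != clones.begin())
    older = *(s - 1);                                  // next-older clone
  if (s != clones.end())
    newer = (s + 1 == clones.end()) ? HEAD : *(s + 1); // next-newer, or head
  return {older, newer};
}
#endif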
3581
3582 bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
3583 OSDOp& osd_op)
3584 {
3585 object_ref_delta_t refs;
3586 ObjectContextRef obc_l, obc_g;
3587 get_adjacent_clones(ctx->obc, obc_l, obc_g);
3588 set_chunk.calc_refs_to_inc_on_set(
3589 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
3590 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3591 refs);
3592 bool need_inc_ref = false;
3593 if (!refs.is_empty()) {
3594 ManifestOpRef mop(std::make_shared<ManifestOp>());
3595 for (auto c : set_chunk.chunk_map) {
3596 auto p = refs.find(c.second.oid);
3597 if (p == refs.end()) {
3598 continue;
3599 }
3600
3601 int inc_ref_count = p->second;
3602 if (inc_ref_count > 0) {
3603 /*
3604 * In the set-chunk case, the first thing we should do is increment
3605 * the reference count on the target object before updating
3606 * object_manifest in object_info_t, so call refcount_manifest directly.
3607 */
3608 auto target_oid = p->first;
3609 auto offset = c.first;
3610 auto length = c.second.length;
3611 auto* fin = new C_SetManifestRefCountDone(this, ctx->obs->oi.soid, offset);
3612 ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, target_oid,
3613 refcount_t::INCREMENT_REF, fin, std::nullopt);
3614 fin->tid = tid;
3615 mop->chunks[target_oid] = make_pair(offset, length);
3616 mop->num_chunks++;
3617 mop->tids[offset] = tid;
3618
3619 if (!ctx->obc->is_blocked()) {
3620 ctx->obc->start_block();
3621 }
3622 need_inc_ref = true;
3623 } else if (inc_ref_count < 0) {
3624 hobject_t src = ctx->obs->oi.soid;
3625 hobject_t tgt = p->first;
3626 ctx->register_on_commit(
3627 [src, tgt, this](){
3628 refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
3629 });
3630 }
3631 }
3632 if (mop->tids.size()) {
3633 mop->cb = new RefCountCallback(ctx, osd_op);
3634 manifest_ops[ctx->obs->oi.soid] = mop;
3635 manifest_ops[ctx->obs->oi.soid]->op = ctx->op;
3636 }
3637 }
3638
3639 return need_inc_ref;
3640 }
3641
3642 void PrimaryLogPG::update_chunk_map_by_dirty(OpContext* ctx) {
3643 /*
3644 * We should consider two cases here:
3645 * 1) plain modification: this created dirty regions, but didn't update chunk_map.
3646 * 2) rollback: in a rollback, head will be converted to the clone the rollback targets,
3647 * and the rollback has already updated chunk_map.
3648 * So, what we should do here is check whether chunk_map was updated and whether the clean_regions have dirty regions.
3649 * In the rollback case, chunk_map doesn't need to be cleared.
3650 */
3651 for (auto &p : ctx->obs->oi.manifest.chunk_map) {
3652 if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
3653 ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
3654 if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
3655 ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
3656 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
3657 ctx->delta_stats.num_objects_manifest--;
3658 }
3659 }
3660 }
3661 }
3662
3663 void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
3664 {
3665 object_ref_delta_t refs;
3666 ObjectContextRef cobc = nullptr;
3667 ObjectContextRef obc = ctx->obc;
3668 // Look at the previous snapshot, then figure out whether the updated chunk needs to be deleted
3669 cobc = get_prev_clone_obc(obc);
3670 obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
3671 cobc ? &cobc->obs.oi.manifest : nullptr,
3672 ctx->clean_regions,
3673 refs);
3674 if (!refs.is_empty()) {
3675 hobject_t soid = obc->obs.oi.soid;
3676 ctx->register_on_commit(
3677 [soid, this, refs](){
3678 dec_refcount(soid, refs);
3679 });
3680 }
3681 }
3682
3683 void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
3684 {
3685 ceph_assert(oi.has_manifest());
3686 ceph_assert(ctx->obc->ssc);
3687
3688 if (oi.manifest.is_chunked()) {
3689 object_ref_delta_t refs;
3690 ObjectContextRef obc_l, obc_g, obc;
3691 /* in trim_object, oi and ctx can have different oid */
3692 obc = get_object_context(oi.soid, false, NULL);
3693 ceph_assert(obc);
3694 get_adjacent_clones(obc, obc_l, obc_g);
3695 oi.manifest.calc_refs_to_drop_on_removal(
3696 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
3697 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3698 refs);
3699
3700 if (!refs.is_empty()) {
3701 /* dec_refcount will use head object anyway */
3702 hobject_t soid = ctx->obc->obs.oi.soid;
3703 ctx->register_on_commit(
3704 [soid, this, refs](){
3705 dec_refcount(soid, refs);
3706 });
3707 }
3708 } else if (oi.manifest.is_redirect() &&
3709 oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
3710 ctx->register_on_commit(
3711 [oi, this](){
3712 refcount_manifest(oi.soid, oi.manifest.redirect_target,
3713 refcount_t::DECREMENT_REF, NULL, std::nullopt);
3714 });
3715 }
3716 }
3717
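/*
 * Submit a cls_cas refcount operation against tgt_soid via the Objecter:
 *   INCREMENT_REF     -> cas.chunk_get_ref
 *   DECREMENT_REF     -> cas.chunk_put_ref
 *   CREATE_OR_GET_REF -> cas.chunk_create_or_get_ref (also carries the
 *                        chunk payload, so 'chunk' must be provided)
 * src_soid's head is recorded as the reference source in each call. An
 * optional callback is wrapped in C_OnFinisher so it completes on the
 * objecter finisher; the returned tid lets callers track or cancel the
 * in-flight mutation.
 */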
3718 ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
3719 Context *cb, std::optional<bufferlist> chunk)
3720 {
3721 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3722 CEPH_OSD_FLAG_RWORDERED;
3723
3724 dout(10) << __func__ << " Start refcount from " << src_soid
3725 << " to " << tgt_soid << dendl;
3726
3727 ObjectOperation obj_op;
3728 bufferlist in;
3729 if (type == refcount_t::INCREMENT_REF) {
3730 cls_cas_chunk_get_ref_op call;
3731 call.source = src_soid.get_head();
3732 ::encode(call, in);
3733 obj_op.call("cas", "chunk_get_ref", in);
3734 } else if (type == refcount_t::DECREMENT_REF) {
3735 cls_cas_chunk_put_ref_op call;
3736 call.source = src_soid.get_head();
3737 ::encode(call, in);
3738 obj_op.call("cas", "chunk_put_ref", in);
3739 } else if (type == refcount_t::CREATE_OR_GET_REF) {
3740 cls_cas_chunk_create_or_get_ref_op get_call;
3741 get_call.source = src_soid.get_head();
3742 ceph_assert(chunk);
3743 get_call.data = std::move(*chunk);
3744 ::encode(get_call, in);
3745 obj_op.call("cas", "chunk_create_or_get_ref", in);
3746 } else {
3747 ceph_assert(0 == "unrecognized type");
3748 }
3749
3750 Context *c = nullptr;
3751 if (cb) {
3752 c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
3753 }
3754
3755 object_locator_t oloc(tgt_soid);
3756 ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
3757 ceph_assert(src_obc);
3758 auto tid = osd->objecter->mutate(
3759 tgt_soid.oid, oloc, obj_op, SnapContext(),
3760 ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
3761 flags, c);
3762 return tid;
3763 }
3764
3765 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3766 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3767 uint64_t req_total_len, bool write_ordered)
3768 {
3769 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3770 object_manifest_t *manifest = &obc->obs.oi.manifest;
3771 if (!manifest->chunk_map.count(chunk_index)) {
3772 return;
3773 }
3774 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3775 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3776 hobject_t ori_soid = m->get_hobj();
3777 object_locator_t oloc(soid);
3778 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3779 if (write_ordered) {
3780 flags |= CEPH_OSD_FLAG_RWORDERED;
3781 }
3782
3783 if (!chunk_length || soid == hobject_t()) {
3784 return;
3785 }
3786
3787 /* same as do_proxy_read() */
3788 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3789 CEPH_OSD_FLAG_ORDERSNAP |
3790 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3791 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3792
3793 dout(10) << __func__ << " Starting chunk proxy read for " << *m
3794 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3795 << " req_length: " << req_length << dendl;
3796
3797 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3798
3799 ObjectOperation *pobj_op = new ObjectOperation;
3800 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3801
3802 if (chunk_index <= req_offset) {
3803 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3804 } else {
3805 ceph_abort_msg("chunk_index > req_offset");
3806 }
3807 osd_op.op.extent.length = req_length;
3808
3809 ObjectOperation obj_op;
3810 obj_op.dup(pobj_op->ops);
3811
3812 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3813 prdop);
3814 fin->obj_op = pobj_op;
3815 fin->op_index = op_index;
3816 fin->req_offset = req_offset;
3817 fin->obc = obc;
3818 fin->req_total_len = req_total_len;
3819
3820 ceph_tid_t tid = osd->objecter->read(
3821 soid.oid, oloc, obj_op,
3822 m->get_snapid(), NULL,
3823 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3824 &prdop->user_version,
3825 &prdop->data_offset,
3826 m->get_features());
3827 fin->tid = tid;
3828 prdop->objecter_tid = tid;
3829 proxyread_ops[tid] = prdop;
3830 in_progress_proxy_ops[ori_soid].push_back(op);
3831 }
3832
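/*
 * A chunked read can be proxied only when every op in the message is a
 * READ/SYNC_READ whose requested range is fully covered by chunk_map
 * entries that are still in the missing state; a range backed by an
 * already-present chunk, a gap in the map, or any non-read op makes this
 * return false.
 */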
3833 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3834 {
3835 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3836 OSDOp *osd_op = NULL;
3837 bool ret = true;
3838 for (unsigned int i = 0; i < m->ops.size(); i++) {
3839 osd_op = &m->ops[i];
3840 ceph_osd_op op = osd_op->op;
3841 switch (op.op) {
3842 case CEPH_OSD_OP_READ:
3843 case CEPH_OSD_OP_SYNC_READ: {
3844 uint64_t cursor = osd_op->op.extent.offset;
3845 uint64_t remain = osd_op->op.extent.length;
3846
3847 /* do the requested chunks exist in chunk_map? */
3848 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3849 if (p.first <= cursor && p.first + p.second.length > cursor) {
3850 if (!p.second.is_missing()) {
3851 return false;
3852 }
3853 if (p.second.length >= remain) {
3854 remain = 0;
3855 break;
3856 } else {
3857 remain = remain - p.second.length;
3858 }
3859 cursor += p.second.length;
3860 }
3861 }
3862
3863 if (remain) {
3864 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3865 return false;
3866 }
3867 continue;
3868 }
3869 default:
3870 return false;
3871 }
3872 }
3873 return ret;
3874 }
3875
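/*
 * Completion path for a proxied write: look up and retire the
 * ProxyWriteOp, drop the op from in_progress_proxy_ops, and, unless a
 * duplicate of the same op is still pending in that list, send the client
 * an ACK|ONDISK reply that claims the out-data of the proxied ops.
 */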
3876 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3877 {
3878 dout(10) << __func__ << " " << oid << " tid " << tid
3879 << " " << cpp_strerror(r) << dendl;
3880
3881 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3882 if (p == proxywrite_ops.end()) {
3883 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3884 return;
3885 }
3886 ProxyWriteOpRef pwop = p->second;
3887 ceph_assert(tid == pwop->objecter_tid);
3888 ceph_assert(oid == pwop->soid);
3889
3890 proxywrite_ops.erase(tid);
3891
3892 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3893 if (q == in_progress_proxy_ops.end()) {
3894 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3895 delete pwop->ctx;
3896 pwop->ctx = NULL;
3897 return;
3898 }
3899 list<OpRequestRef>& in_progress_op = q->second;
3900 ceph_assert(in_progress_op.size());
3901 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3902 in_progress_op.end(),
3903 pwop->op);
3904 ceph_assert(it != in_progress_op.end());
3905 in_progress_op.erase(it);
3906 if (in_progress_op.size() == 0) {
3907 in_progress_proxy_ops.erase(oid);
3908 } else if (std::find(in_progress_op.begin(),
3909 in_progress_op.end(),
3910 pwop->op) != in_progress_op.end()) {
3911 if (pwop->ctx)
3912 delete pwop->ctx;
3913 pwop->ctx = NULL;
3914 dout(20) << __func__ << " " << oid << " tid " << tid
3915 << " in_progress_op size: "
3916 << in_progress_op.size() << dendl;
3917 return;
3918 }
3919
3920 osd->logger->inc(l_osd_tier_proxy_write);
3921
3922 auto m = pwop->op->get_req<MOSDOp>();
3923 ceph_assert(m != NULL);
3924
3925 if (!pwop->sent_reply) {
3926 // send commit.
3927 ceph_assert(pwop->ctx->reply == nullptr);
3928 MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
3929 true /* we claim it below */);
3930 reply->set_reply_versions(eversion_t(), pwop->user_version);
3931 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3932 reply->claim_op_out_data(pwop->ops);
3933 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3934 osd->send_message_osd_client(reply, m->get_connection());
3935 pwop->sent_reply = true;
3936 pwop->ctx->op->mark_commit_sent();
3937 }
3938
3939 delete pwop->ctx;
3940 pwop->ctx = NULL;
3941 }
3942
3943 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3944 vector<ceph_tid_t> *tids)
3945 {
3946 dout(10) << __func__ << " " << pwop->soid << dendl;
3947 pwop->canceled = true;
3948
3949 // cancel objecter op, if we can
3950 if (pwop->objecter_tid) {
3951 tids->push_back(pwop->objecter_tid);
3952 delete pwop->ctx;
3953 pwop->ctx = NULL;
3954 proxywrite_ops.erase(pwop->objecter_tid);
3955 pwop->objecter_tid = 0;
3956 }
3957 }
3958
3959 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3960 ObjectContextRef obc;
3961 PrimaryLogPG *pg;
3962 utime_t start;
3963 public:
3964 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3965 : obc(obc_),
3966 pg(pg_),
3967 start(ceph_clock_now()) {}
3968
3969 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3970 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3971 int r = results.get<0>();
3972 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
3973 pg->finish_promote_manifest(r, results_data, obc);
3974 } else {
3975 pg->finish_promote(r, results_data, obc);
3976 }
3977 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3978 }
3979 };
3980
3981 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3982 ObjectContextRef obc;
3983 PrimaryLogPG *pg;
3984 utime_t start;
3985 PrimaryLogPG::OpContext *ctx;
3986 PrimaryLogPG::CopyCallbackResults promote_results;
3987 public:
3988 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx)
3989 : obc(obc_),
3990 pg(pg_),
3991 start(ceph_clock_now()), ctx(ctx) {}
3992
3993 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3994 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3995 int r = results.get<0>();
3996 promote_results = results;
3997 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
3998 ctx->user_at_version = results_data->user_version;
3999 }
4000 if (r >= 0) {
4001 ctx->pg->execute_ctx(ctx);
4002 } else {
4003 if (r != -ECANCELED) {
4004 if (ctx->op)
4005 ctx->pg->osd->reply_op_error(ctx->op, r);
4006 } else if (results_data->should_requeue) {
4007 if (ctx->op)
4008 ctx->pg->requeue_op(ctx->op);
4009 }
4010 ctx->pg->close_op_ctx(ctx);
4011 }
4012 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
4013 }
4014 friend struct PromoteFinisher;
4015 };
4016
4017 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
4018 PromoteManifestCallback *promote_callback;
4019
4020 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
4021 : promote_callback(promote_callback) {
4022 }
4023
4024 int execute() override {
4025 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
4026 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
4027 promote_callback->promote_results.get<1>(),
4028 promote_callback->obc);
4029 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
4030 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
4031 promote_callback->promote_results.get<1>(),
4032 promote_callback->obc);
4033 } else {
4034 ceph_abort_msg("unrecognized manifest type");
4035 }
4036 return 0;
4037 }
4038 };
4039
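/*
 * Kick off a promotion into this pool. Plain objects are copied from the
 * base tier (pool.info.tier_of); redirect manifests are copied from their
 * redirect_target; chunked manifests go through the manifest-aware
 * finish_promote_manifest() path. The op (if any) is parked while the
 * copy runs: it is delayed when scrub blocks writes to the head, and
 * otherwise waits on the blocked obc until the PromoteCallback completes.
 * DONTNEED is applied to the source read only when no proxy-reads are in
 * flight for the object.
 */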
4040 void PrimaryLogPG::promote_object(ObjectContextRef obc,
4041 const hobject_t& missing_oid,
4042 const object_locator_t& oloc,
4043 OpRequestRef op,
4044 ObjectContextRef *promote_obc)
4045 {
4046 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
4047 ceph_assert(hoid != hobject_t());
4048 if (m_scrubber->write_blocked_by_scrub(hoid)) {
4049 dout(10) << __func__ << " " << hoid
4050 << " blocked by scrub" << dendl;
4051 if (op) {
4052 waiting_for_scrub.push_back(op);
4053 op->mark_delayed("waiting for scrub");
4054 dout(10) << __func__ << " " << hoid
4055 << " placing op in waiting_for_scrub" << dendl;
4056 } else {
4057 dout(10) << __func__ << " " << hoid
4058 << " no op, dropping on the floor" << dendl;
4059 }
4060 return;
4061 }
4062 if (op && !check_laggy_requeue(op)) {
4063 return;
4064 }
4065 if (!obc) { // we need to create an ObjectContext
4066 ceph_assert(missing_oid != hobject_t());
4067 obc = get_object_context(missing_oid, true);
4068 }
4069 if (promote_obc)
4070 *promote_obc = obc;
4071
4072 /*
4073 * If there are proxy-reads in flight for the object before the promote
4074 * completes, we don't use DONTNEED.
4075 */
4076 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
4077 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
4078 if (q == in_progress_proxy_ops.end()) {
4079 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
4080 }
4081
4082 CopyCallback *cb;
4083 object_locator_t my_oloc;
4084 hobject_t src_hoid;
4085 if (!obc->obs.oi.has_manifest()) {
4086 my_oloc = oloc;
4087 my_oloc.pool = pool.info.tier_of;
4088 src_hoid = obc->obs.oi.soid;
4089 cb = new PromoteCallback(obc, this);
4090 } else {
4091 if (obc->obs.oi.manifest.is_chunked()) {
4092 src_hoid = obc->obs.oi.soid;
4093 cb = new PromoteCallback(obc, this);
4094 } else if (obc->obs.oi.manifest.is_redirect()) {
4095 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
4096 my_oloc = src_oloc;
4097 src_hoid = obc->obs.oi.manifest.redirect_target;
4098 cb = new PromoteCallback(obc, this);
4099 } else {
4100 ceph_abort_msg("unrecognized manifest type");
4101 }
4102 }
4103
4104 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
4105 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
4106 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
4107 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
4108 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
4109 obc->obs.oi.soid.snap == CEPH_NOSNAP,
4110 src_fadvise_flags, 0);
4111
4112 ceph_assert(obc->is_blocked());
4113
4114 if (op)
4115 wait_for_blocked_object(obc->obs.oi.soid, op);
4116
4117 recovery_state.update_stats(
4118 [](auto &history, auto &stats) {
4119 stats.stats.sum.num_promote++;
4120 return false;
4121 });
4122 }
4123
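/*
 * Drive a prepared OpContext to completion. This must stay idempotent:
 * finish_copyfrom() may re-run it, so the object state, transaction and
 * update_log_only flag are reset on entry. Reads and errors are answered
 * inline via complete_read_ctx(); log-only updates (e.g. write errors)
 * are recorded with record_write_error(); real writes pick a snap
 * context, take a new version, and are submitted as a RepGather through
 * issue_repop(), with the client reply deferred to the on-commit callback.
 */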
4124 void PrimaryLogPG::execute_ctx(OpContext *ctx)
4125 {
4126 FUNCTRACE(cct);
4127 dout(10) << __func__ << " " << ctx << dendl;
4128 ctx->reset_obs(ctx->obc);
4129 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
4130 OpRequestRef op = ctx->op;
4131 auto m = op->get_req<MOSDOp>();
4132 ObjectContextRef obc = ctx->obc;
4133 const hobject_t& soid = obc->obs.oi.soid;
4134
4135 // this method must be idempotent since we may call it several times
4136 // before we finally apply the resulting transaction.
4137 ctx->op_t.reset(new PGTransaction);
4138
4139 if (op->may_write() || op->may_cache()) {
4140 // snap
4141 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
4142 pool.info.is_pool_snaps_mode()) {
4143 // use pool's snapc
4144 ctx->snapc = pool.snapc;
4145 } else {
4146 // client specified snapc
4147 ctx->snapc.seq = m->get_snap_seq();
4148 ctx->snapc.snaps = m->get_snaps();
4149 filter_snapc(ctx->snapc.snaps);
4150 }
4151 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
4152 ctx->snapc.seq < obc->ssc->snapset.seq) {
4153 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
4154 << " < snapset seq " << obc->ssc->snapset.seq
4155 << " on " << obc->obs.oi.soid << dendl;
4156 reply_ctx(ctx, -EOLDSNAPC);
4157 return;
4158 }
4159
4160 // version
4161 ctx->at_version = get_next_version();
4162 ctx->mtime = m->get_mtime();
4163
4164 dout(10) << __func__ << " " << soid << " " << *ctx->ops
4165 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
4166 << " snapc " << ctx->snapc
4167 << " snapset " << obc->ssc->snapset
4168 << dendl;
4169 } else {
4170 dout(10) << __func__ << " " << soid << " " << *ctx->ops
4171 << " ov " << obc->obs.oi.version
4172 << dendl;
4173 }
4174
4175 if (!ctx->user_at_version)
4176 ctx->user_at_version = obc->obs.oi.user_version;
4177 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
4178
4179 {
4180 #ifdef WITH_LTTNG
4181 osd_reqid_t reqid = ctx->op->get_reqid();
4182 #endif
4183 tracepoint(osd, prepare_tx_enter, reqid.name._type,
4184 reqid.name._num, reqid.tid, reqid.inc);
4185 }
4186
4187 [[maybe_unused]] auto span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
4188
4189 int result = prepare_transaction(ctx);
4190
4191 {
4192 #ifdef WITH_LTTNG
4193 osd_reqid_t reqid = ctx->op->get_reqid();
4194 #endif
4195 tracepoint(osd, prepare_tx_exit, reqid.name._type,
4196 reqid.name._num, reqid.tid, reqid.inc);
4197 }
4198
4199 bool pending_async_reads = !ctx->pending_async_reads.empty();
4200 if (result == -EINPROGRESS || pending_async_reads) {
4201 // come back later.
4202 if (pending_async_reads) {
4203 ceph_assert(pool.info.is_erasure());
4204 in_progress_async_reads.push_back(make_pair(op, ctx));
4205 ctx->start_async_reads(this);
4206 }
4207 return;
4208 }
4209
4210 if (result == -EAGAIN) {
4211 // clean up after the ctx
4212 close_op_ctx(ctx);
4213 return;
4214 }
4215
4216 bool ignore_out_data = false;
4217 if (!ctx->op_t->empty() &&
4218 op->may_write() &&
4219 result >= 0) {
4220 // successful update
4221 if (ctx->op->allows_returnvec()) {
4222 // enforce reasonable bound on the return buffer sizes
4223 for (auto& i : *ctx->ops) {
4224 if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
4225 dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
4226 result = -EOVERFLOW; // overall result is overflow
4227 i.rval = -EOVERFLOW;
4228 i.outdata.clear();
4229 }
4230 }
4231 } else {
4232 // legacy behavior -- zero result and return data etc.
4233 ignore_out_data = true;
4234 result = 0;
4235 }
4236 }
4237
4238 // prepare the reply
4239 ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
4240 ignore_out_data);
4241 dout(20) << __func__ << " alloc reply " << ctx->reply
4242 << " result " << result << dendl;
4243
4244 // read or error?
4245 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
4246 // finish side-effects
4247 if (result >= 0)
4248 do_osd_op_effects(ctx, m->get_connection());
4249
4250 complete_read_ctx(result, ctx);
4251 return;
4252 }
4253
4254 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
4255
4256 ceph_assert(op->may_write() || op->may_cache());
4257
4258 // trim log?
4259 recovery_state.update_trim_to();
4260
4261 // verify that we are doing this in order?
4262 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
4263 !pool.info.is_tier() && !pool.info.has_tiers()) {
4264 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
4265 ceph_tid_t t = m->get_tid();
4266 client_t n = m->get_source().num();
4267 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
4268 if (p == cm.end()) {
4269 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
4270 cm[n] = t;
4271 } else {
4272 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
4273 if (p->second > t) {
4274 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
4275 ceph_abort_msg("out of order op");
4276 }
4277 p->second = t;
4278 }
4279 }
4280
4281 if (ctx->update_log_only) {
4282 if (result >= 0)
4283 do_osd_op_effects(ctx, m->get_connection());
4284
4285 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
4286 // save just what we need from ctx
4287 MOSDOpReply *reply = ctx->reply;
4288 ctx->reply = nullptr;
4289 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
4290
4291 if (result == -ENOENT) {
4292 reply->set_enoent_reply_versions(info.last_update,
4293 info.last_user_version);
4294 }
4295 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4296 // append to pg log for dup detection - don't save buffers for now
4297 record_write_error(op, soid, reply, result,
4298 ctx->op->allows_returnvec() ? ctx : nullptr);
4299 close_op_ctx(ctx);
4300 return;
4301 }
4302
4303 // no need to capture PG ref, repop cancel will handle that
4304 // Can capture the ctx by pointer, it's owned by the repop
4305 ctx->register_on_commit(
4306 [m, ctx, this](){
4307 if (ctx->op)
4308 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4309
4310 if (m && !ctx->sent_reply) {
4311 MOSDOpReply *reply = ctx->reply;
4312 ctx->reply = nullptr;
4313 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4314 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4315 osd->send_message_osd_client(reply, m->get_connection());
4316 ctx->sent_reply = true;
4317 ctx->op->mark_commit_sent();
4318 }
4319 });
4320 ctx->register_on_success(
4321 [ctx, this]() {
4322 do_osd_op_effects(
4323 ctx,
4324 ctx->op ? ctx->op->get_req()->get_connection() :
4325 ConnectionRef());
4326 });
4327 ctx->register_on_finish(
4328 [ctx]() {
4329 delete ctx;
4330 });
4331
4332 // issue replica writes
4333 ceph_tid_t rep_tid = osd->get_tid();
4334
4335 RepGather *repop = new_repop(ctx, rep_tid);
4336
4337 issue_repop(repop, ctx);
4338 eval_repop(repop);
4339 repop->put();
4340 }
4341
4342 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4343 release_object_locks(ctx->lock_manager);
4344
4345 ctx->op_t.reset();
4346
4347 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4348 ctx->on_finish.erase(p++)) {
4349 (*p)();
4350 }
4351 delete ctx;
4352 }
4353
4354 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4355 {
4356 if (ctx->op)
4357 osd->reply_op_error(ctx->op, r);
4358 close_op_ctx(ctx);
4359 }
4360
4361 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4362 const uint64_t inb,
4363 const uint64_t outb)
4364 {
4365 auto m = op.get_req<MOSDOp>();
4366 const utime_t now = ceph_clock_now();
4367
4368 const utime_t latency = now - m->get_recv_stamp();
4369 const utime_t process_latency = now - op.get_dequeued_time();
4370
4371 osd->logger->inc(l_osd_op);
4372
4373 osd->logger->inc(l_osd_op_outb, outb);
4374 osd->logger->inc(l_osd_op_inb, inb);
4375 osd->logger->tinc(l_osd_op_lat, latency);
4376 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4377
4378 if (op.may_read() && op.may_write()) {
4379 osd->logger->inc(l_osd_op_rw);
4380 osd->logger->inc(l_osd_op_rw_inb, inb);
4381 osd->logger->inc(l_osd_op_rw_outb, outb);
4382 osd->logger->tinc(l_osd_op_rw_lat, latency);
4383 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4384 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4385 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4386 } else if (op.may_read()) {
4387 osd->logger->inc(l_osd_op_r);
4388 osd->logger->inc(l_osd_op_r_outb, outb);
4389 osd->logger->tinc(l_osd_op_r_lat, latency);
4390 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4391 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4392 } else if (op.may_write() || op.may_cache()) {
4393 osd->logger->inc(l_osd_op_w);
4394 osd->logger->inc(l_osd_op_w_inb, inb);
4395 osd->logger->tinc(l_osd_op_w_lat, latency);
4396 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4397 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4398 } else {
4399 ceph_abort();
4400 }
4401
4402 dout(15) << "log_op_stats " << *m
4403 << " inb " << inb
4404 << " outb " << outb
4405 << " lat " << latency << dendl;
4406
4407 if (m_dynamic_perf_stats.is_enabled()) {
4408 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4409 }
4410 }
4411
4412 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4413 const std::list<OSDPerfMetricQuery> &queries)
4414 {
4415 m_dynamic_perf_stats.set_queries(queries);
4416 }
4417
4418 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4419 {
4420 std::swap(m_dynamic_perf_stats, *stats);
4421 }
4422
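/*
 * Handle MOSDPGScan traffic for backfill. OP_SCAN_GET_DIGEST runs on the
 * backfill target: unless the OSD is too full (which cancels backfill),
 * it scans a bounded range of local objects (osd_backfill_scan_min..max)
 * and replies with an OP_SCAN_DIGEST carrying the interval and its object
 * versions. OP_SCAN_DIGEST runs on the primary: it records the peer's
 * interval in peer_backfill_info and, once every backfill target has
 * answered, finishes the outstanding recovery op.
 */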
4423 void PrimaryLogPG::do_scan(
4424 OpRequestRef op,
4425 ThreadPool::TPHandle &handle)
4426 {
4427 auto m = op->get_req<MOSDPGScan>();
4428 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4429 dout(10) << "do_scan " << *m << dendl;
4430
4431 op->mark_started();
4432
4433 switch (m->op) {
4434 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4435 {
4436 auto dpp = get_dpp();
4437 if (osd->check_backfill_full(dpp)) {
4438 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4439 queue_peering_event(
4440 PGPeeringEventRef(
4441 std::make_shared<PGPeeringEvent>(
4442 get_osdmap_epoch(),
4443 get_osdmap_epoch(),
4444 PeeringState::BackfillTooFull())));
4445 return;
4446 }
4447
4448 BackfillInterval bi;
4449 bi.begin = m->begin;
4450 // No need to flush; there won't be any in-progress writes occurring
4451 // past m->begin
4452 scan_range(
4453 cct->_conf->osd_backfill_scan_min,
4454 cct->_conf->osd_backfill_scan_max,
4455 &bi,
4456 handle);
4457 MOSDPGScan *reply = new MOSDPGScan(
4458 MOSDPGScan::OP_SCAN_DIGEST,
4459 pg_whoami,
4460 get_osdmap_epoch(), m->query_epoch,
4461 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4462 encode(bi.objects, reply->get_data());
4463 osd->send_message_osd_cluster(reply, m->get_connection());
4464 }
4465 break;
4466
4467 case MOSDPGScan::OP_SCAN_DIGEST:
4468 {
4469 pg_shard_t from = m->from;
4470
4471 // Check that from is in backfill_targets vector
4472 ceph_assert(is_backfill_target(from));
4473
4474 BackfillInterval& bi = peer_backfill_info[from];
4475 bi.begin = m->begin;
4476 bi.end = m->end;
4477 auto p = m->get_data().cbegin();
4478
4479 // take care to preserve ordering!
4480 bi.clear_objects();
4481 decode_noclear(bi.objects, p);
4482 dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
4483 << " bi.objects.size()=" << bi.objects.size() << dendl;
4484
4485 if (waiting_on_backfill.erase(from)) {
4486 if (waiting_on_backfill.empty()) {
4487 ceph_assert(
4488 peer_backfill_info.size() ==
4489 get_backfill_targets().size());
4490 finish_recovery_op(hobject_t::get_max());
4491 }
4492 } else {
4493 // we canceled backfill for a while due to a too-full condition, and this
4494 // is an extra response from a non-too-full peer
4495 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4496 }
4497 }
4498 break;
4499 }
4500 }
4501
4502 void PrimaryLogPG::do_backfill(OpRequestRef op)
4503 {
4504 auto m = op->get_req<MOSDPGBackfill>();
4505 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4506 dout(10) << "do_backfill " << *m << dendl;
4507
4508 op->mark_started();
4509
4510 switch (m->op) {
4511 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4512 {
4513 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4514
4515 MOSDPGBackfill *reply = new MOSDPGBackfill(
4516 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4517 get_osdmap_epoch(),
4518 m->query_epoch,
4519 spg_t(info.pgid.pgid, get_primary().shard));
4520 reply->set_priority(get_recovery_op_priority());
4521 osd->send_message_osd_cluster(reply, m->get_connection());
4522 queue_peering_event(
4523 PGPeeringEventRef(
4524 std::make_shared<PGPeeringEvent>(
4525 get_osdmap_epoch(),
4526 get_osdmap_epoch(),
4527 RecoveryDone())));
4528 }
4529 // fall-thru
4530
4531 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4532 {
4533 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4534
4535 ObjectStore::Transaction t;
4536 recovery_state.update_backfill_progress(
4537 m->last_backfill,
4538 m->stats,
4539 m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
4540 t);
4541
4542 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4543 ceph_assert(tr == 0);
4544 }
4545 break;
4546
4547 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4548 {
4549 ceph_assert(is_primary());
4550 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4551 finish_recovery_op(hobject_t::get_max());
4552 }
4553 break;
4554 }
4555 }
4556
4557 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4558 {
4559 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4560 op->get_req());
4561 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4562 dout(7) << __func__ << " " << m->ls << dendl;
4563
4564 op->mark_started();
4565
4566 ObjectStore::Transaction t;
4567 for (auto& p : m->ls) {
4568 if (is_remote_backfilling()) {
4569 struct stat st;
4570 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4571 pg_whoami.shard), &st);
4572 if (r == 0) {
4573 sub_local_num_bytes(st.st_size);
4574 int64_t usersize;
4575 if (pool.info.is_erasure()) {
4576 bufferlist bv;
4577 int r = osd->store->getattr(
4578 ch,
4579 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4580 OI_ATTR,
4581 bv);
4582 if (r >= 0) {
4583 object_info_t oi(bv);
4584 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4585 } else {
4586 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4587 << " can't get object info" << dendl;
4588 usersize = 0;
4589 }
4590 } else {
4591 usersize = st.st_size;
4592 }
4593 sub_num_bytes(usersize);
4594 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4595 << " sub actual data by " << st.st_size
4596 << " sub num_bytes by " << usersize
4597 << dendl;
4598 }
4599 }
4600 remove_snap_mapped_object(t, p.first);
4601 }
4602 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4603 ceph_assert(r == 0);
4604 }
4605
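/*
 * Build (but do not submit) the transaction that trims snap_to_trim from
 * the clone coid. new_snaps is the clone's snap set minus the snap being
 * trimmed and anything already in the OSDMap's removed_snaps_queue. If
 * new_snaps ends up empty, the clone is deleted and its clone_overlap is
 * intersected into the next-older clone; otherwise the clone's
 * object_info and clone_snaps are rewritten in place. The head object is
 * then updated with the new SnapSet, or removed outright when no clones
 * remain and it is only a whiteout. Returns -ENOENT on missing or corrupt
 * snap metadata (with a clog error), -ENOLCK when the snaptrimmer write
 * locks cannot be taken, and 0 with *ctxp holding the assembled OpContext
 * on success.
 */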
4606 int PrimaryLogPG::trim_object(
4607 bool first, const hobject_t &coid, snapid_t snap_to_trim,
4608 PrimaryLogPG::OpContextUPtr *ctxp)
4609 {
4610 *ctxp = NULL;
4611
4612 // load clone info
4613 bufferlist bl;
4614 ObjectContextRef obc = get_object_context(coid, false, NULL);
4615 if (!obc || !obc->ssc || !obc->ssc->exists) {
4616 osd->clog->error() << __func__ << ": Cannot trim " << coid
4617 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4618 return -ENOENT;
4619 }
4620
4621 hobject_t head_oid = coid.get_head();
4622 ObjectContextRef head_obc = get_object_context(head_oid, false);
4623 if (!head_obc) {
4624 osd->clog->error() << __func__ << ": Cannot trim " << coid
4625 << " repair needed, no snapset obc for " << head_oid;
4626 return -ENOENT;
4627 }
4628
4629 SnapSet& snapset = obc->ssc->snapset;
4630
4631 object_info_t &coi = obc->obs.oi;
4632 auto citer = snapset.clone_snaps.find(coid.snap);
4633 if (citer == snapset.clone_snaps.end()) {
4634 osd->clog->error() << "No clone_snaps in snapset " << snapset
4635 << " for object " << coid << "\n";
4636 return -ENOENT;
4637 }
4638 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4639 if (old_snaps.empty()) {
4640 osd->clog->error() << "No object info snaps for object " << coid;
4641 return -ENOENT;
4642 }
4643
4644 dout(10) << coid << " old_snaps " << old_snaps
4645 << " old snapset " << snapset << dendl;
4646 if (snapset.seq == 0) {
4647 osd->clog->error() << "No snapset.seq for object " << coid;
4648 return -ENOENT;
4649 }
4650
4651 set<snapid_t> new_snaps;
4652 const OSDMapRef& osdmap = get_osdmap();
4653 for (set<snapid_t>::iterator i = old_snaps.begin();
4654 i != old_snaps.end();
4655 ++i) {
4656 if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4657 *i != snap_to_trim) {
4658 new_snaps.insert(*i);
4659 }
4660 }
4661
4662 vector<snapid_t>::iterator p = snapset.clones.end();
4663
4664 if (new_snaps.empty()) {
4665 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4666 if (p == snapset.clones.end()) {
4667 osd->clog->error() << "Snap " << coid.snap << " not in clones";
4668 return -ENOENT;
4669 }
4670 }
4671
4672 OpContextUPtr ctx = simple_opc_create(obc);
4673 ctx->head_obc = head_obc;
4674
4675 if (!ctx->lock_manager.get_snaptrimmer_write(
4676 coid,
4677 obc,
4678 first)) {
4679 close_op_ctx(ctx.release());
4680 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4681 return -ENOLCK;
4682 }
4683
4684 if (!ctx->lock_manager.get_snaptrimmer_write(
4685 head_oid,
4686 head_obc,
4687 first)) {
4688 close_op_ctx(ctx.release());
4689 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4690 return -ENOLCK;
4691 }
4692
4693 ctx->at_version = get_next_version();
4694
4695 PGTransaction *t = ctx->op_t.get();
4696
4697 if (new_snaps.empty()) {
4698 // remove clone
4699 dout(10) << coid << " snaps " << old_snaps << " -> "
4700 << new_snaps << " ... deleting" << dendl;
4701
4702 // ...from snapset
4703 ceph_assert(p != snapset.clones.end());
4704
4705 snapid_t last = coid.snap;
4706 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4707
4708 if (p != snapset.clones.begin()) {
4709 // not the oldest... merge overlap into next older clone
4710 vector<snapid_t>::iterator n = p - 1;
4711 hobject_t prev_coid = coid;
4712 prev_coid.snap = *n;
4713 bool adjust_prev_bytes = is_present_clone(prev_coid);
4714
4715 if (adjust_prev_bytes)
4716 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4717
4718 snapset.clone_overlap[*n].intersection_of(
4719 snapset.clone_overlap[*p]);
4720
4721 if (adjust_prev_bytes)
4722 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4723 }
4724 ctx->delta_stats.num_objects--;
4725 if (coi.is_dirty())
4726 ctx->delta_stats.num_objects_dirty--;
4727 if (coi.is_omap())
4728 ctx->delta_stats.num_objects_omap--;
4729 if (coi.is_whiteout()) {
4730 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4731 ctx->delta_stats.num_whiteouts--;
4732 }
4733 ctx->delta_stats.num_object_clones--;
4734 if (coi.is_cache_pinned())
4735 ctx->delta_stats.num_objects_pinned--;
4736 if (coi.has_manifest()) {
4737 dec_all_refcount_manifest(coi, ctx.get());
4738 ctx->delta_stats.num_objects_manifest--;
4739 }
4740 obc->obs.exists = false;
4741
4742 snapset.clones.erase(p);
4743 snapset.clone_overlap.erase(last);
4744 snapset.clone_size.erase(last);
4745 snapset.clone_snaps.erase(last);
4746
4747 ctx->log.push_back(
4748 pg_log_entry_t(
4749 pg_log_entry_t::DELETE,
4750 coid,
4751 ctx->at_version,
4752 ctx->obs->oi.version,
4753 0,
4754 osd_reqid_t(),
4755 ctx->mtime,
4756 0)
4757 );
4758 t->remove(coid);
4759 t->update_snaps(
4760 coid,
4761 old_snaps,
4762 new_snaps);
4763
4764 coi = object_info_t(coid);
4765
4766 ctx->at_version.version++;
4767 } else {
4768 // save adjusted snaps for this object
4769 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4770 snapset.clone_snaps[coid.snap] =
4771 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4772 // we still do a 'modify' event on this object just to trigger a
4773 // snapmapper.update ... :(
4774
4775 coi.prior_version = coi.version;
4776 coi.version = ctx->at_version;
4777 bl.clear();
4778 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4779 t->setattr(coid, OI_ATTR, bl);
4780
4781 ctx->log.push_back(
4782 pg_log_entry_t(
4783 pg_log_entry_t::MODIFY,
4784 coid,
4785 coi.version,
4786 coi.prior_version,
4787 0,
4788 osd_reqid_t(),
4789 ctx->mtime,
4790 0)
4791 );
4792 ctx->at_version.version++;
4793
4794 t->update_snaps(
4795 coid,
4796 old_snaps,
4797 new_snaps);
4798 }
4799
4800 // save head snapset
4801 dout(10) << coid << " new snapset " << snapset << " on "
4802 << head_obc->obs.oi << dendl;
4803 if (snapset.clones.empty() &&
4804 (head_obc->obs.oi.is_whiteout() &&
4805 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4806 !head_obc->obs.oi.is_cache_pinned())) {
4807 // NOTE: this arguably constitutes minor interference with the
4808 // tiering agent if this is a cache tier since a snap trim event
4809 // is effectively evicting a whiteout we might otherwise want to
4810 // keep around.
4811 dout(10) << coid << " removing " << head_oid << dendl;
4812 ctx->log.push_back(
4813 pg_log_entry_t(
4814 pg_log_entry_t::DELETE,
4815 head_oid,
4816 ctx->at_version,
4817 head_obc->obs.oi.version,
4818 0,
4819 osd_reqid_t(),
4820 ctx->mtime,
4821 0)
4822 );
4823 dout(10) << "removing snap head" << dendl;
4824 object_info_t& oi = head_obc->obs.oi;
4825 ctx->delta_stats.num_objects--;
4826 if (oi.is_dirty()) {
4827 ctx->delta_stats.num_objects_dirty--;
4828 }
4829 if (oi.is_omap())
4830 ctx->delta_stats.num_objects_omap--;
4831 if (oi.is_whiteout()) {
4832 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4833 ctx->delta_stats.num_whiteouts--;
4834 }
4835 if (oi.is_cache_pinned()) {
4836 ctx->delta_stats.num_objects_pinned--;
4837 }
4838 if (oi.has_manifest()) {
4839 ctx->delta_stats.num_objects_manifest--;
4840 dec_all_refcount_manifest(oi, ctx.get());
4841 }
4842 head_obc->obs.exists = false;
4843 head_obc->obs.oi = object_info_t(head_oid);
4844 t->remove(head_oid);
4845 } else {
4846 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4847 // filter SnapSet::snaps for the benefit of pre-octopus
4848 // peers. This is perhaps overly conservative in that I'm not
4849 // certain they need this, but let's be conservative here.
4850 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4851 snapset.filter(pool.info);
4852 } else {
4853 snapset.snaps.clear();
4854 }
4855 dout(10) << coid << " writing updated snapset on " << head_oid
4856 << ", snapset is " << snapset << dendl;
4857 ctx->log.push_back(
4858 pg_log_entry_t(
4859 pg_log_entry_t::MODIFY,
4860 head_oid,
4861 ctx->at_version,
4862 head_obc->obs.oi.version,
4863 0,
4864 osd_reqid_t(),
4865 ctx->mtime,
4866 0)
4867 );
4868
4869 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4870 head_obc->obs.oi.version = ctx->at_version;
4871
4872 map <string, bufferlist, less<>> attrs;
4873 bl.clear();
4874 encode(snapset, bl);
4875 attrs[SS_ATTR] = std::move(bl);
4876
4877 bl.clear();
4878 encode(head_obc->obs.oi, bl,
4879 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4880 attrs[OI_ATTR] = std::move(bl);
4881 t->setattrs(head_oid, attrs);
4882 }
4883
4884 *ctxp = std::move(ctx);
4885 return 0;
4886 }
4887
4888 void PrimaryLogPG::kick_snap_trim()
4889 {
4890 ceph_assert(is_active());
4891 ceph_assert(is_primary());
4892 if (is_clean() &&
4893 !state_test(PG_STATE_PREMERGE) &&
4894 !snap_trimq.empty()) {
4895 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4896 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4897 } else {
4898 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4899 snap_trimmer_machine.process_event(KickTrim());
4900 }
4901 }
4902 }
4903
4904 void PrimaryLogPG::snap_trimmer_scrub_complete()
4905 {
4906 if (is_primary() && is_active() && is_clean()) {
4907 ceph_assert(!snap_trimq.empty());
4908 snap_trimmer_machine.process_event(ScrubComplete());
4909 }
4910 }
4911
4912 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4913 {
4914 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
4915 return;
4916 }
4917
4918 ceph_assert(is_primary());
4919
4920 dout(10) << "snap_trimmer posting" << dendl;
4921 snap_trimmer_machine.process_event(DoSnapWork());
4922 dout(10) << "snap_trimmer complete" << dendl;
4923 return;
4924 }
4925
4926 namespace {
4927
4928 template<typename U, typename V>
4929 int do_cmp_xattr(int op, const U& lhs, const V& rhs)
4930 {
4931 switch (op) {
4932 case CEPH_OSD_CMPXATTR_OP_EQ:
4933 return lhs == rhs;
4934 case CEPH_OSD_CMPXATTR_OP_NE:
4935 return lhs != rhs;
4936 case CEPH_OSD_CMPXATTR_OP_GT:
4937 return lhs > rhs;
4938 case CEPH_OSD_CMPXATTR_OP_GTE:
4939 return lhs >= rhs;
4940 case CEPH_OSD_CMPXATTR_OP_LT:
4941 return lhs < rhs;
4942 case CEPH_OSD_CMPXATTR_OP_LTE:
4943 return lhs <= rhs;
4944 default:
4945 return -EINVAL;
4946 }
4947 }
4948
4949 } // anonymous namespace
4950
4951 int PrimaryLogPG::do_xattr_cmp_u64(int op, uint64_t v1, bufferlist& xattr)
4952 {
4953 uint64_t v2;
4954
4955 if (xattr.length()) {
4956 const char* first = xattr.c_str();
4957 if (auto [p, ec] = std::from_chars(first, first + xattr.length(), v2);
4958 ec != std::errc()) {
4959 return -EINVAL;
4960 }
4961 } else {
4962 v2 = 0;
4963 }
4964 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4965 return do_cmp_xattr(op, v1, v2);
4966 }
4967
4968 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4969 {
4970 string_view v2s(xattr.c_str(), xattr.length());
4971 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4972 return do_cmp_xattr(op, v1s, v2s);
4973 }
4974
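/*
 * Expand a WRITESAME into a plain CEPH_OSD_OP_WRITE: the pattern in
 * osd_op.indata (which must be exactly data_length bytes, with the total
 * length a multiple of it) is repeated until it covers the requested
 * extent, then handed to do_osd_ops(). For example, length = 6 with the
 * 2-byte pattern "ab" becomes a single 6-byte write of "ababab"; a
 * zero-length request is a no-op.
 */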
4975 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4976 {
4977 ceph_osd_op& op = osd_op.op;
4978 vector<OSDOp> write_ops(1);
4979 OSDOp& write_op = write_ops[0];
4980 uint64_t write_length = op.writesame.length;
4981 int result = 0;
4982
4983 if (!write_length)
4984 return 0;
4985
4986 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4987 return -EINVAL;
4988
4989 if (op.writesame.data_length != osd_op.indata.length()) {
4990 derr << "invalid writesame: data_length " << op.writesame.data_length << " != indata length " << osd_op.indata.length() << dendl;
4991 return -EINVAL;
4992 }
4993
4994 while (write_length) {
4995 write_op.indata.append(osd_op.indata);
4996 write_length -= op.writesame.data_length;
4997 }
4998
4999 write_op.op.op = CEPH_OSD_OP_WRITE;
5000 write_op.op.extent.offset = op.writesame.offset;
5001 write_op.op.extent.length = op.writesame.length;
5002 result = do_osd_ops(ctx, write_ops);
5003 if (result < 0)
5004 derr << "do_writesame do_osd_ops failed " << result << dendl;
5005
5006 return result;
5007 }
5008
5009 // ========================================================================
5010 // low level osd ops
5011
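/*
 * Convert a legacy TMAP object to OMAP in place: read the tmap header and
 * key/value payload with _get_tmap(), then replay them as TRUNCATE(0) +
 * OMAPSETHEADER + OMAPSETVALS through do_osd_ops(). With
 * CEPH_OSD_TMAP2OMAP_NULLOK, a missing tmap (-ENODATA) is treated as
 * success.
 */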
5012 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
5013 {
5014 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
5015 bufferlist header, vals;
5016 int r = _get_tmap(ctx, &header, &vals);
5017 if (r < 0) {
5018 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
5019 r = 0;
5020 return r;
5021 }
5022
5023 vector<OSDOp> ops(3);
5024
5025 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
5026 ops[0].op.extent.offset = 0;
5027 ops[0].op.extent.length = 0;
5028
5029 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
5030 ops[1].indata = std::move(header);
5031
5032 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
5033 ops[2].indata = std::move(vals);
5034
5035 return do_osd_ops(ctx, ops);
5036 }
5037
5038 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
5039 OSDOp& osd_op, bufferlist& bl)
5040 {
5041 // decode
5042 bufferlist header;
5043 map<string, bufferlist> m;
5044 if (bl.length()) {
5045 auto p = bl.cbegin();
5046 decode(header, p);
5047 decode(m, p);
5048 ceph_assert(p.end());
5049 }
5050
5051 // do the update(s)
5052 while (!bp.end()) {
5053 __u8 op;
5054 string key;
5055 decode(op, bp);
5056
5057 switch (op) {
5058 case CEPH_OSD_TMAP_SET: // insert key
5059 {
5060 decode(key, bp);
5061 bufferlist data;
5062 decode(data, bp);
5063 m[key] = data;
5064 }
5065 break;
5066 case CEPH_OSD_TMAP_RM: // remove key
5067 decode(key, bp);
5068 if (!m.count(key)) {
5069 return -ENOENT;
5070 }
5071 m.erase(key);
5072 break;
5073 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
5074 decode(key, bp);
5075 m.erase(key);
5076 break;
5077 case CEPH_OSD_TMAP_HDR: // update header
5078 {
5079 decode(header, bp);
5080 }
5081 break;
5082 default:
5083 return -EINVAL;
5084 }
5085 }
5086
5087 // reencode
5088 bufferlist obl;
5089 encode(header, obl);
5090 encode(m, obl);
5091
5092 // write it out
5093 vector<OSDOp> nops(1);
5094 OSDOp& newop = nops[0];
5095 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5096 newop.op.extent.offset = 0;
5097 newop.op.extent.length = obl.length();
5098 newop.indata = obl;
5099 do_osd_ops(ctx, nops);
5100 return 0;
5101 }
5102
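/*
 * Apply a TMAPUP update stream in a single merge pass: the existing
 * (sorted) tmap is read in full, then walked in lockstep with the update
 * ops, copying untouched keys through and applying SET/CREATE/RM/RMSLOPPY
 * as each key is reached. If the update keys turn out not to be sorted,
 * we rewind bp and fall back to do_tmapup_slow(), which decodes the whole
 * map, mutates it, and re-encodes. The result is written back with a
 * WRITEFULL.
 */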
5103 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
5104 {
5105 bufferlist::const_iterator orig_bp = bp;
5106 int result = 0;
5107 if (bp.end()) {
5108 dout(10) << "tmapup is a no-op" << dendl;
5109 } else {
5110 // read the whole object
5111 vector<OSDOp> nops(1);
5112 OSDOp& newop = nops[0];
5113 newop.op.op = CEPH_OSD_OP_READ;
5114 newop.op.extent.offset = 0;
5115 newop.op.extent.length = 0;
5116 result = do_osd_ops(ctx, nops);
5117
5118 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
5119
5120 dout(30) << " starting is \n";
5121 newop.outdata.hexdump(*_dout);
5122 *_dout << dendl;
5123
5124 auto ip = newop.outdata.cbegin();
5125 bufferlist obl;
5126
5127 dout(30) << "the update command is: \n";
5128 osd_op.indata.hexdump(*_dout);
5129 *_dout << dendl;
5130
5131 // header
5132 bufferlist header;
5133 __u32 nkeys = 0;
5134 if (newop.outdata.length()) {
5135 decode(header, ip);
5136 decode(nkeys, ip);
5137 }
5138 dout(10) << "tmapup header " << header.length() << dendl;
5139
5140 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
5141 ++bp;
5142 decode(header, bp);
5143 dout(10) << "tmapup new header " << header.length() << dendl;
5144 }
5145
5146 encode(header, obl);
5147
5148 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
5149
5150 // update keys
5151 bufferlist newkeydata;
5152 string nextkey, last_in_key;
5153 bufferlist nextval;
5154 bool have_next = false;
5155 if (!ip.end()) {
5156 have_next = true;
5157 decode(nextkey, ip);
5158 decode(nextval, ip);
5159 }
5160 while (!bp.end() && !result) {
5161 __u8 op;
5162 string key;
5163 try {
5164 decode(op, bp);
5165 decode(key, bp);
5166 }
5167 catch (ceph::buffer::error& e) {
5168 return -EINVAL;
5169 }
5170 if (key < last_in_key) {
5171 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
5172 << "', falling back to an inefficient (unsorted) update" << dendl;
5173 bp = orig_bp;
5174 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
5175 }
5176 last_in_key = key;
5177
5178 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
5179
5180 // skip existing intervening keys
5181 bool key_exists = false;
5182 while (have_next && !key_exists) {
5183 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
5184 if (nextkey > key)
5185 break;
5186 if (nextkey < key) {
5187 // copy untouched.
5188 encode(nextkey, newkeydata);
5189 encode(nextval, newkeydata);
5190 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5191 } else {
5192 // don't copy; discard old value. and stop.
5193 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
5194 key_exists = true;
5195 nkeys--;
5196 }
5197 if (!ip.end()) {
5198 decode(nextkey, ip);
5199 decode(nextval, ip);
5200 } else {
5201 have_next = false;
5202 }
5203 }
5204
5205 if (op == CEPH_OSD_TMAP_SET) {
5206 bufferlist val;
5207 try {
5208 decode(val, bp);
5209 }
5210 catch (ceph::buffer::error& e) {
5211 return -EINVAL;
5212 }
5213 encode(key, newkeydata);
5214 encode(val, newkeydata);
5215 dout(20) << " set " << key << " " << val.length() << dendl;
5216 nkeys++;
5217 } else if (op == CEPH_OSD_TMAP_CREATE) {
5218 if (key_exists) {
5219 return -EEXIST;
5220 }
5221 bufferlist val;
5222 try {
5223 decode(val, bp);
5224 }
5225 catch (ceph::buffer::error& e) {
5226 return -EINVAL;
5227 }
5228 encode(key, newkeydata);
5229 encode(val, newkeydata);
5230 dout(20) << " create " << key << " " << val.length() << dendl;
5231 nkeys++;
5232 } else if (op == CEPH_OSD_TMAP_RM) {
5233 // do nothing.
5234 if (!key_exists) {
5235 return -ENOENT;
5236 }
5237 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
5238 // do nothing
5239 } else {
5240 dout(10) << " invalid tmap op " << (int)op << dendl;
5241 return -EINVAL;
5242 }
5243 }
5244
5245 // copy remaining
5246 if (have_next) {
5247 encode(nextkey, newkeydata);
5248 encode(nextval, newkeydata);
5249 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5250 }
5251 if (!ip.end()) {
5252 bufferlist rest;
5253 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
5254 dout(20) << " keep trailing " << rest.length()
5255 << " at " << newkeydata.length() << dendl;
5256 newkeydata.claim_append(rest);
5257 }
5258
5259 // encode final key count + key data
5260 dout(20) << "tmapup final nkeys " << nkeys << dendl;
5261 encode(nkeys, obl);
5262 obl.claim_append(newkeydata);
5263
5264 if (0) {
5265 dout(30) << " final is \n";
5266 obl.hexdump(*_dout);
5267 *_dout << dendl;
5268
5269 // sanity check
5270 auto tp = obl.cbegin();
5271 bufferlist h;
5272 decode(h, tp);
5273 map<string,bufferlist> d;
5274 decode(d, tp);
5275 ceph_assert(tp.end());
5276 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
5277 }
5278
5279 // write it out
5280 if (!result) {
5281 dout(20) << "tmapput write " << obl.length() << dendl;
5282 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5283 newop.op.extent.offset = 0;
5284 newop.op.extent.length = obl.length();
5285 newop.indata = obl;
5286 do_osd_ops(ctx, nops);
5287 }
5288 }
5289 return result;
5290 }
5291
5292 static int check_offset_and_length(uint64_t offset, uint64_t length,
5293 uint64_t max, DoutPrefixProvider *dpp)
5294 {
5295 if (offset >= max ||
5296 length > max ||
5297 offset + length > max) {
5298 ldpp_dout(dpp, 10) << __func__ << " "
5299 << "osd_max_object_size: " << max
5300 << "; Hard limit of object size is 4GB." << dendl;
5301 return -EFBIG;
5302 }
5303
5304 return 0;
5305 }
5306
5307 struct FillInVerifyExtent : public Context {
5308 ceph_le64 *r;
5309 int32_t *rval;
5310 bufferlist *outdatap;
5311 std::optional<uint32_t> maybe_crc;
5312 uint64_t size;
5313 OSDService *osd;
5314 hobject_t soid;
5315 uint32_t flags;
5316 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5317 std::optional<uint32_t> mc, uint64_t size,
5318 OSDService *osd, hobject_t soid, uint32_t flags) :
5319 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5320 size(size), osd(osd), soid(soid), flags(flags) {}
5321 void finish(int len) override {
5322 if (len < 0) {
5323 *rval = len;
5324 return;
5325 }
5326 *r = len;
5327 *rval = 0;
5328
5329 // whole object? can we verify the checksum?
5330 if (maybe_crc && *r == size) {
5331 uint32_t crc = outdatap->crc32c(-1);
5332 if (maybe_crc != crc) {
5333 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5334 << " != expected 0x" << *maybe_crc
5335 << std::dec << " on " << soid;
5336 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5337 *rval = -EIO;
5338 *r = 0;
5339 }
5340 }
5341 }
5342 }
5343 };
5344
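/*
 * Completion adapter that rewraps a plain contiguous read in the
 * sparse-read wire format: on success the payload becomes a single-extent
 * map {data_offset: length} followed by the data itself.
 */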
5345 struct ToSparseReadResult : public Context {
5346 int* result;
5347 bufferlist* data_bl;
5348 uint64_t data_offset;
5349 ceph_le64* len;
5350 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5351 ceph_le64* len)
5352 : result(result), data_bl(bl), data_offset(offset),len(len) {}
5353 void finish(int r) override {
5354 if (r < 0) {
5355 *result = r;
5356 return;
5357 }
5358 *result = 0;
5359 *len = r;
5360 bufferlist outdata;
5361 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5362 encode(extents, outdata);
5363 encode_destructively(*data_bl, outdata);
5364 data_bl->swap(outdata);
5365 }
5366 };
5367
5368 template<typename V>
5369 static string list_keys(const map<string, V>& m) {
5370 string s;
5371 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5372 if (!s.empty()) {
5373 s.push_back(',');
5374 }
5375 s.append(itr->first);
5376 }
5377 return s;
5378 }
5379
5380 template<typename T>
5381 static string list_entries(const T& m) {
5382 string s;
5383 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5384 if (!s.empty()) {
5385 s.push_back(',');
5386 }
5387 s.append(*itr);
5388 }
5389 return s;
5390 }
5391
5392 void PrimaryLogPG::maybe_create_new_object(
5393 OpContext *ctx,
5394 bool ignore_transaction)
5395 {
5396 ObjectState& obs = ctx->new_obs;
5397 if (!obs.exists) {
5398 ctx->delta_stats.num_objects++;
5399 obs.exists = true;
5400 ceph_assert(!obs.oi.is_whiteout());
5401 obs.oi.new_object();
5402 if (!ignore_transaction)
5403 ctx->op_t->create(obs.oi.soid);
5404 } else if (obs.oi.is_whiteout()) {
5405 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5406 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5407 --ctx->delta_stats.num_whiteouts;
5408 }
5409 }
5410
5411 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5412 OSDOp& osd_op;
5413
5414 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5415 }
5416
5417 int execute() override {
5418 return osd_op.rval;
5419 }
5420 };
5421
5422 struct C_ChecksumRead : public Context {
5423 PrimaryLogPG *primary_log_pg;
5424 OSDOp &osd_op;
5425 Checksummer::CSumType csum_type;
5426 bufferlist init_value_bl;
5427 ceph_le64 read_length;
5428 bufferlist read_bl;
5429 Context *fill_extent_ctx;
5430
5431 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5432 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5433 std::optional<uint32_t> maybe_crc, uint64_t size,
5434 OSDService *osd, hobject_t soid, uint32_t flags)
5435 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5436 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5437 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5438 &read_bl, maybe_crc, size,
5439 osd, soid, flags)) {
5440 }
5441 ~C_ChecksumRead() override {
5442 delete fill_extent_ctx;
5443 }
5444
5445 void finish(int r) override {
5446 fill_extent_ctx->complete(r);
5447 fill_extent_ctx = nullptr;
5448
5449 if (osd_op.rval >= 0) {
5450 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5451 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5452 &init_value_bl_it, read_bl);
5453 }
5454 }
5455 };
5456
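/*
 * CHECKSUM handler: validate the chunk_size/length combination, default a
 * zeroed offset+length to "whole object", and trim the range to the
 * object size (the trimming branch manipulates op.extent, which overlays
 * the same offset/length fields as op.checksum in the ceph_osd_op union).
 * On erasure-coded pools the read is queued asynchronously and
 * finish_checksum() runs from C_ChecksumRead, returning -EINPROGRESS
 * here; replicated pools do a synchronous inline read instead. When the
 * read covers the entire object and a data digest is recorded, the digest
 * is passed down so FillInVerifyExtent can verify the data read.
 */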
5457 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5458 bufferlist::const_iterator *bl_it)
5459 {
5460 dout(20) << __func__ << dendl;
5461
5462 auto& op = osd_op.op;
5463 if (op.checksum.chunk_size > 0) {
5464 if (op.checksum.length == 0) {
5465 dout(10) << __func__ << ": length required when chunk size provided"
5466 << dendl;
5467 return -EINVAL;
5468 }
5469 if (op.checksum.length % op.checksum.chunk_size != 0) {
5470 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5471 return -EINVAL;
5472 }
5473 }
5474
5475 auto& oi = ctx->new_obs.oi;
5476 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5477 // zeroed offset+length implies checksum whole object
5478 op.checksum.length = oi.size;
5479 } else if (op.checksum.offset >= oi.size) {
5480 // read size was trimmed to zero, do nothing
5481 // see PrimaryLogPG::do_read
5482 return 0;
5483 } else if (op.extent.offset + op.extent.length > oi.size) {
5484 op.extent.length = oi.size - op.extent.offset;
5485 if (op.checksum.chunk_size > 0 &&
5486 op.checksum.length % op.checksum.chunk_size != 0) {
5487 dout(10) << __func__ << ": length (trimmed to 0x"
5488 << std::hex << op.checksum.length
5489 << ") not aligned to chunk size 0x"
5490 << op.checksum.chunk_size << std::dec
5491 << dendl;
5492 return -EINVAL;
5493 }
5494 }
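// Worked example of the trimming above: for an object of size 10, a request
// with offset=4 and length=16 is trimmed to length=6; if chunk_size=4 was
// also supplied, 6 % 4 != 0 and the trimmed request fails with -EINVAL.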
5495
5496 Checksummer::CSumType csum_type;
5497 switch (op.checksum.type) {
5498 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5499 csum_type = Checksummer::CSUM_XXHASH32;
5500 break;
5501 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5502 csum_type = Checksummer::CSUM_XXHASH64;
5503 break;
5504 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5505 csum_type = Checksummer::CSUM_CRC32C;
5506 break;
5507 default:
5508 dout(10) << __func__ << ": unknown crc type ("
5509 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5510 return -EINVAL;
5511 }
5512
5513 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5514 if (bl_it->get_remaining() < csum_init_value_size) {
5515 dout(10) << __func__ << ": init value not provided" << dendl;
5516 return -EINVAL;
5517 }
5518
5519 bufferlist init_value_bl;
5520 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5521 csum_init_value_size);
5522 *bl_it += csum_init_value_size;
5523
5524 if (pool.info.is_erasure() && op.checksum.length > 0) {
5525 // If there is a data digest and it is possible we are reading the
5526 // entire object, pass the digest.
5527 std::optional<uint32_t> maybe_crc;
5528 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5529 op.checksum.length >= oi.size) {
5530 maybe_crc = oi.data_digest;
5531 }
5532
5533 // async read
5534 auto& soid = oi.soid;
5535 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5536 std::move(init_value_bl), maybe_crc,
5537 oi.size, osd, soid, op.flags);
5538
5539 ctx->pending_async_reads.push_back({
5540 {op.checksum.offset, op.checksum.length, op.flags},
5541 {&checksum_ctx->read_bl, checksum_ctx}});
5542
5543 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5544 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5545 new ReadFinisher(osd_op));
5546 return -EINPROGRESS;
5547 }
5548
5549 // sync read
5550 std::vector<OSDOp> read_ops(1);
5551 auto& read_op = read_ops[0];
5552 if (op.checksum.length > 0) {
5553 read_op.op.op = CEPH_OSD_OP_READ;
5554 read_op.op.flags = op.flags;
5555 read_op.op.extent.offset = op.checksum.offset;
5556 read_op.op.extent.length = op.checksum.length;
5557 read_op.op.extent.truncate_size = 0;
5558 read_op.op.extent.truncate_seq = 0;
5559
5560 int r = do_osd_ops(ctx, read_ops);
5561 if (r < 0) {
5562 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5563 return r;
5564 }
5565 }
5566
5567 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5568 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5569 read_op.outdata);
5570 }
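// A rough client-side sketch of driving this op through the public librados
// C++ API (ioctx and oid are hypothetical; offset=length=0 requests a
// whole-object checksum, per the handling above):
//
//   librados::ObjectReadOperation rop;
//   ceph::bufferlist init_bl, csum_bl;
//   int rval = 0;
//   ceph::encode(uint32_t(-1), init_bl);              // CRC32C init value
//   rop.checksum(LIBRADOS_CHECKSUM_TYPE_CRC32C, init_bl,
//                0, 0, 0, &csum_bl, &rval);
//   ioctx.operate(oid, &rop, nullptr);
//
// With chunk_size == 0 the reply carries a single checksum; a non-zero
// chunk_size yields one checksum per chunk (see finish_checksum() below).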
5571
5572 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5573 Checksummer::CSumType csum_type,
5574 bufferlist::const_iterator *init_value_bl_it,
5575 const bufferlist &read_bl) {
5576 dout(20) << __func__ << dendl;
5577
5578 auto& op = osd_op.op;
5579
5580 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5581 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5582 << op.checksum.length << dendl;
5583 return -EINVAL;
5584 }
5585
5586 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5587 op.checksum.chunk_size : read_bl.length());
5588 uint32_t csum_count = (csum_chunk_size > 0 ?
5589 read_bl.length() / csum_chunk_size : 0);
5590
5591 bufferlist csum;
5592 bufferptr csum_data;
5593 if (csum_count > 0) {
5594 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5595 csum_data = ceph::buffer::create(csum_value_size * csum_count);
5596 csum_data.zero();
5597 csum.append(csum_data);
5598
5599 switch (csum_type) {
5600 case Checksummer::CSUM_XXHASH32:
5601 {
5602 Checksummer::xxhash32::init_value_t init_value;
5603 decode(init_value, *init_value_bl_it);
5604 Checksummer::calculate<Checksummer::xxhash32>(
5605 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5606 &csum_data);
5607 }
5608 break;
5609 case Checksummer::CSUM_XXHASH64:
5610 {
5611 Checksummer::xxhash64::init_value_t init_value;
5612 decode(init_value, *init_value_bl_it);
5613 Checksummer::calculate<Checksummer::xxhash64>(
5614 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5615 &csum_data);
5616 }
5617 break;
5618 case Checksummer::CSUM_CRC32C:
5619 {
5620 Checksummer::crc32c::init_value_t init_value;
5621 decode(init_value, *init_value_bl_it);
5622 Checksummer::calculate<Checksummer::crc32c>(
5623 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5624 &csum_data);
5625 }
5626 break;
5627 default:
5628 break;
5629 }
5630 }
5631
5632 encode(csum_count, osd_op.outdata);
5633 osd_op.outdata.claim_append(csum);
5634 return 0;
5635 }
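// The reply payload built above is a uint32_t count followed by `count`
// fixed-size checksum values. A client-side decode sketch (assuming CRC32C,
// i.e. 4-byte values):
//
//   auto it = csum_bl.cbegin();
//   uint32_t count;
//   ceph::decode(count, it);
//   std::vector<uint32_t> csums(count);
//   for (auto& c : csums) {
//     ceph::decode(c, it);
//   }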
5636
5637 struct C_ExtentCmpRead : public Context {
5638 PrimaryLogPG *primary_log_pg;
5639 OSDOp &osd_op;
5640 ceph_le64 read_length{};
5641 bufferlist read_bl;
5642 Context *fill_extent_ctx;
5643
5644 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5645 std::optional<uint32_t> maybe_crc, uint64_t size,
5646 OSDService *osd, hobject_t soid, uint32_t flags)
5647 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5648 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5649 &read_bl, maybe_crc, size,
5650 osd, soid, flags)) {
5651 }
5652 ~C_ExtentCmpRead() override {
5653 delete fill_extent_ctx;
5654 }
5655
5656 void finish(int r) override {
5657 if (r == -ENOENT) {
5658 osd_op.rval = 0;
5659 read_bl.clear();
5660 delete fill_extent_ctx;
5661 } else {
5662 fill_extent_ctx->complete(r);
5663 }
5664 fill_extent_ctx = nullptr;
5665
5666 if (osd_op.rval >= 0) {
5667 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5668 }
5669 }
5670 };
5671
5672 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5673 {
5674 dout(20) << __func__ << dendl;
5675 ceph_osd_op& op = osd_op.op;
5676
5677 auto& oi = ctx->new_obs.oi;
5678 uint64_t size = oi.size;
5679 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5680 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5681 size = op.extent.truncate_size;
5682 }
5683
5684 if (op.extent.offset >= size) {
5685 op.extent.length = 0;
5686 } else if (op.extent.offset + op.extent.length > size) {
5687 op.extent.length = size - op.extent.offset;
5688 }
5689
5690 if (op.extent.length == 0) {
5691 dout(20) << __func__ << " zero length extent" << dendl;
5692 return finish_extent_cmp(osd_op, bufferlist{});
5693 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5694 dout(20) << __func__ << " object DNE" << dendl;
5695 return finish_extent_cmp(osd_op, {});
5696 } else if (pool.info.is_erasure()) {
5697 // If there is a data digest and it is possible we are reading the
5698 // entire object, pass the digest.
5699 std::optional<uint32_t> maybe_crc;
5700 if (oi.is_data_digest() && op.extent.offset == 0 &&
5701 op.extent.length >= oi.size) {
5702 maybe_crc = oi.data_digest;
5703 }
5704
5705 // async read
5706 auto& soid = oi.soid;
5707 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5708 osd, soid, op.flags);
5709 ctx->pending_async_reads.push_back({
5710 {op.extent.offset, op.extent.length, op.flags},
5711 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5712
5713 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5714
5715 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5716 new ReadFinisher(osd_op));
5717 return -EINPROGRESS;
5718 }
5719
5720 // sync read
5721 vector<OSDOp> read_ops(1);
5722 OSDOp& read_op = read_ops[0];
5723
5724 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5725 read_op.op.extent.offset = op.extent.offset;
5726 read_op.op.extent.length = op.extent.length;
5727 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5728 read_op.op.extent.truncate_size = op.extent.truncate_size;
5729
5730 int result = do_osd_ops(ctx, read_ops);
5731 if (result < 0) {
5732 derr << __func__ << " failed " << result << dendl;
5733 return result;
5734 }
5735 return finish_extent_cmp(osd_op, read_op.outdata);
5736 }
5737
5738 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5739 {
5740 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5741 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5742 if (osd_op.indata[idx] != read_byte) {
5743 return (-MAX_ERRNO - idx);
5744 }
5745 }
5746
5747 return 0;
5748 }
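// On a mismatch the negative return value encodes the offset of the first
// differing byte: rval = -MAX_ERRNO - idx. A caller can recover the offset
// with
//
//   uint64_t mismatch_off = static_cast<uint64_t>(-rval) - MAX_ERRNO;
//
// e.g. comparing "abc" against on-disk "abd" yields rval = -MAX_ERRNO - 2.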
5749
5750 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5751 dout(20) << __func__ << dendl;
5752 auto& op = osd_op.op;
5753 auto& oi = ctx->new_obs.oi;
5754 auto& soid = oi.soid;
5755 __u32 seq = oi.truncate_seq;
5756 uint64_t size = oi.size;
5757 bool trimmed_read = false;
5758
5759 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5760 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5761 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5762 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5763
5764 // are we beyond truncate_size?
5765 if ( (seq < op.extent.truncate_seq) &&
5766 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5767 (size > op.extent.truncate_size) )
5768 size = op.extent.truncate_size;
5769
5770 if (op.extent.length == 0) // a length of zero means read the whole object
5771 op.extent.length = size;
5772
5773 if (op.extent.offset >= size) {
5774 op.extent.length = 0;
5775 trimmed_read = true;
5776 } else if (op.extent.offset + op.extent.length > size) {
5777 op.extent.length = size - op.extent.offset;
5778 trimmed_read = true;
5779 }
5780
5781 dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
5782
5783 // read into a buffer
5784 int result = 0;
5785 if (trimmed_read && op.extent.length == 0) {
5786 // the read was trimmed to zero bytes and is expected to do nothing.
5787 // an explicit read of 0 bytes does *not* do nothing, which is why
5788 // the trimmed_read flag is needed
5789 } else if (pool.info.is_erasure()) {
5790 // The initialisation below is required to silence a false positive
5791 // -Wmaybe-uninitialized warning
5792 std::optional<uint32_t> maybe_crc;
5793 // If there is a data digest and it is possible we are reading the
5794 // entire object, pass the digest. FillInVerifyExtent will
5795 // check the oi.size again.
5796 if (oi.is_data_digest() && op.extent.offset == 0 &&
5797 op.extent.length >= oi.size)
5798 maybe_crc = oi.data_digest;
5799 ctx->pending_async_reads.push_back(
5800 make_pair(
5801 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5802 make_pair(&osd_op.outdata,
5803 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5804 &osd_op.outdata, maybe_crc, oi.size,
5805 osd, soid, op.flags))));
5806 dout(10) << " async_read noted for " << soid << dendl;
5807
5808 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5809 new ReadFinisher(osd_op));
5810 } else {
5811 int r = pgbackend->objects_read_sync(
5812 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5813 // whole object? can we verify the checksum?
5814 if (r >= 0 && op.extent.offset == 0 &&
5815 (uint64_t)r == oi.size && oi.is_data_digest()) {
5816 uint32_t crc = osd_op.outdata.crc32c(-1);
5817 if (oi.data_digest != crc) {
5818 osd->clog->error() << info.pgid << std::hex
5819 << " full-object read crc 0x" << crc
5820 << " != expected 0x" << oi.data_digest
5821 << std::dec << " on " << soid;
5822 r = -EIO; // try repair later
5823 }
5824 }
5825 if (r == -EIO) {
5826 r = rep_repair_primary_object(soid, ctx);
5827 }
5828 if (r >= 0)
5829 op.extent.length = r;
5830 else if (r == -EAGAIN) {
5831 result = -EAGAIN;
5832 } else {
5833 result = r;
5834 op.extent.length = 0;
5835 }
5836 dout(10) << " read got " << r << " / " << op.extent.length
5837 << " bytes from obj " << soid << dendl;
5838 }
5839 if (result >= 0) {
5840 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5841 ctx->delta_stats.num_rd++;
5842 }
5843 return result;
5844 }
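// Trimming examples for the logic above: with oi.size = 100, a read of
// offset=80 length=50 is trimmed to length=20; a read at offset >= 100 is
// trimmed to zero and returns success with no data. Note the full-object
// fast path: when the whole object is returned from offset 0, the stored
// data digest (if any) is verified via outdata.crc32c(-1) and a mismatch
// triggers read repair.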
5845
5846 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5847 dout(20) << __func__ << dendl;
5848 auto& op = osd_op.op;
5849 auto& oi = ctx->new_obs.oi;
5850 auto& soid = oi.soid;
5851
5852 if (op.extent.truncate_seq) {
5853 dout(0) << "sparse_read does not support truncation sequence " << dendl;
5854 return -EINVAL;
5855 }
5856
5857 ++ctx->num_read;
5858 if (pool.info.is_erasure()) {
5859 // translate sparse read to a normal one if not supported
5860 uint64_t offset = op.extent.offset;
5861 uint64_t length = op.extent.length;
5862 if (offset > oi.size) {
5863 length = 0;
5864 } else if (offset + length > oi.size) {
5865 length = oi.size - offset;
5866 }
5867
5868 if (length > 0) {
5869 ctx->pending_async_reads.push_back(
5870 make_pair(
5871 boost::make_tuple(offset, length, op.flags),
5872 make_pair(
5873 &osd_op.outdata,
5874 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5875 &op.extent.length))));
5876 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5877
5878 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5879 new ReadFinisher(osd_op));
5880 } else {
5881 dout(10) << " sparse read ended up empty for " << soid << dendl;
5882 map<uint64_t, uint64_t> extents;
5883 encode(extents, osd_op.outdata);
5884 }
5885 } else {
5886 // read into a buffer
5887 map<uint64_t, uint64_t> m;
5888 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5889 info.pgid.shard),
5890 op.extent.offset, op.extent.length, m);
5891 if (r < 0) {
5892 return r;
5893 }
5894
5895 bufferlist data_bl;
5896 r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
5897 if (r == -EIO) {
5898 r = rep_repair_primary_object(soid, ctx);
5899 }
5900 if (r < 0) {
5901 return r;
5902 }
5903
5904 // Why does SPARSE_READ need a checksum? In practice librbd always uses
5905 // sparse-read. At first there may not be many whole objects, but with
5906 // continued use more and more objects become whole, so from that point
5907 // verifying the digest on full-object sparse reads makes sense.
5908 if ((uint64_t)r == oi.size && oi.is_data_digest()) {
5909 uint32_t crc = data_bl.crc32c(-1);
5910 if (oi.data_digest != crc) {
5911 osd->clog->error() << info.pgid << std::hex
5912 << " full-object read crc 0x" << crc
5913 << " != expected 0x" << oi.data_digest
5914 << std::dec << " on " << soid;
5915 r = rep_repair_primary_object(soid, ctx);
5916 if (r < 0) {
5917 return r;
5918 }
5919 }
5920 }
5921
5922 op.extent.length = r;
5923
5924 encode(m, osd_op.outdata); // re-encode since it might be modified
5925 ::encode_destructively(data_bl, osd_op.outdata);
5926
5927 dout(10) << " sparse_read got " << r << " bytes from object "
5928 << soid << dendl;
5929 }
5930
5931 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5932 ctx->delta_stats.num_rd++;
5933 return 0;
5934 }
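// The sparse-read reply is an encoded map<uint64_t, uint64_t> of
// {offset -> length} extents followed by the concatenated extent data.
// A client-side decode sketch over the returned payload:
//
//   auto it = reply_bl.cbegin();
//   std::map<uint64_t, uint64_t> extents;
//   ceph::bufferlist data;
//   ceph::decode(extents, it);
//   ceph::decode(data, it);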
5935
5936 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5937 {
5938 int result = 0;
5939 SnapSetContext *ssc = ctx->obc->ssc;
5940 ObjectState& obs = ctx->new_obs;
5941 object_info_t& oi = obs.oi;
5942 const hobject_t& soid = oi.soid;
5943 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5944 osd->osd_skip_data_digest;
5945
5946 PGTransaction* t = ctx->op_t.get();
5947
5948 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5949
5950 jspan span;
5951 if (ctx->op) {
5952 span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
5953 }
5954 ctx->current_osd_subop_num = 0;
5955 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5956 OSDOp& osd_op = *p;
5957 ceph_osd_op& op = osd_op.op;
5958
5959 OpFinisher* op_finisher = nullptr;
5960 {
5961 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5962 if (op_finisher_it != ctx->op_finishers.end()) {
5963 op_finisher = op_finisher_it->second.get();
5964 }
5965 }
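// Two-phase execution: a handler that needs an async read returns
// -EINPROGRESS after queueing the read and registering an OpFinisher under
// the current subop number. When the ops are re-executed after the read
// completes, the finisher found here short-circuits the handler and its
// execute() supplies the final result (see ReadFinisher above).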
5966
5967 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5968 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5969 // but the code in this function seems to treat them as native-endian. What should the
5970 // tracepoints do?
5971 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5972
5973 dout(10) << "do_osd_op " << osd_op << dendl;
5974
5975 auto bp = osd_op.indata.cbegin();
5976
5977 // user-visible modification?
5978 switch (op.op) {
5979 // non user-visible modifications
5980 case CEPH_OSD_OP_WATCH:
5981 case CEPH_OSD_OP_CACHE_EVICT:
5982 case CEPH_OSD_OP_CACHE_FLUSH:
5983 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5984 case CEPH_OSD_OP_UNDIRTY:
5985 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5986 case CEPH_OSD_OP_COPY_FROM2:
5987 case CEPH_OSD_OP_CACHE_PIN:
5988 case CEPH_OSD_OP_CACHE_UNPIN:
5989 case CEPH_OSD_OP_SET_REDIRECT:
5990 case CEPH_OSD_OP_SET_CHUNK:
5991 case CEPH_OSD_OP_TIER_PROMOTE:
5992 case CEPH_OSD_OP_TIER_FLUSH:
5993 case CEPH_OSD_OP_TIER_EVICT:
5994 break;
5995 default:
5996 if (op.op & CEPH_OSD_OP_MODE_WR)
5997 ctx->user_modify = true;
5998 }
5999
6000 // munge -1 truncate to 0 truncate
6001 if (ceph_osd_op_uses_extent(op.op) &&
6002 op.extent.truncate_seq == 1 &&
6003 op.extent.truncate_size == (-1ULL)) {
6004 op.extent.truncate_size = 0;
6005 op.extent.truncate_seq = 0;
6006 }
6007
6008 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
6009 if (op.op == CEPH_OSD_OP_ZERO &&
6010 obs.exists &&
6011 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
6012 op.extent.length >= 1 &&
6013 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
6014 op.extent.offset + op.extent.length >= oi.size) {
6015 if (op.extent.offset >= oi.size) {
6016 // no-op
6017 goto fail;
6018 }
6019 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
6020 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
6021 op.op = CEPH_OSD_OP_TRUNCATE;
6022 }
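// Munge example: with oi.size = 100, ZERO offset=40 length=60 covers the
// object's tail and becomes TRUNCATE(40); a ZERO starting at or beyond
// oi.size is skipped as a no-op.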
6023
6024 switch (op.op) {
6025
6026 // --- READS ---
6027
6028 case CEPH_OSD_OP_CMPEXT:
6029 ++ctx->num_read;
6030 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
6031 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6032 op.extent.length, op.extent.truncate_size,
6033 op.extent.truncate_seq);
6034
6035 if (op_finisher == nullptr) {
6036 result = do_extent_cmp(ctx, osd_op);
6037 } else {
6038 result = op_finisher->execute();
6039 }
6040 break;
6041
6042 case CEPH_OSD_OP_SYNC_READ:
6043 if (pool.info.is_erasure()) {
6044 result = -EOPNOTSUPP;
6045 break;
6046 }
6047 // fall through
6048 case CEPH_OSD_OP_READ:
6049 ++ctx->num_read;
6050 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
6051 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6052 op.extent.length, op.extent.truncate_size,
6053 op.extent.truncate_seq);
6054 if (op_finisher == nullptr) {
6055 if (!ctx->data_off) {
6056 ctx->data_off = op.extent.offset;
6057 }
6058 result = do_read(ctx, osd_op);
6059 } else {
6060 result = op_finisher->execute();
6061 }
6062 break;
6063
6064 case CEPH_OSD_OP_CHECKSUM:
6065 ++ctx->num_read;
6066 {
6067 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
6068 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
6069 op.checksum.offset, op.checksum.length,
6070 op.checksum.chunk_size);
6071
6072 if (op_finisher == nullptr) {
6073 result = do_checksum(ctx, osd_op, &bp);
6074 } else {
6075 result = op_finisher->execute();
6076 }
6077 }
6078 break;
6079
6080 /* map extents */
6081 case CEPH_OSD_OP_MAPEXT:
6082 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6083 if (pool.info.is_erasure()) {
6084 result = -EOPNOTSUPP;
6085 break;
6086 }
6087 ++ctx->num_read;
6088 {
6089 // read into a buffer
6090 bufferlist bl;
6091 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
6092 info.pgid.shard),
6093 op.extent.offset, op.extent.length, bl);
6094 osd_op.outdata = std::move(bl);
6095 if (r < 0)
6096 result = r;
6097 else
6098 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); // bl was moved-from above
6099 ctx->delta_stats.num_rd++;
6100 dout(10) << " map_extents done on object " << soid << dendl;
6101 }
6102 break;
6103
6104 /* map extents */
6105 case CEPH_OSD_OP_SPARSE_READ:
6106 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
6107 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6108 op.extent.length, op.extent.truncate_size,
6109 op.extent.truncate_seq);
6110 if (op_finisher == nullptr) {
6111 result = do_sparse_read(ctx, osd_op);
6112 } else {
6113 result = op_finisher->execute();
6114 }
6115 break;
6116
6117 case CEPH_OSD_OP_CALL:
6118 {
6119 string cname, mname;
6120 bufferlist indata;
6121 try {
6122 bp.copy(op.cls.class_len, cname);
6123 bp.copy(op.cls.method_len, mname);
6124 bp.copy(op.cls.indata_len, indata);
6125 } catch (ceph::buffer::error& e) {
6126 dout(10) << "call unable to decode class + method + indata" << dendl;
6127 dout(30) << "in dump: ";
6128 osd_op.indata.hexdump(*_dout);
6129 *_dout << dendl;
6130 result = -EINVAL;
6131 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
6132 break;
6133 }
6134 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
6135
6136 ClassHandler::ClassData *cls;
6137 result = ClassHandler::get_instance().open_class(cname, &cls);
6138 ceph_assert(result == 0); // init_op_flags() already verified this works.
6139
6140 ClassHandler::ClassMethod *method = cls->get_method(mname);
6141 if (!method) {
6142 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
6143 result = -EOPNOTSUPP;
6144 break;
6145 }
6146
6147 int flags = method->get_flags();
6148 if (flags & CLS_METHOD_WR)
6149 ctx->user_modify = true;
6150
6151 bufferlist outdata;
6152 dout(10) << "call method " << cname << "." << mname << dendl;
6153 int prev_rd = ctx->num_read;
6154 int prev_wr = ctx->num_write;
6155 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
6156
6157 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
6158 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
6159 result = -EIO;
6160 break;
6161 }
6162 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
6163 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
6164 result = -EIO;
6165 break;
6166 }
6167
6168 dout(10) << "method called response length=" << outdata.length() << dendl;
6169 op.extent.length = outdata.length();
6170 osd_op.outdata.claim_append(outdata);
6171 dout(30) << "out dump: ";
6172 osd_op.outdata.hexdump(*_dout);
6173 *_dout << dendl;
6174 }
6175 break;
6176
6177 case CEPH_OSD_OP_STAT:
6178 // note: stat does not require RD
6179 {
6180 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6181
6182 if (obs.exists && !oi.is_whiteout()) {
6183 encode(oi.size, osd_op.outdata);
6184 encode(oi.mtime, osd_op.outdata);
6185 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6186 } else {
6187 result = -ENOENT;
6188 dout(10) << "stat oi object does not exist" << dendl;
6189 }
6190
6191 ctx->delta_stats.num_rd++;
6192 }
6193 break;
6194
6195 case CEPH_OSD_OP_ISDIRTY:
6196 ++ctx->num_read;
6197 {
6198 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6199 bool is_dirty = obs.oi.is_dirty();
6200 encode(is_dirty, osd_op.outdata);
6201 ctx->delta_stats.num_rd++;
6202 result = 0;
6203 }
6204 break;
6205
6206 case CEPH_OSD_OP_UNDIRTY:
6207 ++ctx->num_write;
6208 result = 0;
6209 {
6210 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6211 if (oi.is_dirty()) {
6212 ctx->undirty = true; // see make_writeable()
6213 ctx->modify = true;
6214 ctx->delta_stats.num_wr++;
6215 }
6216 }
6217 break;
6218
6219 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6220 ++ctx->num_write;
6221 result = 0;
6222 {
6223 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
6224 if (ctx->lock_type != RWState::RWNONE) {
6225 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6226 result = -EINVAL;
6227 break;
6228 }
6229 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6230 result = -EINVAL;
6231 break;
6232 }
6233 if (!obs.exists) {
6234 result = 0;
6235 break;
6236 }
6237 if (oi.is_cache_pinned()) {
6238 dout(10) << "cache-try-flush on a pinned object, consider unpinning it first" << dendl;
6239 result = -EPERM;
6240 break;
6241 }
6242 if (oi.is_dirty()) {
6243 result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
6244 if (result == -EINPROGRESS)
6245 result = -EAGAIN;
6246 } else {
6247 result = 0;
6248 }
6249 }
6250 break;
6251
6252 case CEPH_OSD_OP_CACHE_FLUSH:
6253 ++ctx->num_write;
6254 result = 0;
6255 {
6256 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
6257 if (ctx->lock_type == RWState::RWNONE) {
6258 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6259 result = -EINVAL;
6260 break;
6261 }
6262 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6263 result = -EINVAL;
6264 break;
6265 }
6266 if (!obs.exists) {
6267 result = 0;
6268 break;
6269 }
6270 if (oi.is_cache_pinned()) {
6271 dout(10) << "cache-flush on a pinned object, consider unpinning it first" << dendl;
6272 result = -EPERM;
6273 break;
6274 }
6275 hobject_t missing;
6276 if (oi.is_dirty()) {
6277 result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
6278 if (result == -EINPROGRESS)
6279 result = -EAGAIN;
6280 } else {
6281 result = 0;
6282 }
6283 // Check special return value which has set missing_return
6284 if (result == -ENOENT) {
6285 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
6286 ceph_assert(!missing.is_min());
6287 wait_for_unreadable_object(missing, ctx->op);
6288 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6289 result = -EAGAIN;
6290 }
6291 }
6292 break;
6293
6294 case CEPH_OSD_OP_CACHE_EVICT:
6295 ++ctx->num_write;
6296 result = 0;
6297 {
6298 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6299 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6300 result = -EINVAL;
6301 break;
6302 }
6303 if (!obs.exists) {
6304 result = 0;
6305 break;
6306 }
6307 if (oi.is_cache_pinned()) {
6308 dout(10) << "cache-evict on a pinned object, consider unpinning it first" << dendl;
6309 result = -EPERM;
6310 break;
6311 }
6312 if (oi.is_dirty()) {
6313 result = -EBUSY;
6314 break;
6315 }
6316 if (!oi.watchers.empty()) {
6317 result = -EBUSY;
6318 break;
6319 }
6320 if (soid.snap == CEPH_NOSNAP) {
6321 result = _verify_no_head_clones(soid, ssc->snapset);
6322 if (result < 0)
6323 break;
6324 }
6325 result = _delete_oid(ctx, true, false);
6326 if (result >= 0) {
6327 // mark that this is a cache eviction to avoid triggering normal
6328 // make_writeable() clone creation in finish_ctx()
6329 ctx->cache_operation = true;
6330 }
6331 osd->logger->inc(l_osd_tier_evict);
6332 }
6333 break;
6334
6335 case CEPH_OSD_OP_GETXATTR:
6336 ++ctx->num_read;
6337 {
6338 string aname;
6339 bp.copy(op.xattr.name_len, aname);
6340 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6341 string name = "_" + aname;
6342 int r = getattr_maybe_cache(
6343 ctx->obc,
6344 name,
6345 &(osd_op.outdata));
6346 if (r >= 0) {
6347 op.xattr.value_len = osd_op.outdata.length();
6348 result = 0;
6349 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6350 } else
6351 result = r;
6352
6353 ctx->delta_stats.num_rd++;
6354 }
6355 break;
6356
6357 case CEPH_OSD_OP_GETXATTRS:
6358 ++ctx->num_read;
6359 {
6360 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6361 map<string, bufferlist,less<>> out;
6362 result = getattrs_maybe_cache(
6363 ctx->obc,
6364 &out);
6365
6366 bufferlist bl;
6367 encode(out, bl);
6368 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6369 ctx->delta_stats.num_rd++;
6370 osd_op.outdata.claim_append(bl);
6371 }
6372 break;
6373
6374 case CEPH_OSD_OP_CMPXATTR:
6375 ++ctx->num_read;
6376 {
6377 string aname;
6378 bp.copy(op.xattr.name_len, aname);
6379 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6380 string name = "_" + aname;
6381 name[op.xattr.name_len + 1] = 0;
6382
6383 bufferlist xattr;
6384 result = getattr_maybe_cache(
6385 ctx->obc,
6386 name,
6387 &xattr);
6388 if (result < 0 && result != -EEXIST && result != -ENODATA)
6389 break;
6390
6391 ctx->delta_stats.num_rd++;
6392 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6393
6394 switch (op.xattr.cmp_mode) {
6395 case CEPH_OSD_CMPXATTR_MODE_STRING:
6396 {
6397 string val;
6398 bp.copy(op.xattr.value_len, val);
6399 val[op.xattr.value_len] = 0;
6400 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6401 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6402 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6403 }
6404 break;
6405
6406 case CEPH_OSD_CMPXATTR_MODE_U64:
6407 {
6408 uint64_t u64val;
6409 try {
6410 decode(u64val, bp);
6411 }
6412 catch (ceph::buffer::error& e) {
6413 result = -EINVAL;
6414 goto fail;
6415 }
6416 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6417 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6418 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6419 }
6420 break;
6421
6422 default:
6423 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6424 result = -EINVAL;
6425 }
6426
6427 if (!result) {
6428 dout(10) << "comparison returned false" << dendl;
6429 result = -ECANCELED;
6430 break;
6431 }
6432 if (result < 0) {
6433 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6434 break;
6435 }
6436
6437 dout(10) << "comparison returned true" << dendl;
6438 }
6439 break;
6440
6441 case CEPH_OSD_OP_ASSERT_VER:
6442 ++ctx->num_read;
6443 {
6444 uint64_t ver = op.assert_ver.ver;
6445 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6446 if (!ver)
6447 result = -EINVAL;
6448 else if (ver < oi.user_version)
6449 result = -ERANGE;
6450 else if (ver > oi.user_version)
6451 result = -EOVERFLOW;
6452 }
6453 break;
6454
6455 case CEPH_OSD_OP_LIST_WATCHERS:
6456 ++ctx->num_read;
6457 {
6458 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6459 obj_list_watch_response_t resp;
6460
6461 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6462 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6463 ++oi_iter) {
6464 dout(20) << "key cookie=" << oi_iter->first.first
6465 << " entity=" << oi_iter->first.second << " "
6466 << oi_iter->second << dendl;
6467 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6468 ceph_assert(oi_iter->first.second.is_client());
6469
6470 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6471 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6472 resp.entries.push_back(wi);
6473 }
6474
6475 resp.encode(osd_op.outdata, ctx->get_features());
6476 result = 0;
6477
6478 ctx->delta_stats.num_rd++;
6479 break;
6480 }
6481
6482 case CEPH_OSD_OP_LIST_SNAPS:
6483 ++ctx->num_read;
6484 {
6485 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6486 obj_list_snap_response_t resp;
6487
6488 if (!ssc) {
6489 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6490 }
6491 ceph_assert(ssc);
6492 dout(20) << " snapset " << ssc->snapset << dendl;
6493
6494 int clonecount = ssc->snapset.clones.size();
6495 clonecount++; // for head
6496 resp.clones.reserve(clonecount);
6497 for (auto clone_iter = ssc->snapset.clones.begin();
6498 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6499 clone_info ci;
6500 ci.cloneid = *clone_iter;
6501
6502 hobject_t clone_oid = soid;
6503 clone_oid.snap = *clone_iter;
6504
6505 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6506 if (p == ssc->snapset.clone_snaps.end()) {
6507 osd->clog->error() << "osd." << osd->whoami
6508 << ": inconsistent clone_snaps found for oid "
6509 << soid << " clone " << *clone_iter
6510 << " snapset " << ssc->snapset;
6511 result = -EINVAL;
6512 break;
6513 }
6514 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6515 ci.snaps.push_back(*q);
6516 }
6517
6518 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6519
6520 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6521 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6522 if (coi == ssc->snapset.clone_overlap.end()) {
6523 osd->clog->error() << "osd." << osd->whoami
6524 << ": inconsistent clone_overlap found for oid "
6525 << soid << " clone " << *clone_iter;
6526 result = -EINVAL;
6527 break;
6528 }
6529 const interval_set<uint64_t> &o = coi->second;
6530 ci.overlap.reserve(o.num_intervals());
6531 for (interval_set<uint64_t>::const_iterator r = o.begin();
6532 r != o.end(); ++r) {
6533 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6534 r.get_len()));
6535 }
6536
6537 map<snapid_t, uint64_t>::const_iterator si;
6538 si = ssc->snapset.clone_size.find(ci.cloneid);
6539 if (si == ssc->snapset.clone_size.end()) {
6540 osd->clog->error() << "osd." << osd->whoami
6541 << ": inconsistent clone_size found for oid "
6542 << soid << " clone " << *clone_iter;
6543 result = -EINVAL;
6544 break;
6545 }
6546 ci.size = si->second;
6547
6548 resp.clones.push_back(ci);
6549 }
6550 if (result < 0) {
6551 break;
6552 }
6553 if (!ctx->obc->obs.oi.is_whiteout()) {
6554 ceph_assert(obs.exists);
6555 clone_info ci;
6556 ci.cloneid = CEPH_NOSNAP;
6557
6558 // Size for HEAD is oi.size
6559 ci.size = oi.size;
6560
6561 resp.clones.push_back(ci);
6562 }
6563 resp.seq = ssc->snapset.seq;
6564
6565 resp.encode(osd_op.outdata);
6566 result = 0;
6567
6568 ctx->delta_stats.num_rd++;
6569 break;
6570 }
6571
6572 case CEPH_OSD_OP_NOTIFY:
6573 ++ctx->num_read;
6574 {
6575 uint32_t timeout;
6576 bufferlist bl;
6577
6578 try {
6579 uint32_t ver; // obsolete
6580 decode(ver, bp);
6581 decode(timeout, bp);
6582 decode(bl, bp);
6583 } catch (const ceph::buffer::error &e) {
6584 timeout = 0;
6585 }
6586 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6587 if (!timeout)
6588 timeout = cct->_conf->osd_default_notify_timeout;
6589
6590 notify_info_t n;
6591 n.timeout = timeout;
6592 n.notify_id = osd->get_next_id(get_osdmap_epoch());
6593 n.cookie = op.notify.cookie;
6594 n.bl = bl;
6595 ctx->notifies.push_back(n);
6596
6597 // return our unique notify id to the client
6598 encode(n.notify_id, osd_op.outdata);
6599 }
6600 break;
6601
6602 case CEPH_OSD_OP_NOTIFY_ACK:
6603 ++ctx->num_read;
6604 {
6605 try {
6606 uint64_t notify_id = 0;
6607 uint64_t watch_cookie = 0;
6608 decode(notify_id, bp);
6609 decode(watch_cookie, bp);
6610 bufferlist reply_bl;
6611 if (!bp.end()) {
6612 decode(reply_bl, bp);
6613 }
6614 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6615 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6616 ctx->notify_acks.push_back(ack);
6617 } catch (const ceph::buffer::error &e) {
6618 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6619 OpContext::NotifyAck ack(
6620 // op.watch.cookie is actually the notify_id for historical reasons
6621 op.watch.cookie
6622 );
6623 ctx->notify_acks.push_back(ack);
6624 }
6625 }
6626 break;
6627
6628 case CEPH_OSD_OP_SETALLOCHINT:
6629 ++ctx->num_write;
6630 result = 0;
6631 {
6632 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6633 maybe_create_new_object(ctx);
6634 oi.expected_object_size = op.alloc_hint.expected_object_size;
6635 oi.expected_write_size = op.alloc_hint.expected_write_size;
6636 oi.alloc_hint_flags = op.alloc_hint.flags;
6637 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6638 op.alloc_hint.expected_write_size,
6639 op.alloc_hint.flags);
6640 }
6641 break;
6642
6643
6644 // --- WRITES ---
6645
6646 // -- object data --
6647
6648 case CEPH_OSD_OP_WRITE:
6649 ++ctx->num_write;
6650 result = 0;
6651 { // write
6652 __u32 seq = oi.truncate_seq;
6653 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6654 if (op.extent.length != osd_op.indata.length()) {
6655 result = -EINVAL;
6656 break;
6657 }
6658
6659 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6660 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6661
6662 if (pool.info.requires_aligned_append() &&
6663 (op.extent.offset % pool.info.required_alignment() != 0)) {
6664 result = -EOPNOTSUPP;
6665 break;
6666 }
6667
6668 if (!obs.exists) {
6669 if (pool.info.requires_aligned_append() && op.extent.offset) {
6670 result = -EOPNOTSUPP;
6671 break;
6672 }
6673 } else if (op.extent.offset != oi.size &&
6674 pool.info.requires_aligned_append()) {
6675 result = -EOPNOTSUPP;
6676 break;
6677 }
6678
6679 if (seq && (seq > op.extent.truncate_seq) &&
6680 (op.extent.offset + op.extent.length > oi.size)) {
6681 // old write, arrived after trimtrunc
6682 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6683 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6684 << ", adjusting write length to " << op.extent.length << dendl;
6685 bufferlist t;
6686 t.substr_of(osd_op.indata, 0, op.extent.length);
6687 osd_op.indata.swap(t);
6688 }
6689 if (op.extent.truncate_seq > seq) {
6690 // write arrives before trimtrunc
6691 if (obs.exists && !oi.is_whiteout()) {
6692 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6693 << ", truncating to " << op.extent.truncate_size << dendl;
6694 t->truncate(soid, op.extent.truncate_size);
6695 oi.truncate_seq = op.extent.truncate_seq;
6696 oi.truncate_size = op.extent.truncate_size;
6697 if (oi.size > op.extent.truncate_size) {
6698 interval_set<uint64_t> trim;
6699 trim.insert(op.extent.truncate_size,
6700 oi.size - op.extent.truncate_size);
6701 ctx->modified_ranges.union_of(trim);
6702 ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
6703 oi.clear_data_digest();
6704 }
6705 if (op.extent.truncate_size != oi.size) {
6706 truncate_update_size_and_usage(ctx->delta_stats,
6707 oi,
6708 op.extent.truncate_size);
6709 }
6710 } else {
6711 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6712 << ", but object is new" << dendl;
6713 oi.truncate_seq = op.extent.truncate_seq;
6714 oi.truncate_size = op.extent.truncate_size;
6715 }
6716 }
6717 result = check_offset_and_length(
6718 op.extent.offset, op.extent.length,
6719 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6720 if (result < 0)
6721 break;
6722
6723 maybe_create_new_object(ctx);
6724
6725 if (op.extent.length == 0) {
6726 if (op.extent.offset > oi.size) {
6727 t->truncate(
6728 soid, op.extent.offset);
6729 truncate_update_size_and_usage(ctx->delta_stats, oi,
6730 op.extent.offset);
6731 } else {
6732 t->nop(soid);
6733 }
6734 } else {
6735 t->write(
6736 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6737 }
6738
6739 if (op.extent.offset == 0 && op.extent.length >= oi.size
6740 && !skip_data_digest) {
6741 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6742 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6743 if (skip_data_digest) {
6744 obs.oi.clear_data_digest();
6745 } else {
6746 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6747 }
6748 } else {
6749 obs.oi.clear_data_digest();
6750 }
6751 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6752 op.extent.offset, op.extent.length);
6753 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6754 dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
6755 }
6756 break;
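// truncate_seq bookkeeping example for the write path above: a trimtrunc
// carries a sequence number. A write tagged truncate_seq=1 that arrives
// after the object already applied a trimtrunc with seq=2 has the portion
// beyond the current size dropped; a write tagged truncate_seq=2 that
// arrives before that trimtrunc applies the truncate first, then writes.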
6757
6758 case CEPH_OSD_OP_WRITEFULL:
6759 ++ctx->num_write;
6760 result = 0;
6761 { // write full object
6762 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6763
6764 if (op.extent.length != osd_op.indata.length()) {
6765 result = -EINVAL;
6766 break;
6767 }
6768 result = check_offset_and_length(
6769 0, op.extent.length,
6770 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6771 if (result < 0)
6772 break;
6773
6774 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6775 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6776
6777 maybe_create_new_object(ctx);
6778 if (pool.info.is_erasure()) {
6779 t->truncate(soid, 0);
6780 } else if (obs.exists && op.extent.length < oi.size) {
6781 t->truncate(soid, op.extent.length);
6782 }
6783 if (op.extent.length) {
6784 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6785 }
6786 if (!skip_data_digest) {
6787 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6788 } else {
6789 obs.oi.clear_data_digest();
6790 }
6791 ctx->clean_regions.mark_data_region_dirty(0,
6792 std::max((uint64_t)op.extent.length, oi.size));
6793 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6794 0, op.extent.length, true);
6795 }
6796 break;
6797
6798 case CEPH_OSD_OP_WRITESAME:
6799 ++ctx->num_write;
6800 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6801 result = do_writesame(ctx, osd_op);
6802 break;
6803
6804 case CEPH_OSD_OP_ROLLBACK :
6805 ++ctx->num_write;
6806 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6807 result = _rollback_to(ctx, osd_op);
6808 break;
6809
6810 case CEPH_OSD_OP_ZERO:
6811 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6812 if (pool.info.requires_aligned_append()) {
6813 result = -EOPNOTSUPP;
6814 break;
6815 }
6816 ++ctx->num_write;
6817 { // zero
6818 result = check_offset_and_length(
6819 op.extent.offset, op.extent.length,
6820 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6821 if (result < 0)
6822 break;
6823
6824 if (op.extent.length && obs.exists && !oi.is_whiteout()) {
6825 t->zero(soid, op.extent.offset, op.extent.length);
6826 interval_set<uint64_t> ch;
6827 ch.insert(op.extent.offset, op.extent.length);
6828 ctx->modified_ranges.union_of(ch);
6829 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6830 ctx->delta_stats.num_wr++;
6831 oi.clear_data_digest();
6832 } else {
6833 // no-op
6834 }
6835 }
6836 break;
6837 case CEPH_OSD_OP_CREATE:
6838 ++ctx->num_write;
6839 result = 0;
6840 {
6841 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6842 if (obs.exists && !oi.is_whiteout() &&
6843 (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
6844 result = -EEXIST; /* this is an exclusive create */
6845 } else {
6846 if (osd_op.indata.length()) {
6847 auto p = osd_op.indata.cbegin();
6848 string category;
6849 try {
6850 decode(category, p);
6851 }
6852 catch (ceph::buffer::error& e) {
6853 result = -EINVAL;
6854 goto fail;
6855 }
6856 // category is no longer implemented.
6857 }
6858 maybe_create_new_object(ctx);
6859 t->nop(soid);
6860 }
6861 }
6862 break;
6863
6864 case CEPH_OSD_OP_TRIMTRUNC:
6865 op.extent.offset = op.extent.truncate_size;
6866 // fall through
6867
6868 case CEPH_OSD_OP_TRUNCATE:
6869 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6870 if (pool.info.requires_aligned_append()) {
6871 result = -EOPNOTSUPP;
6872 break;
6873 }
6874 ++ctx->num_write;
6875 result = 0;
6876 {
6877 // truncate
6878 if (!obs.exists || oi.is_whiteout()) {
6879 dout(10) << " object dne, truncate is a no-op" << dendl;
6880 break;
6881 }
6882
6883 result = check_offset_and_length(
6884 op.extent.offset, op.extent.length,
6885 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6886 if (result < 0)
6887 break;
6888
6889 if (op.extent.truncate_seq) {
6890 ceph_assert(op.extent.offset == op.extent.truncate_size);
6891 if (op.extent.truncate_seq <= oi.truncate_seq) {
6892 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6893 << ", no-op" << dendl;
6894 break; // old
6895 }
6896 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6897 << ", truncating" << dendl;
6898 oi.truncate_seq = op.extent.truncate_seq;
6899 oi.truncate_size = op.extent.truncate_size;
6900 }
6901
6902 maybe_create_new_object(ctx);
6903 t->truncate(soid, op.extent.offset);
6904 if (oi.size > op.extent.offset) {
6905 interval_set<uint64_t> trim;
6906 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6907 ctx->modified_ranges.union_of(trim);
6908 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6909 } else if (oi.size < op.extent.offset) {
6910 ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6911 }
6912 if (op.extent.offset != oi.size) {
6913 truncate_update_size_and_usage(ctx->delta_stats,
6914 oi,
6915 op.extent.offset);
6916 }
6917 ctx->delta_stats.num_wr++;
6918 // do not set exists, or we will break the ZERO -> TRUNCATE munging above.
6919
6920 oi.clear_data_digest();
6921 }
6922 break;
6923
6924 case CEPH_OSD_OP_DELETE:
6925 ++ctx->num_write;
6926 result = 0;
6927 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6928 {
6929 result = _delete_oid(ctx, false, ctx->ignore_cache);
6930 }
6931 break;
6932
6933 case CEPH_OSD_OP_WATCH:
6934 ++ctx->num_write;
6935 result = 0;
6936 {
6937 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6938 op.watch.cookie, op.watch.op);
6939 if (!obs.exists) {
6940 result = -ENOENT;
6941 break;
6942 }
6943 result = 0;
6944 uint64_t cookie = op.watch.cookie;
6945 entity_name_t entity = ctx->reqid.name;
6946 ObjectContextRef obc = ctx->obc;
6947
6948 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6949 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6950 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6951 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6952 dout(10) << "watch: peer_addr="
6953 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6954
6955 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6956 if (op.watch.timeout != 0) {
6957 timeout = op.watch.timeout;
6958 }
6959
6960 watch_info_t w(cookie, timeout,
6961 ctx->op->get_req()->get_connection()->get_peer_addr());
6962 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6963 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6964 if (oi.watchers.count(make_pair(cookie, entity))) {
6965 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6966 } else {
6967 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6968 oi.watchers[make_pair(cookie, entity)] = w;
6969 t->nop(soid); // make sure we update the object_info on disk!
6970 }
6971 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6972 ctx->watch_connects.push_back(make_pair(w, will_ping));
6973 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6974 if (!oi.watchers.count(make_pair(cookie, entity))) {
6975 result = -ENOTCONN;
6976 break;
6977 }
6978 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6979 ctx->watch_connects.push_back(make_pair(w, true));
6980 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6981 /* Note: WATCH with PING doesn't cause may_write() to return true,
6982 * so if there is nothing else in the transaction, this is going
6983 * to run do_osd_op_effects, but not write out a log entry */
6984 if (!oi.watchers.count(make_pair(cookie, entity))) {
6985 result = -ENOTCONN;
6986 break;
6987 }
6988 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6989 obc->watchers.find(make_pair(cookie, entity));
6990 if (p == obc->watchers.end() ||
6991 !p->second->is_connected()) {
6992 // client needs to reconnect
6993 result = -ETIMEDOUT;
6994 break;
6995 }
6996 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6997 p->second->got_ping(ceph_clock_now());
6998 result = 0;
6999 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
7000 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
7001 oi.watchers.find(make_pair(cookie, entity));
7002 if (oi_iter != oi.watchers.end()) {
7003 dout(10) << " removed watch " << oi_iter->second << " by "
7004 << entity << dendl;
7005 oi.watchers.erase(oi_iter);
7006 t->nop(soid); // update oi on disk
7007 ctx->watch_disconnects.push_back(
7008 watch_disconnect_t(cookie, entity, false));
7009 } else {
7010 dout(10) << " can't remove: no watch by " << entity << dendl;
7011 }
7012 }
7013 }
7014 break;
7015
7016 case CEPH_OSD_OP_CACHE_PIN:
7017 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
7018 if ((!pool.info.is_tier() ||
7019 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
7020 result = -EINVAL;
7021 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
7022 break;
7023 }
7024 ++ctx->num_write;
7025 result = 0;
7026 {
7027 if (!obs.exists || oi.is_whiteout()) {
7028 result = -ENOENT;
7029 break;
7030 }
7031
7032 if (!oi.is_cache_pinned()) {
7033 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
7034 ctx->modify = true;
7035 ctx->delta_stats.num_objects_pinned++;
7036 ctx->delta_stats.num_wr++;
7037 }
7038 }
7039 break;
7040
7041 case CEPH_OSD_OP_CACHE_UNPIN:
7042 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
7043 if ((!pool.info.is_tier() ||
7044 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
7045 result = -EINVAL;
7046 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
7047 break;
7048 }
7049 ++ctx->num_write;
7050 result = 0;
7051 {
7052 if (!obs.exists || oi.is_whiteout()) {
7053 result = -ENOENT;
7054 break;
7055 }
7056
7057 if (oi.is_cache_pinned()) {
7058 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
7059 ctx->modify = true;
7060 ctx->delta_stats.num_objects_pinned--;
7061 ctx->delta_stats.num_wr++;
7062 }
7063 }
7064 break;
7065
7066 case CEPH_OSD_OP_SET_REDIRECT:
7067 ++ctx->num_write;
7068 result = 0;
7069 {
7070 if (pool.info.is_tier()) {
7071 result = -EINVAL;
7072 break;
7073 }
7074 if (!obs.exists) {
7075 result = -ENOENT;
7076 break;
7077 }
7078 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7079 result = -EOPNOTSUPP;
7080 break;
7081 }
7082
7083 object_t target_name;
7084 object_locator_t target_oloc;
7085 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
7086 version_t target_version = op.copy_from.src_version;
7087 try {
7088 decode(target_name, bp);
7089 decode(target_oloc, bp);
7090 }
7091 catch (ceph::buffer::error& e) {
7092 result = -EINVAL;
7093 goto fail;
7094 }
7095 pg_t raw_pg;
7096 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
7097 hobject_t target(target_name, target_oloc.key, target_snapid,
7098 raw_pg.ps(), raw_pg.pool(),
7099 target_oloc.nspace);
7100 if (target == soid) {
7101 dout(20) << " set-redirect self is invalid" << dendl;
7102 result = -EINVAL;
7103 break;
7104 }
7105
7106 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
7107 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7108 if (has_reference) {
7109 result = -EINVAL;
7110 dout(5) << " the object is already a manifest " << dendl;
7111 break;
7112 }
7113 if (op_finisher == nullptr && need_reference) {
7114 // start
7115 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7116 new SetManifestFinisher(osd_op));
7117 ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
7118 auto* fin = new C_SetManifestRefCountDone(this, soid, 0);
7119 ceph_tid_t tid = refcount_manifest(soid, target,
7120 refcount_t::INCREMENT_REF, fin, std::nullopt);
7121 fin->tid = tid;
7122 mop->num_chunks++;
7123 mop->tids[0] = tid;
7124 manifest_ops[soid] = mop;
7125 ctx->obc->start_block();
7126 result = -EINPROGRESS;
7127 } else {
7128 // finish
7129 if (op_finisher) {
7130 result = op_finisher->execute();
7131 ceph_assert(result == 0);
7132 }
7133
7134 if (!oi.has_manifest() && !oi.manifest.is_redirect())
7135 ctx->delta_stats.num_objects_manifest++;
7136
7137 oi.set_flag(object_info_t::FLAG_MANIFEST);
7138 oi.manifest.redirect_target = target;
7139 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
7140 t->truncate(soid, 0);
7141 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7142 if (oi.is_omap() && pool.info.supports_omap()) {
7143 t->omap_clear(soid);
7144 obs.oi.clear_omap_digest();
7145 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7146 ctx->clean_regions.mark_omap_dirty();
7147 }
7148 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
7149 0, oi.size, false);
7150 ctx->delta_stats.num_bytes -= oi.size;
7151 oi.size = 0;
7152 oi.new_object();
7153 oi.user_version = target_version;
7154 ctx->user_at_version = target_version;
7155 /* rm_attrs */
7156 map<string,bufferlist,less<>> rmattrs;
7157 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
7158 if (result < 0) {
7159 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7160 return result;
7161 }
7162 map<string, bufferlist>::iterator iter;
7163 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
7164 const string& name = iter->first;
7165 t->rmattr(soid, name);
7166 }
7167 if (!has_reference && need_reference) {
7168 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7169 }
7170 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
7171 if (op_finisher) {
7172 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7173 }
7174 }
7175 }
7176
7177 break;
7178
7179 case CEPH_OSD_OP_SET_CHUNK:
7180 ++ctx->num_write;
7181 result = 0;
7182 {
7183 if (pool.info.is_tier()) {
7184 result = -EINVAL;
7185 break;
7186 }
7187 if (!obs.exists) {
7188 result = -ENOENT;
7189 break;
7190 }
7191 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7192 result = -EOPNOTSUPP;
7193 break;
7194 }
7195 if (oi.manifest.is_redirect()) {
7196 result = -EINVAL;
7197 goto fail;
7198 }
7199
7200 object_locator_t tgt_oloc;
7201 uint64_t src_offset, src_length, tgt_offset;
7202 object_t tgt_name;
7203 try {
7204 decode(src_offset, bp);
7205 decode(src_length, bp);
7206 decode(tgt_oloc, bp);
7207 decode(tgt_name, bp);
7208 decode(tgt_offset, bp);
7209 }
7210 catch (ceph::buffer::error& e) {
7211 result = -EINVAL;
7212 goto fail;
7213 }
7214
7215 if (!src_length) {
7216 result = -EINVAL;
7217 goto fail;
7218 }
7219 if (src_offset + src_length > oi.size) {
7220 result = -ERANGE;
7221 goto fail;
7222 }
7223 if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
7224 result = -EOPNOTSUPP;
7225 break;
7226 }
7227 if (pool.info.is_erasure()) {
7228 result = -EOPNOTSUPP;
7229 break;
7230 }
7231
7232 for (auto &p : oi.manifest.chunk_map) {
7233 interval_set<uint64_t> chunk;
7234 chunk.insert(p.first, p.second.length);
7235 if (chunk.intersects(src_offset, src_length)) {
7236 dout(20) << __func__ << " overlapped! offset: " << src_offset << " length: " << src_length
7237 << " chunk_info: " << p << dendl;
7238 result = -EOPNOTSUPP;
7239 goto fail;
7240 }
7241 }
7242
7243 pg_t raw_pg;
7244 chunk_info_t chunk_info;
7245 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7246 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7247 raw_pg.ps(), raw_pg.pool(),
7248 tgt_oloc.nspace);
7249 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
7250 (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
7251 if (has_reference) {
7252 result = -EINVAL;
7253 dout(5) << " the object is already a manifest " << dendl;
7254 break;
7255 }
7256 chunk_info.oid = target;
7257 chunk_info.offset = tgt_offset;
7258 chunk_info.length = src_length;
7259 if (op_finisher == nullptr) {
7260 // start
7261 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7262 new SetManifestFinisher(osd_op));
7263 object_manifest_t set_chunk;
7264 bool need_inc_ref = false;
7265 set_chunk.chunk_map[src_offset] = chunk_info;
7266 need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
7267 if (need_inc_ref) {
7268 result = -EINPROGRESS;
7269 break;
7270 }
7271 }
7272 if (op_finisher) {
7273 result = op_finisher->execute();
7274 ceph_assert(result == 0);
7275 }
7276
7277 oi.manifest.chunk_map[src_offset] = chunk_info;
7278 if (!oi.has_manifest() && !oi.manifest.is_chunked())
7279 ctx->delta_stats.num_objects_manifest++;
7280 oi.set_flag(object_info_t::FLAG_MANIFEST);
7281 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7282 if (!has_reference) {
7283 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7284 }
7285 ctx->modify = true;
7286 ctx->cache_operation = true;
7287
7288 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7289 << " chunk_info: " << chunk_info << dendl;
7290 if (op_finisher) {
7291 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7292 }
7293 }
7294
7295 break;
7296
7297 case CEPH_OSD_OP_TIER_PROMOTE:
7298 ++ctx->num_write;
7299 result = 0;
7300 {
7301 if (pool.info.is_tier()) {
7302 result = -EINVAL;
7303 break;
7304 }
7305 if (!obs.exists) {
7306 result = -ENOENT;
7307 break;
7308 }
7309 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7310 result = -EOPNOTSUPP;
7311 break;
7312 }
7313 if (!obs.oi.has_manifest()) {
7314 result = 0;
7315 break;
7316 }
7317
7318 if (op_finisher == nullptr) {
7319 PromoteManifestCallback *cb;
7320 object_locator_t my_oloc;
7321 hobject_t src_hoid;
7322
7323 if (obs.oi.manifest.is_chunked()) {
7324 src_hoid = obs.oi.soid;
7325 } else if (obs.oi.manifest.is_redirect()) {
7326 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7327 my_oloc = src_oloc;
7328 src_hoid = obs.oi.manifest.redirect_target;
7329 } else {
7330 ceph_abort_msg("unrecognized manifest type");
7331 }
7332 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7333 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7334 new PromoteFinisher(cb));
7335 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7336 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7337 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7338 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7339 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7340 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7341 obs.oi.soid.snap == CEPH_NOSNAP,
7342 src_fadvise_flags, 0);
7343
7344 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7345 result = -EINPROGRESS;
7346 } else {
7347 result = op_finisher->execute();
7348 ceph_assert(result == 0);
7349 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7350 }
7351 }
7352
7353 break;
7354
7355 case CEPH_OSD_OP_TIER_FLUSH:
7356 ++ctx->num_write;
7357 result = 0;
7358 {
7359 if (pool.info.is_tier()) {
7360 result = -EINVAL;
7361 break;
7362 }
7363 if (!obs.exists) {
7364 result = -ENOENT;
7365 break;
7366 }
7367 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7368 result = -EOPNOTSUPP;
7369 break;
7370 }
7371 if (!obs.oi.has_manifest()) {
7372 result = 0;
7373 break;
7374 }
7375
7376 if (oi.is_dirty()) {
7377 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7378 if (result == -EINPROGRESS)
7379 result = -EAGAIN;
7380 } else {
7381 result = 0;
7382 }
7383 }
7384
7385 break;
7386
7387 case CEPH_OSD_OP_TIER_EVICT:
7388 ++ctx->num_write;
7389 result = 0;
7390 {
7391 if (pool.info.is_tier()) {
7392 result = -EINVAL;
7393 break;
7394 }
7395 if (!obs.exists) {
7396 result = -ENOENT;
7397 break;
7398 }
7399 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7400 result = -EOPNOTSUPP;
7401 break;
7402 }
7403 if (!obs.oi.has_manifest()) {
7404 result = -EINVAL;
7405 break;
7406 }
7407
7408 // The chunks already hold references, so it is enough to mark them missing and punch holes locally
7409 for (auto &p : obs.oi.manifest.chunk_map) {
7410 p.second.set_flag(chunk_info_t::FLAG_MISSING);
7411 // punch hole
7412 t->zero(soid, p.first, p.second.length);
7413 }
7414 oi.clear_data_digest();
7415 ctx->delta_stats.num_wr++;
7416 ctx->cache_operation = true;
7417 osd->logger->inc(l_osd_tier_evict);
7418 }
7419
7420 break;
7421
7422 case CEPH_OSD_OP_UNSET_MANIFEST:
7423 ++ctx->num_write;
7424 result = 0;
7425 {
7426 if (pool.info.is_tier()) {
7427 result = -EINVAL;
7428 break;
7429 }
7430 if (!obs.exists) {
7431 result = -ENOENT;
7432 break;
7433 }
7434 if (!oi.has_manifest()) {
7435 result = -EOPNOTSUPP;
7436 break;
7437 }
7438 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7439 result = -EOPNOTSUPP;
7440 break;
7441 }
7442
7443 dec_all_refcount_manifest(oi, ctx);
7444
7445 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7446 oi.manifest = object_manifest_t();
7447 ctx->delta_stats.num_objects_manifest--;
7448 ctx->delta_stats.num_wr++;
7449 ctx->modify = true;
7450 }
7451
7452 break;
7453
7454 // -- object attrs --
7455
7456 case CEPH_OSD_OP_SETXATTR:
7457 ++ctx->num_write;
7458 result = 0;
7459 {
7460 if (cct->_conf->osd_max_attr_size > 0 &&
7461 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7462 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7463 result = -EFBIG;
7464 break;
7465 }
7466 unsigned max_name_len =
7467 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7468 cct->_conf->osd_max_attr_name_len);
7469 if (op.xattr.name_len > max_name_len) {
7470 result = -ENAMETOOLONG;
7471 break;
7472 }
7473 maybe_create_new_object(ctx);
7474 string aname;
7475 bp.copy(op.xattr.name_len, aname);
7476 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7477 string name = "_" + aname;
7478 bufferlist bl;
7479 bp.copy(op.xattr.value_len, bl);
7480 t->setattr(soid, name, bl);
7481 ctx->delta_stats.num_wr++;
7482 }
7483 break;
7484
7485 case CEPH_OSD_OP_RMXATTR:
7486 ++ctx->num_write;
7487 result = 0;
7488 {
7489 string aname;
7490 bp.copy(op.xattr.name_len, aname);
7491 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7492 if (!obs.exists || oi.is_whiteout()) {
7493 result = -ENOENT;
7494 break;
7495 }
7496 string name = "_" + aname;
7497 t->rmattr(soid, name);
7498 ctx->delta_stats.num_wr++;
7499 }
7500 break;
7501
7502
7503 // -- fancy writers --
7504 case CEPH_OSD_OP_APPEND:
7505 {
7506 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7507 // just do it inline; this works because we are happy to execute
7508 // fancy ops on replicas as well.
7509 vector<OSDOp> nops(1);
7510 OSDOp& newop = nops[0];
7511 newop.op.op = CEPH_OSD_OP_WRITE;
7512 newop.op.extent.offset = oi.size;
7513 newop.op.extent.length = op.extent.length;
7514 newop.op.extent.truncate_seq = oi.truncate_seq;
7515 newop.indata = osd_op.indata;
7516 result = do_osd_ops(ctx, nops);
7517 osd_op.outdata = std::move(newop.outdata);
7518 }
7519 break;
7520
7521 case CEPH_OSD_OP_STARTSYNC:
7522 result = 0;
7523 t->nop(soid);
7524 break;
7525
7526 // -- trivial map --
7527 case CEPH_OSD_OP_TMAPGET:
7528 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7529 if (pool.info.is_erasure()) {
7530 result = -EOPNOTSUPP;
7531 break;
7532 }
7533 {
7534 vector<OSDOp> nops(1);
7535 OSDOp& newop = nops[0];
7536 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7537 newop.op.extent.offset = 0;
7538 newop.op.extent.length = 0;
7539 result = do_osd_ops(ctx, nops);
7540 osd_op.outdata = std::move(newop.outdata);
7541 }
7542 break;
7543
7544 case CEPH_OSD_OP_TMAPPUT:
7545 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7546 if (pool.info.is_erasure()) {
7547 result = -EOPNOTSUPP;
7548 break;
7549 }
7550 {
7551 //_dout_lock.Lock();
7552 //osd_op.data.hexdump(*_dout);
7553 //_dout_lock.Unlock();
7554
7555 // verify sort order
7556 bool unsorted = false;
7557 if (true) {
7558 bufferlist header;
7559 decode(header, bp);
7560 uint32_t n;
7561 decode(n, bp);
7562 string last_key;
7563 while (n--) {
7564 string key;
7565 decode(key, bp);
7566 dout(10) << "tmapput key " << key << dendl;
7567 bufferlist val;
7568 decode(val, bp);
7569 if (key < last_key) {
7570 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7571 unsorted = true;
7572 break;
7573 }
7574 last_key = key;
7575 }
7576 }
7577
7578 // write it
7579 vector<OSDOp> nops(1);
7580 OSDOp& newop = nops[0];
7581 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7582 newop.op.extent.offset = 0;
7583 newop.op.extent.length = osd_op.indata.length();
7584 newop.indata = osd_op.indata;
7585
7586 if (unsorted) {
7587 bp = osd_op.indata.begin();
7588 bufferlist header;
7589 map<string, bufferlist> m;
7590 decode(header, bp);
7591 decode(m, bp);
7592 ceph_assert(bp.end());
7593 bufferlist newbl;
7594 encode(header, newbl);
7595 encode(m, newbl);
7596 newop.indata = newbl;
7597 }
7598 result = do_osd_ops(ctx, nops);
7599 ceph_assert(result == 0);
7600 }
7601 break;
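// The TMAPPUT payload verified above is, in order: a header bufferlist, a
// uint32_t entry count, then that many (key, value) pairs which must arrive
// sorted by key. A sketch of producing a well-formed payload (illustrative,
// assuming the standard encoders used in this file):
//
//   bufferlist header, payload;
//   std::map<std::string, bufferlist> kv;  // std::map iterates in key order
//   encode(header, payload);
//   encode(kv, payload);  // u32 count followed by key/value pairs
//
// The resort path above rebuilds exactly this layout when the pairs arrive
// unsorted.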
7602
7603 case CEPH_OSD_OP_TMAPUP:
7604 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7605 if (pool.info.is_erasure()) {
7606 result = -EOPNOTSUPP;
7607 break;
7608 }
7609 ++ctx->num_write;
7610 result = do_tmapup(ctx, bp, osd_op);
7611 break;
7612
7613 case CEPH_OSD_OP_TMAP2OMAP:
7614 ++ctx->num_write;
7615 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7616 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7617 break;
7618
7619 // OMAP Read ops
7620 case CEPH_OSD_OP_OMAPGETKEYS:
7621 ++ctx->num_read;
7622 {
7623 string start_after;
7624 uint64_t max_return;
7625 try {
7626 decode(start_after, bp);
7627 decode(max_return, bp);
7628 }
7629 catch (ceph::buffer::error& e) {
7630 result = -EINVAL;
7631 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7632 goto fail;
7633 }
7634 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7635 max_return = cct->_conf->osd_max_omap_entries_per_request;
7636 }
7637 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7638
7639 bufferlist bl;
7640 uint32_t num = 0;
7641 bool truncated = false;
7642 if (oi.is_omap()) {
7643 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7644 ch, ghobject_t(soid)
7645 );
7646 ceph_assert(iter);
7647 iter->upper_bound(start_after);
7648 for (num = 0; iter->valid(); ++num, iter->next()) {
7649 if (num >= max_return ||
7650 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7651 truncated = true;
7652 break;
7653 }
7654 encode(iter->key(), bl);
7655 }
7656 } // else return empty out_set
7657 encode(num, osd_op.outdata);
7658 osd_op.outdata.claim_append(bl);
7659 encode(truncated, osd_op.outdata);
7660 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7661 ctx->delta_stats.num_rd++;
7662 }
7663 break;
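// The OMAPGETKEYS reply assembled above is: u32 num, then num encoded keys,
// then a bool truncated flag. A client-side decode sketch (illustrative):
//
//   auto p = osd_op.outdata.cbegin();
//   uint32_t n;
//   decode(n, p);
//   for (uint32_t i = 0; i < n; ++i) { std::string k; decode(k, p); }
//   bool truncated;
//   decode(truncated, p);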
7664
7665 case CEPH_OSD_OP_OMAPGETVALS:
7666 ++ctx->num_read;
7667 {
7668 string start_after;
7669 uint64_t max_return;
7670 string filter_prefix;
7671 try {
7672 decode(start_after, bp);
7673 decode(max_return, bp);
7674 decode(filter_prefix, bp);
7675 }
7676 catch (ceph::buffer::error& e) {
7677 result = -EINVAL;
7678 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7679 goto fail;
7680 }
7681 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7682 max_return = cct->_conf->osd_max_omap_entries_per_request;
7683 }
7684 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7685
7686 uint32_t num = 0;
7687 bool truncated = false;
7688 bufferlist bl;
7689 if (oi.is_omap()) {
7690 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7691 ch, ghobject_t(soid)
7692 );
7693 if (!iter) {
7694 result = -ENOENT;
7695 goto fail;
7696 }
7697 iter->upper_bound(start_after);
7698 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7699 for (num = 0;
7700 iter->valid() &&
7701 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7702 ++num, iter->next()) {
7703 dout(20) << "Found key " << iter->key() << dendl;
7704 if (num >= max_return ||
7705 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7706 truncated = true;
7707 break;
7708 }
7709 encode(iter->key(), bl);
7710 encode(iter->value(), bl);
7711 }
7712 } // else return empty out_set
7713 encode(num, osd_op.outdata);
7714 osd_op.outdata.claim_append(bl);
7715 encode(truncated, osd_op.outdata);
7716 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7717 ctx->delta_stats.num_rd++;
7718 }
7719 break;
7720
7721 case CEPH_OSD_OP_OMAPGETHEADER:
7722 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7723 if (!oi.is_omap()) {
7724 // return empty header
7725 break;
7726 }
7727 ++ctx->num_read;
7728 {
7729 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7730 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7731 ctx->delta_stats.num_rd++;
7732 }
7733 break;
7734
7735 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7736 ++ctx->num_read;
7737 {
7738 set<string> keys_to_get;
7739 try {
7740 decode(keys_to_get, bp);
7741 }
7742 catch (ceph::buffer::error& e) {
7743 result = -EINVAL;
7744 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7745 goto fail;
7746 }
7747 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7748 map<string, bufferlist> out;
7749 if (oi.is_omap()) {
7750 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7751 } // else return empty omap entries
7752 encode(out, osd_op.outdata);
7753 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7754 ctx->delta_stats.num_rd++;
7755 }
7756 break;
7757
7758 case CEPH_OSD_OP_OMAP_CMP:
7759 ++ctx->num_read;
7760 {
7761 if (!obs.exists || oi.is_whiteout()) {
7762 result = -ENOENT;
7763 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7764 break;
7765 }
7766 map<string, pair<bufferlist, int> > assertions;
7767 try {
7768 decode(assertions, bp);
7769 }
7770 catch (ceph::buffer::error& e) {
7771 result = -EINVAL;
7772 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7773 goto fail;
7774 }
7775 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7776
7777 map<string, bufferlist> out;
7778
7779 if (oi.is_omap()) {
7780 set<string> to_get;
7781 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7782 i != assertions.end();
7783 ++i)
7784 to_get.insert(i->first);
7785 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7786 to_get, &out);
7787 if (r < 0) {
7788 result = r;
7789 break;
7790 }
7791 } // else leave out empty
7792
7793 // Should set num_rd_kb based on the encoded length of the map
7794 ctx->delta_stats.num_rd++;
7795
7796 int r = 0;
7797 bufferlist empty;
7798 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7799 i != assertions.end();
7800 ++i) {
7801 auto out_entry = out.find(i->first);
7802 bufferlist &bl = (out_entry != out.end()) ?
7803 out_entry->second : empty;
7804 switch (i->second.second) {
7805 case CEPH_OSD_CMPXATTR_OP_EQ:
7806 if (!(bl == i->second.first)) {
7807 r = -ECANCELED;
7808 }
7809 break;
7810 case CEPH_OSD_CMPXATTR_OP_LT:
7811 if (!(bl < i->second.first)) {
7812 r = -ECANCELED;
7813 }
7814 break;
7815 case CEPH_OSD_CMPXATTR_OP_GT:
7816 if (!(bl > i->second.first)) {
7817 r = -ECANCELED;
7818 }
7819 break;
7820 default:
7821 r = -EINVAL;
7822 break;
7823 }
7824 if (r < 0)
7825 break;
7826 }
7827 if (r < 0) {
7828 result = r;
7829 }
7830 }
7831 break;
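// OMAP_CMP semantics as implemented above: each assertion maps a key to
// (expected value, op); a key missing from the object compares as an empty
// bufferlist, EQ/LT/GT compare the stored value against the expected one,
// and the first failing assertion fails the whole op with -ECANCELED. For
// example, asserting EQ against an empty bufferlist succeeds only if the
// key is absent or stored with an empty value.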
7832
7833 // OMAP Write ops
7834 case CEPH_OSD_OP_OMAPSETVALS:
7835 if (!pool.info.supports_omap()) {
7836 result = -EOPNOTSUPP;
7837 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7838 break;
7839 }
7840 ++ctx->num_write;
7841 result = 0;
7842 {
7843 maybe_create_new_object(ctx);
7844 bufferlist to_set_bl;
7845 try {
7846 decode_str_str_map_to_bl(bp, &to_set_bl);
7847 }
7848 catch (ceph::buffer::error& e) {
7849 result = -EINVAL;
7850 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7851 goto fail;
7852 }
7853 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7854 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7855 dout(20) << "setting vals: " << dendl;
7856 map<string,bufferlist> to_set;
7857 bufferlist::const_iterator pt = to_set_bl.begin();
7858 decode(to_set, pt);
7859 for (map<string, bufferlist>::iterator i = to_set.begin();
7860 i != to_set.end();
7861 ++i) {
7862 dout(20) << "\t" << i->first << dendl;
7863 }
7864 }
7865 t->omap_setkeys(soid, to_set_bl);
7866 ctx->clean_regions.mark_omap_dirty();
7867 ctx->delta_stats.num_wr++;
7868 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7869 }
7870 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7871 obs.oi.clear_omap_digest();
7872 break;
7873
7874 case CEPH_OSD_OP_OMAPSETHEADER:
7875 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7876 if (!pool.info.supports_omap()) {
7877 result = -EOPNOTSUPP;
7878 break;
7879 }
7880 ++ctx->num_write;
7881 result = 0;
7882 {
7883 maybe_create_new_object(ctx);
7884 t->omap_setheader(soid, osd_op.indata);
7885 ctx->clean_regions.mark_omap_dirty();
7886 ctx->delta_stats.num_wr++;
7887 }
7888 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7889 obs.oi.clear_omap_digest();
7890 break;
7891
7892 case CEPH_OSD_OP_OMAPCLEAR:
7893 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7894 if (!pool.info.supports_omap()) {
7895 result = -EOPNOTSUPP;
7896 break;
7897 }
7898 ++ctx->num_write;
7899 result = 0;
7900 {
7901 if (!obs.exists || oi.is_whiteout()) {
7902 result = -ENOENT;
7903 break;
7904 }
7905 if (oi.is_omap()) {
7906 t->omap_clear(soid);
7907 ctx->clean_regions.mark_omap_dirty();
7908 ctx->delta_stats.num_wr++;
7909 obs.oi.clear_omap_digest();
7910 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7911 }
7912 }
7913 break;
7914
7915 case CEPH_OSD_OP_OMAPRMKEYS:
7916 if (!pool.info.supports_omap()) {
7917 result = -EOPNOTSUPP;
7918 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7919 break;
7920 }
7921 ++ctx->num_write;
7922 result = 0;
7923 {
7924 if (!obs.exists || oi.is_whiteout()) {
7925 result = -ENOENT;
7926 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7927 break;
7928 }
7929 bufferlist to_rm_bl;
7930 try {
7931 decode_str_set_to_bl(bp, &to_rm_bl);
7932 }
7933 catch (ceph::buffer::error& e) {
7934 result = -EINVAL;
7935 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7936 goto fail;
7937 }
7938 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7939 t->omap_rmkeys(soid, to_rm_bl);
7940 ctx->clean_regions.mark_omap_dirty();
7941 ctx->delta_stats.num_wr++;
7942 }
7943 obs.oi.clear_omap_digest();
7944 break;
7945
7946 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7947 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7948 if (!pool.info.supports_omap()) {
7949 result = -EOPNOTSUPP;
7950 break;
7951 }
7952 ++ctx->num_write;
7953 result = 0;
7954 {
7955 if (!obs.exists || oi.is_whiteout()) {
7956 result = -ENOENT;
7957 break;
7958 }
7959 std::string key_begin, key_end;
7960 try {
7961 decode(key_begin, bp);
7962 decode(key_end, bp);
7963 } catch (ceph::buffer::error& e) {
7964 result = -EINVAL;
7965 goto fail;
7966 }
7967 t->omap_rmkeyrange(soid, key_begin, key_end);
7968 ctx->delta_stats.num_wr++;
7969 }
7970 obs.oi.clear_omap_digest();
7971 break;
7972
7973 case CEPH_OSD_OP_COPY_GET:
7974 ++ctx->num_read;
7975 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7976 soid.snap.val);
7977 if (op_finisher == nullptr) {
7978 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7979 } else {
7980 result = op_finisher->execute();
7981 }
7982 break;
7983
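// COPY_FROM, like SET_CHUNK and TIER_PROMOTE above, is a two-phase op: the
// first pass registers an OpFinisher and returns -EINPROGRESS while the copy
// runs asynchronously; when the op is re-executed after the copy completes,
// the finisher applies the result and is erased so the op cannot run again.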
7984 case CEPH_OSD_OP_COPY_FROM:
7985 case CEPH_OSD_OP_COPY_FROM2:
7986 ++ctx->num_write;
7987 result = 0;
7988 {
7989 object_t src_name;
7990 object_locator_t src_oloc;
7991 uint32_t truncate_seq = 0;
7992 uint64_t truncate_size = 0;
7993 bool have_truncate = false;
7994 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7995 version_t src_version = op.copy_from.src_version;
7996
7997 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7998 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7999 dout(20) << "invalid copy-from2 flags 0x"
8000 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
8001 result = -EINVAL;
8002 break;
8003 }
8004 try {
8005 decode(src_name, bp);
8006 decode(src_oloc, bp);
8007 // check if client sent us truncate_seq and truncate_size
8008 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
8009 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
8010 decode(truncate_seq, bp);
8011 decode(truncate_size, bp);
8012 have_truncate = true;
8013 }
8014 }
8015 catch (ceph::buffer::error& e) {
8016 result = -EINVAL;
8017 tracepoint(osd,
8018 do_osd_op_pre_copy_from,
8019 soid.oid.name.c_str(),
8020 soid.snap.val,
8021 "???",
8022 0,
8023 "???",
8024 "???",
8025 0,
8026 src_snapid,
8027 src_version);
8028 goto fail;
8029 }
8030 tracepoint(osd,
8031 do_osd_op_pre_copy_from,
8032 soid.oid.name.c_str(),
8033 soid.snap.val,
8034 src_name.name.c_str(),
8035 src_oloc.pool,
8036 src_oloc.key.c_str(),
8037 src_oloc.nspace.c_str(),
8038 src_oloc.hash,
8039 src_snapid,
8040 src_version);
8041 if (op_finisher == nullptr) {
8042 // start
8043 pg_t raw_pg;
8044 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
8045 hobject_t src(src_name, src_oloc.key, src_snapid,
8046 raw_pg.ps(), raw_pg.pool(),
8047 src_oloc.nspace);
8048 if (src == soid) {
8049 dout(20) << " copy from self is invalid" << dendl;
8050 result = -EINVAL;
8051 break;
8052 }
8053 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
8054 if (have_truncate)
8055 cb->set_truncate(truncate_seq, truncate_size);
8056 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8057 new CopyFromFinisher(cb));
8058 start_copy(cb, ctx->obc, src, src_oloc, src_version,
8059 op.copy_from.flags,
8060 false,
8061 op.copy_from.src_fadvise_flags,
8062 op.flags);
8063 result = -EINPROGRESS;
8064 } else {
8065 // finish
8066 result = op_finisher->execute();
8067 ceph_assert(result == 0);
8068
8069 // COPY_FROM cannot be executed multiple times -- it must restart
8070 ctx->op_finishers.erase(ctx->current_osd_subop_num);
8071 }
8072 }
8073 break;
8074
8075 default:
8076 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
8077 dout(1) << "unrecognized osd op " << op.op
8078 << " " << ceph_osd_op_name(op.op)
8079 << dendl;
8080 result = -EOPNOTSUPP;
8081 }
8082
8083 fail:
8084 osd_op.rval = result;
8085 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
8086 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
8087 result != -EAGAIN && result != -EINPROGRESS)
8088 result = 0;
8089
8090 if (result < 0)
8091 break;
8092 }
8093 if (result < 0) {
8094 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
8095 }
8096 return result;
8097 }
8098
8099 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
8100 {
8101 if (ctx->new_obs.oi.size == 0) {
8102 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
8103 return -ENODATA;
8104 }
8105 vector<OSDOp> nops(1);
8106 OSDOp &newop = nops[0];
8107 newop.op.op = CEPH_OSD_OP_TMAPGET;
8108 do_osd_ops(ctx, nops);
8109 try {
8110 bufferlist::const_iterator i = newop.outdata.begin();
8111 decode(*header, i);
8112 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
8113 } catch (...) {
8114 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
8115 << dendl;
8116 return -EINVAL;
8117 }
8118 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
8119 << dendl;
8120 return 0;
8121 }
8122
8123 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
8124 const SnapSet& ss)
8125 {
8126 // verify that all clones have been evicted
8127 dout(20) << __func__ << " verifying clones are absent "
8128 << ss << dendl;
8129 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
8130 p != ss.clones.end();
8131 ++p) {
8132 hobject_t clone_oid = soid;
8133 clone_oid.snap = *p;
8134 if (is_missing_object(clone_oid))
8135 return -EBUSY;
8136 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
8137 if (clone_obc && clone_obc->obs.exists) {
8138 dout(10) << __func__ << " cannot evict head before clone "
8139 << clone_oid << dendl;
8140 return -EBUSY;
8141 }
8142 if (copy_ops.count(clone_oid)) {
8143 dout(10) << __func__ << " cannot evict head, pending promote on clone "
8144 << clone_oid << dendl;
8145 return -EBUSY;
8146 }
8147 }
8148 return 0;
8149 }
8150
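// Decide whether deleting an object really removes it or leaves a whiteout:
// a whiteout is preferred when the pool is a cache tier (unless no_whiteout
// or try_no_whiteout asks otherwise) and is forced whenever clones exist or
// the snap context implies one will be created, since the head cannot simply
// disappear out from under them. Returns -ENOENT if there is nothing to
// delete, 0 otherwise.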
8151 inline int PrimaryLogPG::_delete_oid(
8152 OpContext *ctx,
8153 bool no_whiteout, // no whiteouts, no matter what.
8154 bool try_no_whiteout) // try not to whiteout
8155 {
8156 SnapSet& snapset = ctx->new_snapset;
8157 ObjectState& obs = ctx->new_obs;
8158 object_info_t& oi = obs.oi;
8159 const hobject_t& soid = oi.soid;
8160 PGTransaction* t = ctx->op_t.get();
8161
8162 // cache: set whiteout on delete?
8163 bool whiteout = false;
8164 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
8165 && !no_whiteout
8166 && !try_no_whiteout) {
8167 whiteout = true;
8168 }
8169
8170 // in luminous or later, we can't delete the head if there are
8171 // clones. we trust the caller passing no_whiteout has already
8172 // verified they don't exist.
8173 if (!snapset.clones.empty() ||
8174 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
8175 if (no_whiteout) {
8176 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
8177 << dendl;
8178 } else {
8179 dout(20) << __func__ << " has or will have clones; will whiteout"
8180 << dendl;
8181 whiteout = true;
8182 }
8183 }
8184 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
8185 << " no_whiteout=" << (int)no_whiteout
8186 << " try_no_whiteout=" << (int)try_no_whiteout
8187 << dendl;
8188 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
8189 return -ENOENT;
8190
8191 t->remove(soid);
8192
8193 if (oi.size > 0) {
8194 interval_set<uint64_t> ch;
8195 ch.insert(0, oi.size);
8196 ctx->modified_ranges.union_of(ch);
8197 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
8198 }
8199
8200 ctx->clean_regions.mark_omap_dirty();
8201 ctx->delta_stats.num_wr++;
8202 if (soid.is_snap()) {
8203 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
8204 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
8205 } else {
8206 ctx->delta_stats.num_bytes -= oi.size;
8207 }
8208 oi.size = 0;
8209 oi.new_object();
8210
8211 // disconnect all watchers
8212 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
8213 oi.watchers.begin();
8214 p != oi.watchers.end();
8215 ++p) {
8216 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
8217 ctx->watch_disconnects.push_back(
8218 watch_disconnect_t(p->first.first, p->first.second, true));
8219 }
8220 oi.watchers.clear();
8221
8222 if (whiteout) {
8223 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
8224 oi.set_flag(object_info_t::FLAG_WHITEOUT);
8225 ctx->delta_stats.num_whiteouts++;
8226 t->create(soid);
8227 osd->logger->inc(l_osd_tier_whiteout);
8228 return 0;
8229 }
8230
8231 if (oi.has_manifest()) {
8232 ctx->delta_stats.num_objects_manifest--;
8233 dec_all_refcount_manifest(oi, ctx);
8234 }
8235
8236 // delete the head
8237 ctx->delta_stats.num_objects--;
8238 if (soid.is_snap())
8239 ctx->delta_stats.num_object_clones--;
8240 if (oi.is_whiteout()) {
8241 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
8242 ctx->delta_stats.num_whiteouts--;
8243 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8244 }
8245 if (oi.is_cache_pinned()) {
8246 ctx->delta_stats.num_objects_pinned--;
8247 }
8248 obs.exists = false;
8249 return 0;
8250 }
8251
8252 int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op)
8253 {
8254 ObjectState& obs = ctx->new_obs;
8255 object_info_t& oi = obs.oi;
8256 const hobject_t& soid = oi.soid;
8257 snapid_t snapid = (uint64_t)op.op.snap.snapid;
8258 hobject_t missing_oid;
8259
8260 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
8261
8262 ObjectContextRef rollback_to;
8263
8264 int ret = find_object_context(
8265 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
8266 soid.get_namespace()),
8267 &rollback_to, false, false, &missing_oid);
8268 if (ret == -EAGAIN) {
8269 /* clone must be missing */
8270 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
8271 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8272 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
8273 block_write_on_degraded_snap(missing_oid, ctx->op);
8274 return ret;
8275 }
8276 {
8277 ObjectContextRef promote_obc;
8278 cache_result_t tier_mode_result;
8279 if (obs.exists && obs.oi.has_manifest()) {
8280 /*
8281 * In the case of a manifest object, the object_info exists on the base tier at all times,
8282 * so promote_obc should be equal to rollback_to.
8283 */
8284 promote_obc = rollback_to;
8285 tier_mode_result =
8286 maybe_handle_manifest_detail(
8287 ctx->op,
8288 true,
8289 rollback_to);
8290 } else {
8291 tier_mode_result =
8292 maybe_handle_cache_detail(
8293 ctx->op,
8294 true,
8295 rollback_to,
8296 ret,
8297 missing_oid,
8298 true,
8299 false,
8300 &promote_obc);
8301 }
8302 switch (tier_mode_result) {
8303 case cache_result_t::NOOP:
8304 break;
8305 case cache_result_t::BLOCKED_PROMOTE:
8306 ceph_assert(promote_obc);
8307 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8308 return -EAGAIN;
8309 case cache_result_t::BLOCKED_FULL:
8310 block_write_on_full_cache(soid, ctx->op);
8311 return -EAGAIN;
8312 case cache_result_t::REPLIED_WITH_EAGAIN:
8313 ceph_abort_msg("this can't happen, no rollback on replica");
8314 default:
8315 ceph_abort_msg("must promote was set, other values are not valid");
8316 return -EAGAIN;
8317 }
8318 }
8319
8320 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8321 // there's no snapshot here, or there's no object.
8322 // if there's no snapshot, we delete the object; otherwise, do nothing.
8323 dout(20) << "_rollback_to deleting head on " << soid.oid
8324 << " because got ENOENT|whiteout on find_object_context" << dendl;
8325 if (ctx->obc->obs.oi.watchers.size()) {
8326 // Cannot delete an object with watchers
8327 ret = -EBUSY;
8328 } else {
8329 _delete_oid(ctx, false, false);
8330 ret = 0;
8331 }
8332 } else if (ret) {
8333 // ummm....huh? It *can't* return anything else at time of writing.
8334 ceph_abort_msg("unexpected error code in _rollback_to");
8335 } else { //we got our context, let's use it to do the rollback!
8336 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8337 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8338 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8339 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8340 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8341 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8342 ret = -EAGAIN;
8343 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8344 // rolling back to the head; we just need to clone it.
8345 ctx->modify = true;
8346 } else {
8347 if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
8348 /*
8349 * looking at the following case, the foo head needs the reference of chunk4 and chunk5
8350 * in case snap[1] is removed.
8351 *
8352 * Before rollback to snap[1]:
8353 *
8354 * foo snap[1]: [chunk4] [chunk5]
8355 * foo snap[0]: [ chunk2 ]
8356 * foo head : [chunk1] [chunk3]
8357 *
8358 * After:
8359 *
8360 * foo snap[1]: [chunk4] [chunk5]
8361 * foo snap[0]: [ chunk2 ]
8362 * foo head : [chunk4] [chunk5]
8363 *
8364 */
8365 OpFinisher* op_finisher = nullptr;
8366 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
8367 if (op_finisher_it != ctx->op_finishers.end()) {
8368 op_finisher = op_finisher_it->second.get();
8369 }
8370 if (!op_finisher) {
8371 bool need_inc_ref = inc_refcount_by_set(ctx, rollback_to->obs.oi.manifest, op);
8372 if (need_inc_ref) {
8373 ceph_assert(op_finisher_it == ctx->op_finishers.end());
8374 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8375 new SetManifestFinisher(op));
8376 return -EINPROGRESS;
8377 }
8378 } else {
8379 op_finisher->execute();
8380 ctx->op_finishers.erase(ctx->current_osd_subop_num);
8381 }
8382 }
8383 _do_rollback_to(ctx, rollback_to, op);
8384 }
8385 }
8386 return ret;
8387 }
8388
8389 void PrimaryLogPG::_do_rollback_to(OpContext *ctx, ObjectContextRef rollback_to,
8390 OSDOp& op)
8391 {
8392 SnapSet& snapset = ctx->new_snapset;
8393 ObjectState& obs = ctx->new_obs;
8394 object_info_t& oi = obs.oi;
8395 const hobject_t& soid = oi.soid;
8396 PGTransaction* t = ctx->op_t.get();
8397 snapid_t snapid = (uint64_t)op.op.snap.snapid;
8398 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8399
8400 /* 1) Delete current head
8401 * 2) Clone correct snapshot into head
8402 * 3) Calculate clone_overlaps by following overlaps
8403 * forward from rollback snapshot */
8404 dout(10) << "_do_rollback_to deleting " << soid.oid
8405 << " and rolling back to old snap" << dendl;
8406
8407 if (obs.exists) {
8408 t->remove(soid);
8409 if (obs.oi.has_manifest()) {
8410 dec_all_refcount_manifest(obs.oi, ctx);
8411 oi.manifest.clear();
8412 oi.manifest.type = object_manifest_t::TYPE_NONE;
8413 oi.clear_flag(object_info_t::FLAG_MANIFEST);
8414 ctx->delta_stats.num_objects_manifest--;
8415 ctx->cache_operation = true; // do not trigger the refcount-calculation path for manifest objects
8416 }
8417 }
8418 t->clone(soid, rollback_to_sobject);
8419 t->add_obc(rollback_to);
8420
8421 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8422 snapset.clone_overlap.lower_bound(snapid);
8423 ceph_assert(iter != snapset.clone_overlap.end());
8424 interval_set<uint64_t> overlaps = iter->second;
8425 for ( ;
8426 iter != snapset.clone_overlap.end();
8427 ++iter)
8428 overlaps.intersection_of(iter->second);
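// Walking forward from the rollback snap and intersecting narrows
// 'overlaps' to the byte ranges unchanged between the rollback source and
// the current head. Illustrative example: clone_overlap[5] = {0~4096} and
// clone_overlap[7] = {0~1024, 2048~2048} intersect to {0~1024, 2048~2048};
// only the complement of that range is treated as modified below.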
8429
8430 if (obs.oi.size > 0) {
8431 interval_set<uint64_t> modified;
8432 modified.insert(0, obs.oi.size);
8433 overlaps.intersection_of(modified);
8434 modified.subtract(overlaps);
8435 ctx->modified_ranges.union_of(modified);
8436 }
8437
8438 // Adjust the cached objectcontext
8439 maybe_create_new_object(ctx, true);
8440 ctx->delta_stats.num_bytes -= obs.oi.size;
8441 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8442 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8443 ctx->clean_regions.mark_omap_dirty();
8444 obs.oi.size = rollback_to->obs.oi.size;
8445 if (rollback_to->obs.oi.is_data_digest())
8446 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8447 else
8448 obs.oi.clear_data_digest();
8449 if (rollback_to->obs.oi.is_omap_digest())
8450 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8451 else
8452 obs.oi.clear_omap_digest();
8453
8454 if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
8455 obs.oi.set_flag(object_info_t::FLAG_MANIFEST);
8456 obs.oi.manifest.type = rollback_to->obs.oi.manifest.type;
8457 obs.oi.manifest.chunk_map = rollback_to->obs.oi.manifest.chunk_map;
8458 ctx->cache_operation = true;
8459 ctx->delta_stats.num_objects_manifest++;
8460 }
8461
8462 if (rollback_to->obs.oi.is_omap()) {
8463 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8464 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8465 } else {
8466 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8467 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8468 }
8469 }
8470
8471 void PrimaryLogPG::_make_clone(
8472 OpContext *ctx,
8473 PGTransaction* t,
8474 ObjectContextRef obc,
8475 const hobject_t& head, const hobject_t& coid,
8476 object_info_t *poi)
8477 {
8478 bufferlist bv;
8479 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8480
8481 t->clone(coid, head);
8482 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8483 rmattr_maybe_cache(obc, t, SS_ATTR);
8484 }
8485
8486 void PrimaryLogPG::make_writeable(OpContext *ctx)
8487 {
8488 const hobject_t& soid = ctx->obs->oi.soid;
8489 SnapContext& snapc = ctx->snapc;
8490
8491 // clone?
8492 ceph_assert(soid.snap == CEPH_NOSNAP);
8493 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8494 << " snapc=" << snapc << dendl;
8495
8496 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8497 if (ctx->new_obs.exists) {
8498 // we will mark the object dirty
8499 if (ctx->undirty && was_dirty) {
8500 dout(20) << " clearing DIRTY flag" << dendl;
8501 ceph_assert(ctx->new_obs.oi.is_dirty());
8502 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8503 --ctx->delta_stats.num_objects_dirty;
8504 osd->logger->inc(l_osd_tier_clean);
8505 } else if (!was_dirty && !ctx->undirty) {
8506 dout(20) << " setting DIRTY flag" << dendl;
8507 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8508 ++ctx->delta_stats.num_objects_dirty;
8509 osd->logger->inc(l_osd_tier_dirty);
8510 }
8511 } else {
8512 if (was_dirty) {
8513 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8514 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8515 --ctx->delta_stats.num_objects_dirty;
8516 }
8517 }
8518
8519 if ((ctx->new_obs.exists &&
8520 ctx->new_obs.oi.is_omap()) &&
8521 (!ctx->obc->obs.exists ||
8522 !ctx->obc->obs.oi.is_omap())) {
8523 ++ctx->delta_stats.num_objects_omap;
8524 }
8525 if ((!ctx->new_obs.exists ||
8526 !ctx->new_obs.oi.is_omap()) &&
8527 (ctx->obc->obs.exists &&
8528 ctx->obc->obs.oi.is_omap())) {
8529 --ctx->delta_stats.num_objects_omap;
8530 }
8531
8532 if (ctx->new_snapset.seq > snapc.seq) {
8533 dout(10) << " op snapset is old" << dendl;
8534 }
8535
8536 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8537 snapc.snaps.size() && // there are snaps
8538 !ctx->cache_operation &&
8539 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8540 // clone
8541 hobject_t coid = soid;
8542 coid.snap = snapc.seq;
8543
8544 unsigned l;
8545 for (l = 1;
8546 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8547 l++) ;
8548
8549 vector<snapid_t> snaps(l);
8550 for (unsigned i=0; i<l; i++)
8551 snaps[i] = snapc.snaps[i];
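// The loop above collects the snaps newer than the object's snapset seq,
// newest first. Illustrative example: snapc.snaps = [8, 7, 5, 2] with
// new_snapset.seq = 4 gives l = 3 and snaps = [8, 7, 5]; snap 2 predates
// the snapset and is excluded.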
8552
8553 // prepare clone
8554 object_info_t static_snap_oi(coid);
8555 object_info_t *snap_oi;
8556 if (is_primary()) {
8557 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8558 ctx->clone_obc->destructor_callback =
8559 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8560 ctx->clone_obc->obs.oi = static_snap_oi;
8561 ctx->clone_obc->obs.exists = true;
8562 ctx->clone_obc->ssc = ctx->obc->ssc;
8563 ctx->clone_obc->ssc->ref++;
8564 if (pool.info.is_erasure())
8565 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8566 snap_oi = &ctx->clone_obc->obs.oi;
8567 if (ctx->obc->obs.oi.has_manifest()) {
8568 if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
8569 ctx->obc->obs.oi.manifest.is_redirect()) {
8570 snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8571 snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
8572 snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
8573 } else if (ctx->obc->obs.oi.manifest.is_chunked()) {
8574 snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8575 snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
8576 snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
8577 } else {
8578 ceph_abort_msg("unrecognized manifest type");
8579 }
8580 }
8581 bool got = ctx->lock_manager.get_write_greedy(
8582 coid,
8583 ctx->clone_obc,
8584 ctx->op);
8585 ceph_assert(got);
8586 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8587 } else {
8588 snap_oi = &static_snap_oi;
8589 }
8590 snap_oi->version = ctx->at_version;
8591 snap_oi->prior_version = ctx->obs->oi.version;
8592 snap_oi->copy_user_bits(ctx->obs->oi);
8593
8594 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8595
8596 ctx->delta_stats.num_objects++;
8597 if (snap_oi->is_dirty()) {
8598 ctx->delta_stats.num_objects_dirty++;
8599 osd->logger->inc(l_osd_tier_dirty);
8600 }
8601 if (snap_oi->is_omap())
8602 ctx->delta_stats.num_objects_omap++;
8603 if (snap_oi->is_cache_pinned())
8604 ctx->delta_stats.num_objects_pinned++;
8605 if (snap_oi->has_manifest())
8606 ctx->delta_stats.num_objects_manifest++;
8607 ctx->delta_stats.num_object_clones++;
8608 ctx->new_snapset.clones.push_back(coid.snap);
8609 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8610 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8611
8612 // clone_overlap should contain an entry for each clone
8613 // (an empty interval_set if there is no overlap)
8614 ctx->new_snapset.clone_overlap[coid.snap];
8615 if (ctx->obs->oi.size)
8616 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8617
8618 // log clone
8619 dout(10) << " cloning v " << ctx->obs->oi.version
8620 << " to " << coid << " v " << ctx->at_version
8621 << " snaps=" << snaps
8622 << " snapset=" << ctx->new_snapset << dendl;
8623 ctx->log.push_back(pg_log_entry_t(
8624 pg_log_entry_t::CLONE, coid, ctx->at_version,
8625 ctx->obs->oi.version,
8626 ctx->obs->oi.user_version,
8627 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8628 encode(snaps, ctx->log.back().snaps);
8629
8630 ctx->at_version.version++;
8631 }
8632
8633 // update most recent clone_overlap and usage stats
8634 if (ctx->new_snapset.clones.size() > 0) {
8635 // clone_overlap is the difference in range between the head and clones.
8636 // we need to check whether the most recent clone exists; if it has
8637 // been evicted, it's not included in the stats, but the clone_overlap
8638 // still exists in the snapset, so we should update the
8639 // clone_overlap to keep it consistent.
8640 hobject_t last_clone_oid = soid;
8641 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8642 interval_set<uint64_t> &newest_overlap =
8643 ctx->new_snapset.clone_overlap.rbegin()->second;
8644 ctx->modified_ranges.intersection_of(newest_overlap);
8645 if (is_present_clone(last_clone_oid)) {
8646 // modified_ranges is still in use by the clone
8647 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8648 }
8649 newest_overlap.subtract(ctx->modified_ranges);
8650 }
8651
8652 if (snapc.seq > ctx->new_snapset.seq) {
8653 // update snapset with latest snap context
8654 ctx->new_snapset.seq = snapc.seq;
8655 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8656 ctx->new_snapset.snaps = snapc.snaps;
8657 } else {
8658 ctx->new_snapset.snaps.clear();
8659 }
8660 }
8661 dout(20) << "make_writeable " << soid
8662 << " done, snapset=" << ctx->new_snapset << dendl;
8663 }
8664
8665
8666 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8667 interval_set<uint64_t>& modified, uint64_t offset,
8668 uint64_t length, bool write_full)
8669 {
8670 interval_set<uint64_t> ch;
8671 if (write_full) {
8672 if (oi.size)
8673 ch.insert(0, oi.size);
8674 } else if (length)
8675 ch.insert(offset, length);
8676 modified.union_of(ch);
8677 if (write_full ||
8678 (offset + length > oi.size && length)) {
8679 uint64_t new_size = offset + length;
8680 delta_stats.num_bytes -= oi.size;
8681 delta_stats.num_bytes += new_size;
8682 oi.size = new_size;
8683 }
8684
8685 delta_stats.num_wr++;
8686 delta_stats.num_wr_kb += shift_round_up(length, 10);
8687 }
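// Worked example: an 8192-byte write at offset 4096 into an 8000-byte object
// extends it to 12288 bytes, so num_bytes changes by 12288 - 8000 = +4288;
// num_wr increments and num_wr_kb grows by shift_round_up(8192, 10) = 8.
// A write entirely within the existing size leaves num_bytes and oi.size
// unchanged.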
8688
8689 void PrimaryLogPG::truncate_update_size_and_usage(
8690 object_stat_sum_t& delta_stats,
8691 object_info_t& oi,
8692 uint64_t truncate_size)
8693 {
8694 if (oi.size != truncate_size) {
8695 delta_stats.num_bytes -= oi.size;
8696 delta_stats.num_bytes += truncate_size;
8697 oi.size = truncate_size;
8698 }
8699 }
8700
8701 void PrimaryLogPG::complete_disconnect_watches(
8702 ObjectContextRef obc,
8703 const list<watch_disconnect_t> &to_disconnect)
8704 {
8705 for (list<watch_disconnect_t>::const_iterator i =
8706 to_disconnect.begin();
8707 i != to_disconnect.end();
8708 ++i) {
8709 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8710 auto watchers_entry = obc->watchers.find(watcher);
8711 if (watchers_entry != obc->watchers.end()) {
8712 WatchRef watch = watchers_entry->second;
8713 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8714 obc->watchers.erase(watcher);
8715 watch->remove(i->send_disconnect);
8716 } else {
8717 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8718 << watcher << dendl;
8719 }
8720 }
8721 }
8722
8723 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8724 {
8725 entity_name_t entity = ctx->reqid.name;
8726 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8727
8728 // disconnects first
8729 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8730
8731 ceph_assert(conn);
8732
8733 auto session = conn->get_priv();
8734 if (!session)
8735 return;
8736
8737 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8738 i != ctx->watch_connects.end();
8739 ++i) {
8740 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8741 dout(15) << "do_osd_op_effects applying watch connect on session "
8742 << session.get() << " watcher " << watcher << dendl;
8743 WatchRef watch;
8744 if (ctx->obc->watchers.count(watcher)) {
8745 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8746 << dendl;
8747 watch = ctx->obc->watchers[watcher];
8748 } else {
8749 dout(15) << "do_osd_op_effects new watcher " << watcher
8750 << dendl;
8751 watch = Watch::makeWatchRef(
8752 this, osd, ctx->obc, i->first.timeout_seconds,
8753 i->first.cookie, entity, conn->get_peer_addr());
8754 ctx->obc->watchers.insert(
8755 make_pair(
8756 watcher,
8757 watch));
8758 }
8759 watch->connect(conn, i->second);
8760 }
8761
8762 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8763 p != ctx->notifies.end();
8764 ++p) {
8765 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8766 ConnectionRef conn(ctx->op->get_req()->get_connection());
8767 NotifyRef notif(
8768 Notify::makeNotifyRef(
8769 conn,
8770 ctx->reqid.name.num(),
8771 p->bl,
8772 p->timeout,
8773 p->cookie,
8774 p->notify_id,
8775 ctx->obc->obs.oi.user_version,
8776 osd));
8777 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8778 ctx->obc->watchers.begin();
8779 i != ctx->obc->watchers.end();
8780 ++i) {
8781 dout(10) << "starting notify on watch " << i->first << dendl;
8782 i->second->start_notify(notif);
8783 }
8784 notif->init();
8785 }
8786
8787 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8788 p != ctx->notify_acks.end();
8789 ++p) {
8790 if (p->watch_cookie)
8791 dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
8792 else
8793 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8794 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8795 ctx->obc->watchers.begin();
8796 i != ctx->obc->watchers.end();
8797 ++i) {
8798 if (i->first.second != entity) continue;
8799 if (p->watch_cookie &&
8800 *(p->watch_cookie) != i->first.first) continue;
8801 dout(10) << "acking notify on watch " << i->first << dendl;
8802 i->second->notify_ack(p->notify_id, p->reply_bl);
8803 }
8804 }
8805 }
8806
8807 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8808 {
8809 ostringstream ss;
8810 ss << "temp_" << info.pgid << "_" << get_role()
8811 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8812 hobject_t hoid = target.make_temp_hobject(ss.str());
8813 dout(20) << __func__ << " " << hoid << dendl;
8814 return hoid;
8815 }
8816
8817 hobject_t PrimaryLogPG::get_temp_recovery_object(
8818 const hobject_t& target,
8819 eversion_t version)
8820 {
8821 ostringstream ss;
8822 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8823 << "_" << version
8824 << "_" << info.history.same_interval_since
8825 << "_" << target.snap;
8826 // pgid + version + interval + snapid is unique, and short
8827 hobject_t hoid = target.make_temp_hobject(ss.str());
8828 dout(20) << __func__ << " " << hoid << dendl;
8829 return hoid;
8830 }
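// Illustrative name produced here (values hypothetical): with pgid 1.2,
// version 45'12, same_interval_since 300 and a head object this yields
// "temp_recovering_1.2_45'12_300_head", while generate_temp_object() above
// uses the shorter "temp_<pgid>_<role>_<global_id>_<seq>" scheme.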
8831
8832 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8833 {
8834 ceph_assert(!ctx->ops->empty());
8835
8836 // valid snap context?
8837 if (!ctx->snapc.is_valid()) {
8838 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8839 return -EINVAL;
8840 }
8841
8842 // prepare the actual mutation
8843 int result = do_osd_ops(ctx, *ctx->ops);
8844 if (result < 0) {
8845 if (ctx->op->may_write() &&
8846 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8847 // need to save the error code in the pg log, to detect dup ops,
8848 // but do nothing else
8849 ctx->update_log_only = true;
8850 }
8851 return result;
8852 }
8853
8854 // read-op? write-op noop? done?
8855 if (ctx->op_t->empty() && !ctx->modify) {
8856 if (ctx->pending_async_reads.empty())
8857 unstable_stats.add(ctx->delta_stats);
8858 if (ctx->op->may_write() &&
8859 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8860 ctx->update_log_only = true;
8861 }
8862 return result;
8863 }
8864
8865 // check for full
8866 if ((ctx->delta_stats.num_bytes > 0 ||
8867 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8868 pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8869 auto m = ctx->op->get_req<MOSDOp>();
8870 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8871 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8872 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8873 << dendl;
8874 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8875 // they tried, they failed.
8876 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8877 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8878 } else {
8879 // drop request
8880 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8881 return -EAGAIN;
8882 }
8883 }
8884
8885 const hobject_t& soid = ctx->obs->oi.soid;
8886 // clone, if necessary
8887 if (soid.snap == CEPH_NOSNAP)
8888 make_writeable(ctx);
8889
8890 finish_ctx(ctx,
8891 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8892 pg_log_entry_t::DELETE,
8893 result);
8894
8895 return result;
8896 }
8897
8898 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
8899 {
8900 const hobject_t& soid = ctx->obs->oi.soid;
8901 dout(20) << __func__ << " " << soid << " " << ctx
8902 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8903 << dendl;
8904 utime_t now = ceph_clock_now();
8905
8906 jspan span;
8907 if (ctx->op) {
8908 span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
8909 }
8910
8911 // Drop the reference if a deduped chunk is modified
8912 if (ctx->new_obs.oi.is_dirty() &&
8913 (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
8914 !ctx->cache_operation &&
8915 log_op_type != pg_log_entry_t::PROMOTE) {
8916 update_chunk_map_by_dirty(ctx);
8917 // If a clone is being created, do not drop the reference for the manifest object
8918 if (!ctx->delta_stats.num_object_clones) {
8919 dec_refcount_by_dirty(ctx);
8920 }
8921 }
8922
8923 // finish and log the op.
8924 if (ctx->user_modify) {
8925 // update the user_version for any modify ops, except for the watch op
8926 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8927 /* In order for new clients and old clients to interoperate properly
8928 * when exchanging versions, we need to lower bound the user_version
8929 * (which our new clients pay proper attention to)
8930 * by the at_version (which is all the old clients can ever see). */
8931 if (ctx->at_version.version > ctx->user_at_version)
8932 ctx->user_at_version = ctx->at_version.version;
8933 ctx->new_obs.oi.user_version = ctx->user_at_version;
8934 }
8935 ctx->bytes_written = ctx->op_t->get_bytes_written();
8936
8937 if (ctx->new_obs.exists) {
8938 ctx->new_obs.oi.version = ctx->at_version;
8939 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8940 ctx->new_obs.oi.last_reqid = ctx->reqid;
8941 if (ctx->mtime != utime_t()) {
8942 ctx->new_obs.oi.mtime = ctx->mtime;
8943 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8944 ctx->new_obs.oi.local_mtime = now;
8945 } else {
8946 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8947 }
8948
8949 // object_info_t
8950 map <string, bufferlist, less<>> attrs;
8951 bufferlist bv(sizeof(ctx->new_obs.oi));
8952 encode(ctx->new_obs.oi, bv,
8953 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8954 attrs[OI_ATTR] = std::move(bv);
8955
8956 // snapset
8957 if (soid.snap == CEPH_NOSNAP) {
8958 dout(10) << " final snapset " << ctx->new_snapset
8959 << " in " << soid << dendl;
8960 bufferlist bss;
8961 encode(ctx->new_snapset, bss);
8962 attrs[SS_ATTR] = std::move(bss);
8963 } else {
8964 dout(10) << " no snapset (this is a clone)" << dendl;
8965 }
8966 ctx->op_t->setattrs(soid, attrs);
8967 } else {
8968 // reset cached oi
8969 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8970 }
8971
8972 // append to log
8973 ctx->log.push_back(
8974 pg_log_entry_t(log_op_type, soid, ctx->at_version,
8975 ctx->obs->oi.version,
8976 ctx->user_at_version, ctx->reqid,
8977 ctx->mtime,
8978 (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
8979 if (ctx->op && ctx->op->allows_returnvec()) {
8980 // also the per-op values
8981 ctx->log.back().set_op_returns(*ctx->ops);
8982 dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
8983 << dendl;
8984 }
8985
8986 ctx->log.back().clean_regions = ctx->clean_regions;
8987 dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
8988
8989 if (soid.snap < CEPH_NOSNAP) {
8990 switch (log_op_type) {
8991 case pg_log_entry_t::MODIFY:
8992 case pg_log_entry_t::PROMOTE:
8993 case pg_log_entry_t::CLEAN:
8994 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8995 << dendl;
8996 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8997 break;
8998 default:
8999 break;
9000 }
9001 }
9002
9003 if (!ctx->extra_reqids.empty()) {
9004 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
9005 << ctx->extra_reqid_return_codes << dendl;
9006 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
9007 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
9008 }
9009
9010 // apply new object state.
9011 ctx->obc->obs = ctx->new_obs;
9012
9013 if (soid.is_head() && !ctx->obc->obs.exists) {
9014 ctx->obc->ssc->exists = false;
9015 ctx->obc->ssc->snapset = SnapSet();
9016 } else {
9017 ctx->obc->ssc->exists = true;
9018 ctx->obc->ssc->snapset = ctx->new_snapset;
9019 }
9020 }
9021
9022 void PrimaryLogPG::apply_stats(
9023 const hobject_t &soid,
9024 const object_stat_sum_t &delta_stats) {
9025
9026 recovery_state.apply_op_stats(soid, delta_stats);
9027 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
9028 i != get_backfill_targets().end();
9029 ++i) {
9030 pg_shard_t bt = *i;
9031 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
9032 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
9033 pending_backfill_updates[soid].stats.add(delta_stats);
9034 }
9035 }
9036
9037 m_scrubber->stats_of_handled_objects(delta_stats, soid);
9038 }
9039
9040 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
9041 {
9042 auto m = ctx->op->get_req<MOSDOp>();
9043 ceph_assert(ctx->async_reads_complete());
9044
9045 for (auto p = ctx->ops->begin();
9046 p != ctx->ops->end() && result >= 0; ++p) {
9047 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
9048 result = p->rval;
9049 break;
9050 }
9051 ctx->bytes_read += p->outdata.length();
9052 }
9053 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
9054
9055 MOSDOpReply *reply = ctx->reply;
9056 ctx->reply = nullptr;
9057
9058 if (result >= 0) {
9059 if (!ctx->ignore_log_op_stats) {
9060 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
9061
9062 publish_stats_to_osd();
9063 }
9064
9065 // on read, return the current object version
9066 if (ctx->obs) {
9067 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
9068 } else {
9069 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
9070 }
9071 } else if (result == -ENOENT) {
9072 // on ENOENT, set a floor for what the next user version will be.
9073 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
9074 }
9075
9076 reply->set_result(result);
9077 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9078 osd->send_message_osd_client(reply, m->get_connection());
9079 close_op_ctx(ctx);
9080 }
9081
9082 // ========================================================================
9083 // copyfrom
9084
9085 struct C_Copyfrom : public Context {
9086 PrimaryLogPGRef pg;
9087 hobject_t oid;
9088 epoch_t last_peering_reset;
9089 ceph_tid_t tid;
9090 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
9091 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
9092 const PrimaryLogPG::CopyOpRef& c)
9093 : pg(p), oid(o), last_peering_reset(lpr),
9094 tid(0), cop(c)
9095 {}
9096 void finish(int r) override {
9097 if (r == -ECANCELED)
9098 return;
9099 std::scoped_lock l{*pg};
9100 if (last_peering_reset == pg->get_last_peering_reset()) {
9101 pg->process_copy_chunk(oid, tid, r);
9102 cop.reset();
9103 }
9104 }
9105 };
9106
9107 struct C_CopyFrom_AsyncReadCb : public Context {
9108 OSDOp *osd_op;
9109 object_copy_data_t reply_obj;
9110 uint64_t features;
9111 size_t len;
9112 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
9113 osd_op(osd_op), features(features), len(0) {}
9114 void finish(int r) override {
9115 osd_op->rval = r;
9116 if (r < 0) {
9117 return;
9118 }
9119
9120 ceph_assert(len > 0);
9121 ceph_assert(len <= reply_obj.data.length());
9122 bufferlist bl;
9123 bl.substr_of(reply_obj.data, 0, len);
9124 reply_obj.data.swap(bl);
9125 encode(reply_obj, osd_op->outdata, features);
9126 }
9127 };
9128
9129 struct C_CopyChunk : public Context {
9130 PrimaryLogPGRef pg;
9131 hobject_t oid;
9132 epoch_t last_peering_reset;
9133 ceph_tid_t tid;
9134 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
9135 uint64_t offset = 0;
9136 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
9137 const PrimaryLogPG::CopyOpRef& c)
9138 : pg(p), oid(o), last_peering_reset(lpr),
9139 tid(0), cop(c)
9140 {}
9141 void finish(int r) override {
9142 if (r == -ECANCELED)
9143 return;
9144 std::scoped_lock l{*pg};
9145 if (last_peering_reset == pg->get_last_peering_reset()) {
9146 pg->process_copy_chunk_manifest(oid, tid, r, offset);
9147 cop.reset();
9148 }
9149 }
9150 };
9151
9152 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
9153 OSDOp& osd_op, ObjectContextRef &obc)
9154 {
9155 object_info_t& oi = obc->obs.oi;
9156 hobject_t& soid = oi.soid;
9157 int result = 0;
9158 object_copy_cursor_t cursor;
9159 uint64_t out_max;
9160 try {
9161 decode(cursor, bp);
9162 decode(out_max, bp);
9163 }
9164 catch (ceph::buffer::error& e) {
9165 result = -EINVAL;
9166 return result;
9167 }
9168
9169 const MOSDOp *op = ctx->op->get_req<MOSDOp>();
9170 uint64_t features = op->get_features();
9171
9172 bool async_read_started = false;
9173 object_copy_data_t _reply_obj;
9174 C_CopyFrom_AsyncReadCb *cb = nullptr;
9175 if (pool.info.is_erasure()) {
9176 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
9177 }
9178 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
9179 // size, mtime
9180 reply_obj.size = oi.size;
9181 reply_obj.mtime = oi.mtime;
9182 ceph_assert(obc->ssc);
9183 if (soid.snap < CEPH_NOSNAP) {
9184 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
9185 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
9186 reply_obj.snaps = p->second;
9187 } else {
9188 reply_obj.snap_seq = obc->ssc->snapset.seq;
9189 }
9190 if (oi.is_data_digest()) {
9191 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
9192 reply_obj.data_digest = oi.data_digest;
9193 }
9194 if (oi.is_omap_digest()) {
9195 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
9196 reply_obj.omap_digest = oi.omap_digest;
9197 }
9198 reply_obj.truncate_seq = oi.truncate_seq;
9199 reply_obj.truncate_size = oi.truncate_size;
9200
9201 // attrs
9202 map<string,bufferlist,less<>>& out_attrs = reply_obj.attrs;
9203 if (!cursor.attr_complete) {
9204 result = getattrs_maybe_cache(
9205 ctx->obc,
9206 &out_attrs);
9207 if (result < 0) {
9208 if (cb) {
9209 delete cb;
9210 }
9211 return result;
9212 }
9213 cursor.attr_complete = true;
9214 dout(20) << " got attrs" << dendl;
9215 }
9216
9217 int64_t left = out_max - osd_op.outdata.length();
9218
9219 // data
9220 bufferlist& bl = reply_obj.data;
9221 if (left > 0 && !cursor.data_complete) {
9222 if (cursor.data_offset < oi.size) {
9223 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
9224 if (cb) {
9225 async_read_started = true;
9226 ctx->pending_async_reads.push_back(
9227 make_pair(
9228 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
9229 make_pair(&bl, cb)));
9230 cb->len = max_read;
9231
9232 ctx->op_finishers[ctx->current_osd_subop_num].reset(
9233 new ReadFinisher(osd_op));
9234 result = -EINPROGRESS;
9235
9236 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
9237 } else {
9238 result = pgbackend->objects_read_sync(
9239 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
9240 if (result < 0)
9241 return result;
9242 }
9243 left -= max_read;
9244 cursor.data_offset += max_read;
9245 }
9246 if (cursor.data_offset == oi.size) {
9247 cursor.data_complete = true;
9248 dout(20) << " got data" << dendl;
9249 }
9250 ceph_assert(cursor.data_offset <= oi.size);
9251 }
9252
9253 // omap
9254 uint32_t omap_keys = 0;
9255 if (!pool.info.supports_omap() || !oi.is_omap()) {
9256 cursor.omap_complete = true;
9257 } else {
9258 if (left > 0 && !cursor.omap_complete) {
9259 ceph_assert(cursor.data_complete);
9260 if (cursor.omap_offset.empty()) {
9261 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
9262 &reply_obj.omap_header);
9263 }
9264 bufferlist omap_data;
9265 ObjectMap::ObjectMapIterator iter =
9266 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
9267 ceph_assert(iter);
9268 iter->upper_bound(cursor.omap_offset);
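// note: each encoded key/value carries a 4-byte length prefix, so the
// budget accounting below charges the string lengths plus 2*4 bytes of
// framing per pair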
9269 for (; iter->valid(); iter->next()) {
9270 ++omap_keys;
9271 encode(iter->key(), omap_data);
9272 encode(iter->value(), omap_data);
9273 left -= iter->key().length() + 4 + iter->value().length() + 4;
9274 if (left <= 0)
9275 break;
9276 }
9277 if (omap_keys) {
9278 encode(omap_keys, reply_obj.omap_data);
9279 reply_obj.omap_data.claim_append(omap_data);
9280 }
9281 if (iter->valid()) {
9282 cursor.omap_offset = iter->key();
9283 } else {
9284 cursor.omap_complete = true;
9285 dout(20) << " got omap" << dendl;
9286 }
9287 }
9288 }
9289
9290 if (cursor.is_complete()) {
9291 // include reqids only in the final step. this is a bit fragile
9292 // but it works...
9293 recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
9294 &reply_obj.reqids,
9295 &reply_obj.reqid_return_codes);
9296 dout(20) << " got reqids" << dendl;
9297 }
9298
9299 dout(20) << " cursor.is_complete=" << cursor.is_complete()
9300 << " " << out_attrs.size() << " attrs"
9301 << " " << bl.length() << " bytes"
9302 << " " << reply_obj.omap_header.length() << " omap header bytes"
9303 << " " << reply_obj.omap_data.length() << " omap data bytes in "
9304 << omap_keys << " keys"
9305 << " " << reply_obj.reqids.size() << " reqids"
9306 << dendl;
9307 reply_obj.cursor = cursor;
9308 if (!async_read_started) {
9309 encode(reply_obj, osd_op.outdata, features);
9310 }
9311 if (cb && !async_read_started) {
9312 delete cb;
9313 }
9314
9315 if (result > 0) {
9316 result = 0;
9317 }
9318 return result;
9319 }
9320
9321 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
9322 OSDOp& osd_op)
9323 {
9324 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9325 uint64_t features = m->get_features();
9326 object_copy_data_t reply_obj;
9327
9328 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
9329 &reply_obj.reqid_return_codes);
9330 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
9331 encode(reply_obj, osd_op.outdata, features);
9332 osd_op.rval = -ENOENT;
9333 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
9334 reply->set_result(-ENOENT);
9335 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9336 osd->send_message_osd_client(reply, m->get_connection());
9337 }
9338
9339 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
9340 hobject_t src, object_locator_t oloc,
9341 version_t version, unsigned flags,
9342 bool mirror_snapset,
9343 unsigned src_obj_fadvise_flags,
9344 unsigned dest_obj_fadvise_flags)
9345 {
9346 const hobject_t& dest = obc->obs.oi.soid;
9347 dout(10) << __func__ << " " << dest
9348 << " from " << src << " " << oloc << " v" << version
9349 << " flags " << flags
9350 << (mirror_snapset ? " mirror_snapset" : "")
9351 << dendl;
9352
9353 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
9354
9355 // cancel a previous in-progress copy?
9356 if (copy_ops.count(dest)) {
9357 // FIXME: if the src etc match, we could avoid restarting from the
9358 // beginning.
9359 CopyOpRef cop = copy_ops[dest];
9360 vector<ceph_tid_t> tids;
9361 cancel_copy(cop, false, &tids);
9362 osd->objecter->op_cancel(tids, -ECANCELED);
9363 }
9364
9365 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
9366 mirror_snapset, src_obj_fadvise_flags,
9367 dest_obj_fadvise_flags));
9368 copy_ops[dest] = cop;
9369 obc->start_block();
9370
9371 if (!obc->obs.oi.has_manifest()) {
9372 _copy_some(obc, cop);
9373 } else {
9374 if (obc->obs.oi.manifest.is_redirect()) {
9375 _copy_some(obc, cop);
9376 } else if (obc->obs.oi.manifest.is_chunked()) {
9377 auto p = obc->obs.oi.manifest.chunk_map.begin();
9378 _copy_some_manifest(obc, cop, p->first);
9379 } else {
9380 ceph_abort_msg("unrecognized manifest type");
9381 }
9382 }
9383 }
9384
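// _copy_some drives one round of the chunked copy protocol: it issues a
// single copy-get of up to get_copy_chunk_size() bytes (plus attrs/omap),
// resuming from cop->cursor. The completion (C_Copyfrom) lands in
// process_copy_chunk(), which either persists the partial chunk to a temp
// object and calls back in here for the next round, or finalizes the copy.
// Schematically (a summary, not literal code):
//
//   while (!cop->cursor.is_complete())
//     copy_get(cursor) -> process_copy_chunk() -> _copy_some()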
9385 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9386 {
9387 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9388
9389 unsigned flags = 0;
9390 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9391 flags |= CEPH_OSD_FLAG_FLUSH;
9392 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9393 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9394 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9395 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9396 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9397 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9398 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9399 flags |= CEPH_OSD_FLAG_RWORDERED;
9400
9401 C_GatherBuilder gather(cct);
9402
9403 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9404 // list snaps too.
9405 ceph_assert(cop->src.snap == CEPH_NOSNAP);
9406 ObjectOperation op;
9407 op.list_snaps(&cop->results.snapset, NULL);
9408 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9409 CEPH_SNAPDIR, NULL,
9410 flags, gather.new_sub(), NULL);
9411 cop->objecter_tid2 = tid;
9412 }
9413
9414 ObjectOperation op;
9415 if (cop->results.user_version) {
9416 op.assert_version(cop->results.user_version);
9417 } else {
9418 // we should learn the version after the first chunk, if we didn't know
9419 // it already!
9420 ceph_assert(cop->cursor.is_initial());
9421 }
9422 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9423 &cop->results.object_size, &cop->results.mtime,
9424 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9425 &cop->results.snaps, &cop->results.snap_seq,
9426 &cop->results.flags,
9427 &cop->results.source_data_digest,
9428 &cop->results.source_omap_digest,
9429 &cop->results.reqids,
9430 &cop->results.reqid_return_codes,
9431 &cop->results.truncate_seq,
9432 &cop->results.truncate_size,
9433 &cop->rval);
9434 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9435
9436 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9437 get_last_peering_reset(), cop);
9438 gather.set_finisher(new C_OnFinisher(fin,
9439 osd->get_objecter_finisher(get_pg_shard())));
9440
9441 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9442 cop->src.snap, NULL,
9443 flags,
9444 gather.new_sub(),
9445 // discover the object version if we don't know it yet
9446 cop->results.user_version ? NULL : &cop->results.user_version);
9447 fin->tid = tid;
9448 cop->objecter_tid = tid;
9449 gather.activate();
9450 }
9451
9452 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9453 {
9454 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9455
9456 unsigned flags = 0;
9457 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9458 flags |= CEPH_OSD_FLAG_FLUSH;
9459 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9460 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9461 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9462 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9463 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9464 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9465 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9466 flags |= CEPH_OSD_FLAG_RWORDERED;
9467
9468 int num_chunks = 0;
9469 uint64_t last_offset = 0, chunks_size = 0;
9470 object_manifest_t *manifest = &obc->obs.oi.manifest;
9471 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9472 for (;iter != manifest->chunk_map.end(); ++iter) {
9473 num_chunks++;
9474 chunks_size += iter->second.length;
9475 last_offset = iter->first;
9476 if (get_copy_chunk_size() < chunks_size) {
9477 break;
9478 }
9479 }
9480
9481 cop->num_chunk = num_chunks;
9482 cop->start_offset = start_offset;
9483 cop->last_offset = last_offset;
9484 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9485 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9486 << " last_offset: " << last_offset << dendl;
9487
9488 iter = manifest->chunk_map.find(start_offset);
9489 for (;iter != manifest->chunk_map.end(); ++iter) {
9490 uint64_t obj_offset = iter->first;
9491 uint64_t length = manifest->chunk_map[iter->first].length;
9492 hobject_t soid = manifest->chunk_map[iter->first].oid;
9493 object_locator_t oloc(soid);
9494 CopyCallback * cb = NULL;
9495 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9496 cop->results.user_version, cop->flags, cop->mirror_snapset,
9497 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9498 sub_cop->cursor.data_offset = obj_offset;
9499 cop->chunk_cops[obj_offset] = sub_cop;
9500
9501 int s = sub_cop->chunk_ops.size();
9502 sub_cop->chunk_ops.resize(s+1);
9503 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9504 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9505 sub_cop->chunk_ops[s].op.extent.length = length;
9506
9507 ObjectOperation op;
9508 op.dup(sub_cop->chunk_ops);
9509
9510 if (cop->results.user_version) {
9511 op.assert_version(cop->results.user_version);
9512 } else {
9513 // we should learn the version after the first chunk, if we didn't know
9514 // it already!
9515 ceph_assert(cop->cursor.is_initial());
9516 }
9517 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9518
9519 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9520 get_last_peering_reset(), cop);
9521 fin->offset = obj_offset;
9522
9523 ceph_tid_t tid = osd->objecter->read(
9524 soid.oid, oloc, op,
9525 sub_cop->src.snap, NULL,
9526 flags,
9527 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9528 // discover the object version if we don't know it yet
9529 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9530 fin->tid = tid;
9531 sub_cop->objecter_tid = tid;
9532
9533 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9534 << manifest->chunk_map[iter->first].offset
9535 << " length: " << length << " pool id: " << oloc.pool
9536 << " tid: " << tid << dendl;
9537
9538 if (last_offset <= iter->first) {
9539 break;
9540 }
9541 }
9542 }
9543
9544 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9545 {
9546 dout(10) << __func__ << " " << oid << " tid " << tid
9547 << " " << cpp_strerror(r) << dendl;
9548 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9549 if (p == copy_ops.end()) {
9550 dout(10) << __func__ << " no copy_op found" << dendl;
9551 return;
9552 }
9553 CopyOpRef cop = p->second;
9554 if (tid != cop->objecter_tid) {
9555 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9556 << " tid " << cop->objecter_tid << dendl;
9557 return;
9558 }
9559
9560 if (cop->omap_data.length() || cop->omap_header.length())
9561 cop->results.has_omap = true;
9562
9563 if (r >= 0 && !pool.info.supports_omap() &&
9564 (cop->omap_data.length() || cop->omap_header.length())) {
9565 r = -EOPNOTSUPP;
9566 }
9567 cop->objecter_tid = 0;
9568 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9569 ObjectContextRef& cobc = cop->obc;
9570
9571 if (r < 0)
9572 goto out;
9573
9574 ceph_assert(cop->rval >= 0);
9575
9576 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9577 // verify snap hasn't been deleted
9578 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9579 while (p != cop->results.snaps.end()) {
9580 // make best effort to sanitize snaps/clones.
9581 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
9582 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9583 << dendl;
9584 for (vector<snapid_t>::iterator q = p + 1;
9585 q != cop->results.snaps.end();
9586 ++q)
9587 *(q - 1) = *q;
9588 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9589 } else {
9590 ++p;
9591 }
9592 }
9593 if (cop->results.snaps.empty()) {
9594 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9595 r = -ENOENT;
9596 goto out;
9597 }
9598 }
9599
9600 ceph_assert(cop->rval >= 0);
9601
9602 if (!cop->temp_cursor.data_complete) {
9603 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9604 }
9605 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9606 if (cop->omap_header.length()) {
9607 cop->results.omap_digest =
9608 cop->omap_header.crc32c(cop->results.omap_digest);
9609 }
9610 if (cop->omap_data.length()) {
9611 bufferlist keys;
9612 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9613 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9614 }
9615 }
9616
9617 if (!cop->temp_cursor.attr_complete) {
9618 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9619 p != cop->attrs.end();
9620 ++p) {
9621 cop->results.attrs[string("_") + p->first] = p->second;
9622 }
9623 cop->attrs.clear();
9624 }
9625
9626 if (!cop->cursor.is_complete()) {
9627 // write out what we have so far
9628 if (cop->temp_cursor.is_initial()) {
9629 ceph_assert(!cop->results.started_temp_obj);
9630 cop->results.started_temp_obj = true;
9631 cop->results.temp_oid = generate_temp_object(oid);
9632 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9633 }
9634 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9635 OpContextUPtr ctx = simple_opc_create(tempobc);
9636 if (cop->temp_cursor.is_initial()) {
9637 ctx->new_temp_oid = cop->results.temp_oid;
9638 }
9639 _write_copy_chunk(cop, ctx->op_t.get());
9640 simple_opc_submit(std::move(ctx));
9641 dout(10) << __func__ << " fetching more" << dendl;
9642 _copy_some(cobc, cop);
9643 return;
9644 }
9645
9646 // verify digests?
9647 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9648 dout(20) << __func__ << std::hex
9649 << " got digest: rx data 0x" << cop->results.data_digest
9650 << " omap 0x" << cop->results.omap_digest
9651 << ", source: data 0x" << cop->results.source_data_digest
9652 << " omap 0x" << cop->results.source_omap_digest
9653 << std::dec
9654 << " flags " << cop->results.flags
9655 << dendl;
9656 }
9657 if (cop->results.is_data_digest() &&
9658 cop->results.data_digest != cop->results.source_data_digest) {
9659 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9660 << " != source 0x" << cop->results.source_data_digest << std::dec
9661 << dendl;
9662 osd->clog->error() << info.pgid << " copy from " << cop->src
9663 << " to " << cop->obc->obs.oi.soid << std::hex
9664 << " data digest 0x" << cop->results.data_digest
9665 << " != source 0x" << cop->results.source_data_digest
9666 << std::dec;
9667 r = -EIO;
9668 goto out;
9669 }
9670 if (cop->results.is_omap_digest() &&
9671 cop->results.omap_digest != cop->results.source_omap_digest) {
9672 derr << __func__ << std::hex
9673 << " omap digest 0x" << cop->results.omap_digest
9674 << " != source 0x" << cop->results.source_omap_digest
9675 << std::dec << dendl;
9676 osd->clog->error() << info.pgid << " copy from " << cop->src
9677 << " to " << cop->obc->obs.oi.soid << std::hex
9678 << " omap digest 0x" << cop->results.omap_digest
9679 << " != source 0x" << cop->results.source_omap_digest
9680 << std::dec;
9681 r = -EIO;
9682 goto out;
9683 }
9684 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9685 derr << __func__ << " injecting copyfrom failure" << dendl;
9686 r = -EIO;
9687 goto out;
9688 }
9689
9690 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9691 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9692 ObjectState& obs = cop->obc->obs;
9693 if (cop->temp_cursor.is_initial()) {
9694 dout(20) << "fill_in_final_tx: writing "
9695 << "directly to final object" << dendl;
9696 // write directly to final object
9697 cop->results.temp_oid = obs.oi.soid;
9698 _write_copy_chunk(cop, t);
9699 } else {
9700 // finish writing to temp object, then move into place
9701 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9702 if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
9703 /* In redirect manifest case, the object exists in the upper tier.
9704 * So, to avoid a conflict when rename() is called, remove existing
9705 * object first
9706 */
9707 t->remove(obs.oi.soid);
9708 }
9709 _write_copy_chunk(cop, t);
9710 t->rename(obs.oi.soid, cop->results.temp_oid);
9711 }
9712 t->setattrs(obs.oi.soid, cop->results.attrs);
9713 });
9714
9715 dout(20) << __func__ << " success; committing" << dendl;
9716
9717 out:
9718 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9719 CopyCallbackResults results(r, &cop->results);
9720 cop->cb->complete(results);
9721
9722 copy_ops.erase(cobc->obs.oi.soid);
9723 cobc->stop_block();
9724
9725 if (r < 0 && cop->results.started_temp_obj) {
9726 dout(10) << __func__ << " deleting partial temp object "
9727 << cop->results.temp_oid << dendl;
9728 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9729 OpContextUPtr ctx = simple_opc_create(tempobc);
9730 ctx->op_t->remove(cop->results.temp_oid);
9731 ctx->discard_temp_oid = cop->results.temp_oid;
9732 simple_opc_submit(std::move(ctx));
9733 }
9734
9735 // cancel and requeue proxy ops on this object
9736 if (!r) {
9737 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9738 }
9739
9740 kick_object_context_blocked(cobc);
9741 }
9742
9743 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9744 {
9745 dout(10) << __func__ << " " << oid << " tid " << tid
9746 << " " << cpp_strerror(r) << dendl;
9747 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9748 if (p == copy_ops.end()) {
9749 dout(10) << __func__ << " no copy_op found" << dendl;
9750 return;
9751 }
9752 CopyOpRef obj_cop = p->second;
9753 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9754
9755 if (tid != chunk_cop->objecter_tid) {
9756 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9757 << " tid " << chunk_cop->objecter_tid << dendl;
9758 return;
9759 }
9760
9761 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9762 r = -EOPNOTSUPP;
9763 }
9764
9765 chunk_cop->objecter_tid = 0;
9766 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9767 ObjectContextRef& cobc = obj_cop->obc;
9768 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9769
9770 if (r < 0) {
9771 obj_cop->failed = true;
9772 goto out;
9773 }
9774
9775 if (obj_cop->failed) {
9776 return;
9777 }
9778 if (!chunk_data.outdata.length()) {
9779 r = -EIO;
9780 obj_cop->failed = true;
9781 goto out;
9782 }
9783
9784 obj_cop->num_chunk--;
9785
9786 /* check whether all of the copy ops have completed */
9787 if (obj_cop->num_chunk) {
9788 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9789 return;
9790 }
9791
9792 {
9793 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9794 if (!ctx->lock_manager.take_write_lock(
9795 obj_cop->obc->obs.oi.soid,
9796 obj_cop->obc)) {
9797 // a recovery op can hold the read lock,
9798 // so we need to wait for recovery to complete
9799 r = -EAGAIN;
9800 obj_cop->failed = true;
9801 close_op_ctx(ctx.release());
9802 goto out;
9803 }
9804 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9805
9806 PGTransaction *t = ctx->op_t.get();
9807 ObjectState& obs = ctx->new_obs;
9808 for (auto p : obj_cop->chunk_cops) {
9809 OSDOp &sub_chunk = p.second->chunk_ops[0];
9810 t->write(cobc->obs.oi.soid,
9811 p.second->cursor.data_offset,
9812 sub_chunk.outdata.length(),
9813 sub_chunk.outdata,
9814 p.second->dest_obj_fadvise_flags);
9815 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9816 << " length: " << sub_chunk.outdata.length() << dendl;
9817 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9818 p.second->cursor.data_offset, sub_chunk.outdata.length());
9819 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9820 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
9821 sub_chunk.outdata.clear();
9822 }
9823 obs.oi.clear_data_digest();
9824 ctx->at_version = get_next_version();
9825 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9826 simple_opc_submit(std::move(ctx));
9827 obj_cop->chunk_cops.clear();
9828
9829 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9830 /* check remaining work */
9831 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9832 if (obj_cop->last_offset < p->first) {
9833 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9834 if (obj_cop->last_offset < en.first) {
9835 _copy_some_manifest(cobc, obj_cop, en.first);
9836 return;
9837 }
9838 }
9839 }
9840 }
9841 }
9842
9843 out:
9844 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9845 CopyCallbackResults results(r, &obj_cop->results);
9846 obj_cop->cb->complete(results);
9847
9848 copy_ops.erase(cobc->obs.oi.soid);
9849 cobc->stop_block();
9850
9851 // cancel and requeue proxy ops on this object
9852 if (!r) {
9853 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9854 }
9855
9856 kick_object_context_blocked(cobc);
9857 }
9858
9859 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9860 vector<ceph_tid_t> tids;
9861 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9862 it != proxyread_ops.end();) {
9863 if (it->second->soid == oid) {
9864 cancel_proxy_read((it++)->second, &tids);
9865 } else {
9866 ++it;
9867 }
9868 }
9869 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9870 it != proxywrite_ops.end();) {
9871 if (it->second->soid == oid) {
9872 cancel_proxy_write((it++)->second, &tids);
9873 } else {
9874 ++it;
9875 }
9876 }
9877 osd->objecter->op_cancel(tids, -ECANCELED);
9878 kick_proxy_ops_blocked(oid);
9879 }
9880
9881 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9882 {
9883 dout(20) << __func__ << " " << cop
9884 << " " << cop->attrs.size() << " attrs"
9885 << " " << cop->data.length() << " bytes"
9886 << " " << cop->omap_header.length() << " omap header bytes"
9887 << " " << cop->omap_data.length() << " omap data bytes"
9888 << dendl;
9889 if (!cop->temp_cursor.attr_complete) {
9890 t->create(cop->results.temp_oid);
9891 }
9892 if (!cop->temp_cursor.data_complete) {
9893 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9894 cop->cursor.data_offset);
9895 if (pool.info.required_alignment() &&
9896 !cop->cursor.data_complete) {
9897 /**
9898 * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
9899 * to pick it up on the next pass.
9900 */
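// e.g. (illustrative) with required_alignment 4096 and 10000 buffered
// bytes: to_trim = 10000 % 4096 = 1808, so we write the aligned 8192
// bytes now and rewind cursor.data_offset by 1808 for the next pass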
9901 ceph_assert(cop->temp_cursor.data_offset %
9902 pool.info.required_alignment() == 0);
9903 if (cop->data.length() % pool.info.required_alignment() != 0) {
9904 uint64_t to_trim =
9905 cop->data.length() % pool.info.required_alignment();
9906 bufferlist bl;
9907 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9908 cop->data.swap(bl);
9909 cop->cursor.data_offset -= to_trim;
9910 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9911 cop->cursor.data_offset);
9912 }
9913 }
9914 if (cop->data.length()) {
9915 t->write(
9916 cop->results.temp_oid,
9917 cop->temp_cursor.data_offset,
9918 cop->data.length(),
9919 cop->data,
9920 cop->dest_obj_fadvise_flags);
9921 }
9922 cop->data.clear();
9923 }
9924 if (pool.info.supports_omap()) {
9925 if (!cop->temp_cursor.omap_complete) {
9926 if (cop->omap_header.length()) {
9927 t->omap_setheader(
9928 cop->results.temp_oid,
9929 cop->omap_header);
9930 cop->omap_header.clear();
9931 }
9932 if (cop->omap_data.length()) {
9933 map<string,bufferlist> omap;
9934 bufferlist::const_iterator p = cop->omap_data.begin();
9935 decode(omap, p);
9936 t->omap_setkeys(cop->results.temp_oid, omap);
9937 cop->omap_data.clear();
9938 }
9939 }
9940 } else {
9941 ceph_assert(cop->omap_header.length() == 0);
9942 ceph_assert(cop->omap_data.length() == 0);
9943 }
9944 cop->temp_cursor = cop->cursor;
9945 }
9946
9947 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9948 {
9949 OpContext *ctx = cb->ctx;
9950 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9951
9952 ObjectState& obs = ctx->new_obs;
9953 if (obs.exists) {
9954 dout(20) << __func__ << ": exists, removing" << dendl;
9955 ctx->op_t->remove(obs.oi.soid);
9956 } else {
9957 ctx->delta_stats.num_objects++;
9958 obs.exists = true;
9959 }
9960 if (cb->is_temp_obj_used()) {
9961 ctx->discard_temp_oid = cb->results->temp_oid;
9962 }
9963 cb->results->fill_in_final_tx(ctx->op_t.get());
9964
9965 // CopyFromCallback fills this in for us
9966 obs.oi.user_version = ctx->user_at_version;
9967
9968 if (cb->results->is_data_digest()) {
9969 obs.oi.set_data_digest(cb->results->data_digest);
9970 } else {
9971 obs.oi.clear_data_digest();
9972 }
9973 if (cb->results->is_omap_digest()) {
9974 obs.oi.set_omap_digest(cb->results->omap_digest);
9975 } else {
9976 obs.oi.clear_omap_digest();
9977 }
9978
9979 obs.oi.truncate_seq = cb->truncate_seq;
9980 obs.oi.truncate_size = cb->truncate_size;
9981
9982 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9983 ctx->mtime = utime_t();
9984
9985 ctx->extra_reqids = cb->results->reqids;
9986 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9987
9988 // cache: clear whiteout?
9989 if (obs.oi.is_whiteout()) {
9990 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9991 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9992 --ctx->delta_stats.num_whiteouts;
9993 }
9994
9995 if (cb->results->has_omap) {
9996 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9997 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9998 ctx->clean_regions.mark_omap_dirty();
9999 } else {
10000 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
10001 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10002 }
10003
10004 interval_set<uint64_t> ch;
10005 if (obs.oi.size > 0)
10006 ch.insert(0, obs.oi.size);
10007 ctx->modified_ranges.union_of(ch);
10008 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
10009
10010 if (cb->get_data_size() != obs.oi.size) {
10011 ctx->delta_stats.num_bytes -= obs.oi.size;
10012 obs.oi.size = cb->get_data_size();
10013 ctx->delta_stats.num_bytes += obs.oi.size;
10014 }
10015 ctx->delta_stats.num_wr++;
10016 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
10017
10018 osd->logger->inc(l_osd_copyfrom);
10019 }
10020
10021 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
10022 ObjectContextRef obc)
10023 {
10024 const hobject_t& soid = obc->obs.oi.soid;
10025 dout(10) << __func__ << " " << soid << " r=" << r
10026 << " uv" << results->user_version << dendl;
10027
10028 if (r == -ECANCELED) {
10029 return;
10030 }
10031
10032 if (r != -ENOENT && soid.is_snap()) {
10033 if (results->snaps.empty()) {
10034 // we must have read "snap" content from the head object in the
10035 // base pool. use snap_seq to construct what snaps should be
10036 // for this clone (what it was before we evicted the clean clone
10037 // from this pool, and what it will be when we flush and the
10038 // clone eventually happens in the base pool). we want to use
10039 // snaps in (results->snap_seq,soid.snap]
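// e.g. (illustrative) with soid.snap == 4, results->snap_seq == 2, and
// clone_snaps {4: [4,3], 2: [2,1]}: clone 4 contributes snaps 4 and 3
// (both in (2,4]), clone 2 stops at snap 2 (<= snap_seq), leaving
// results->snaps == [4,3]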
10040 SnapSet& snapset = obc->ssc->snapset;
10041 for (auto p = snapset.clone_snaps.rbegin();
10042 p != snapset.clone_snaps.rend();
10043 ++p) {
10044 for (auto snap : p->second) {
10045 if (snap > soid.snap) {
10046 continue;
10047 }
10048 if (snap <= results->snap_seq) {
10049 break;
10050 }
10051 results->snaps.push_back(snap);
10052 }
10053 }
10054 }
10055
10056 dout(20) << __func__ << " snaps " << results->snaps << dendl;
10057 filter_snapc(results->snaps);
10058
10059 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
10060 if (results->snaps.empty()) {
10061 dout(20) << __func__
10062 << " snaps are empty, clone is invalid,"
10063 << " setting r to ENOENT" << dendl;
10064 r = -ENOENT;
10065 }
10066 }
10067
10068 if (r < 0 && results->started_temp_obj) {
10069 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
10070 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
10071 ceph_assert(tempobc);
10072 OpContextUPtr ctx = simple_opc_create(tempobc);
10073 ctx->op_t->remove(results->temp_oid);
10074 simple_opc_submit(std::move(ctx));
10075 results->started_temp_obj = false;
10076 }
10077
10078 if (r == -ENOENT && soid.is_snap()) {
10079 dout(10) << __func__
10080 << ": enoent while trying to promote clone, " << soid
10081 << " must have been trimmed, removing from snapset"
10082 << dendl;
10083 hobject_t head(soid.get_head());
10084 ObjectContextRef obc = get_object_context(head, false);
10085 ceph_assert(obc);
10086
10087 OpContextUPtr tctx = simple_opc_create(obc);
10088 tctx->at_version = get_next_version();
10089 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
10090 filter_snapc(tctx->new_snapset.snaps);
10091 } else {
10092 tctx->new_snapset.snaps.clear();
10093 }
10094 vector<snapid_t> new_clones;
10095 map<snapid_t, vector<snapid_t>> new_clone_snaps;
10096 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
10097 i != tctx->new_snapset.clones.end();
10098 ++i) {
10099 if (*i != soid.snap) {
10100 new_clones.push_back(*i);
10101 auto p = tctx->new_snapset.clone_snaps.find(*i);
10102 if (p != tctx->new_snapset.clone_snaps.end()) {
10103 new_clone_snaps[*i] = p->second;
10104 }
10105 }
10106 }
10107 tctx->new_snapset.clones.swap(new_clones);
10108 tctx->new_snapset.clone_overlap.erase(soid.snap);
10109 tctx->new_snapset.clone_size.erase(soid.snap);
10110 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
10111
10112 // take RWWRITE lock for duration of our local write. ignore starvation.
10113 if (!tctx->lock_manager.take_write_lock(
10114 head,
10115 obc)) {
10116 ceph_abort_msg("problem!");
10117 }
10118 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10119
10120 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10121
10122 simple_opc_submit(std::move(tctx));
10123 return;
10124 }
10125
10126 bool whiteout = false;
10127 if (r == -ENOENT) {
10128 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
10129 dout(10) << __func__ << " whiteout " << soid << dendl;
10130 whiteout = true;
10131 }
10132
10133 if (r < 0 && !whiteout) {
10134 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10135 // pass error to everyone blocked on this object
10136 // FIXME: this is pretty sloppy, but at this point we got
10137 // something unexpected and don't have many other options.
10138 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10139 waiting_for_blocked_object.find(soid);
10140 if (blocked_iter != waiting_for_blocked_object.end()) {
10141 while (!blocked_iter->second.empty()) {
10142 osd->reply_op_error(blocked_iter->second.front(), r);
10143 blocked_iter->second.pop_front();
10144 }
10145 waiting_for_blocked_object.erase(blocked_iter);
10146 }
10147 return;
10148 }
10149
10150 osd->promote_finish(results->object_size);
10151
10152 OpContextUPtr tctx = simple_opc_create(obc);
10153 tctx->at_version = get_next_version();
10154
10155 if (!obc->obs.oi.has_manifest()) {
10156 ++tctx->delta_stats.num_objects;
10157 }
10158 if (soid.snap < CEPH_NOSNAP)
10159 ++tctx->delta_stats.num_object_clones;
10160 tctx->new_obs.exists = true;
10161
10162 tctx->extra_reqids = results->reqids;
10163 tctx->extra_reqid_return_codes = results->reqid_return_codes;
10164
10165 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
10166 tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
10167 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
10168 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
10169 tctx->new_obs.oi.manifest.redirect_target = hobject_t();
10170 tctx->delta_stats.num_objects_manifest--;
10171 if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
10172 dec_all_refcount_manifest(obc->obs.oi, tctx.get());
10173 }
10174 }
10175
10176 if (whiteout) {
10177 // create a whiteout
10178 tctx->op_t->create(soid);
10179 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
10180 ++tctx->delta_stats.num_whiteouts;
10181 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
10182 osd->logger->inc(l_osd_tier_whiteout);
10183 } else {
10184 if (results->has_omap) {
10185 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
10186 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
10187 ++tctx->delta_stats.num_objects_omap;
10188 }
10189
10190 results->fill_in_final_tx(tctx->op_t.get());
10191 if (results->started_temp_obj) {
10192 tctx->discard_temp_oid = results->temp_oid;
10193 }
10194 tctx->new_obs.oi.size = results->object_size;
10195 tctx->new_obs.oi.user_version = results->user_version;
10196 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
10197 tctx->mtime = utime_t();
10198 if (results->is_data_digest()) {
10199 tctx->new_obs.oi.set_data_digest(results->data_digest);
10200 } else {
10201 tctx->new_obs.oi.clear_data_digest();
10202 }
10203 if (results->object_size)
10204 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
10205 if (results->is_omap_digest()) {
10206 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
10207 } else {
10208 tctx->new_obs.oi.clear_omap_digest();
10209 }
10210 if (results->has_omap)
10211 tctx->clean_regions.mark_omap_dirty();
10212 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
10213 tctx->new_obs.oi.truncate_size = results->truncate_size;
10214
10215 if (soid.snap != CEPH_NOSNAP) {
10216 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
10217 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
10218 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
10219 results->object_size);
10220 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
10221
10222 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
10223 } else {
10224 tctx->delta_stats.num_bytes += results->object_size;
10225 }
10226 }
10227
10228 if (results->mirror_snapset) {
10229 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
10230 tctx->new_snapset.from_snap_set(
10231 results->snapset,
10232 get_osdmap()->require_osd_release < ceph_release_t::luminous);
10233 }
10234 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
10235
10236 // take RWWRITE lock for duration of our local write. ignore starvation.
10237 if (!tctx->lock_manager.take_write_lock(
10238 obc->obs.oi.soid,
10239 obc)) {
10240 ceph_abort_msg("problem!");
10241 }
10242 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10243
10244 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10245
10246 simple_opc_submit(std::move(tctx));
10247
10248 osd->logger->inc(l_osd_tier_promote);
10249
10250 if (agent_state &&
10251 agent_state->is_idle())
10252 agent_choose_mode();
10253 }
10254
10255 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
10256 ObjectContextRef obc)
10257 {
10258 const hobject_t& soid = obc->obs.oi.soid;
10259 dout(10) << __func__ << " " << soid << " r=" << r
10260 << " uv" << results->user_version << dendl;
10261
10262 if (r == -ECANCELED || r == -EAGAIN) {
10263 return;
10264 }
10265
10266 if (r < 0) {
10267 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10268 // pass error to everyone blocked on this object
10269 // FIXME: this is pretty sloppy, but at this point we got
10270 // something unexpected and don't have many other options.
10271 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10272 waiting_for_blocked_object.find(soid);
10273 if (blocked_iter != waiting_for_blocked_object.end()) {
10274 while (!blocked_iter->second.empty()) {
10275 osd->reply_op_error(blocked_iter->second.front(), r);
10276 blocked_iter->second.pop_front();
10277 }
10278 waiting_for_blocked_object.erase(blocked_iter);
10279 }
10280 return;
10281 }
10282
10283 osd->promote_finish(results->object_size);
10284 osd->logger->inc(l_osd_tier_promote);
10285
10286 if (agent_state &&
10287 agent_state->is_idle())
10288 agent_choose_mode();
10289 }
10290
10291 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
10292 vector<ceph_tid_t> *tids)
10293 {
10294 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
10295 << " from " << cop->src << " " << cop->oloc
10296 << " v" << cop->results.user_version << dendl;
10297
10298 // cancel objecter op, if we can
10299 if (cop->objecter_tid) {
10300 tids->push_back(cop->objecter_tid);
10301 cop->objecter_tid = 0;
10302 if (cop->objecter_tid2) {
10303 tids->push_back(cop->objecter_tid2);
10304 cop->objecter_tid2 = 0;
10305 }
10306 }
10307
10308 copy_ops.erase(cop->obc->obs.oi.soid);
10309 cop->obc->stop_block();
10310
10311 kick_object_context_blocked(cop->obc);
10312 cop->results.should_requeue = requeue;
10313 CopyCallbackResults result(-ECANCELED, &cop->results);
10314 cop->cb->complete(result);
10315
10316 // There may still be an objecter callback referencing this copy op.
10317 // That callback will not need the obc since it's been canceled, and
10318 // we need the obc reference to go away prior to flush.
10319 cop->obc = ObjectContextRef();
10320 }
10321
10322 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
10323 {
10324 dout(10) << __func__ << dendl;
10325 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
10326 while (p != copy_ops.end()) {
10327 // requeue this op? can I queue up all of them?
10328 cancel_copy((p++)->second, requeue, tids);
10329 }
10330 }
10331
10332 struct C_gather : public Context {
10333 PrimaryLogPGRef pg;
10334 hobject_t oid;
10335 epoch_t last_peering_reset;
10336 OSDOp *osd_op;
10337 C_gather(PrimaryLogPG *pg_, hobject_t oid_, epoch_t lpr_, OSDOp *osd_op_) :
10338 pg(pg_), oid(oid_), last_peering_reset(lpr_), osd_op(osd_op_) {}
10339 void finish(int r) override {
10340 if (r == -ECANCELED)
10341 return;
10342 std::scoped_lock locker{*pg};
10343 auto p = pg->cls_gather_ops.find(oid);
10344 if (p == pg->cls_gather_ops.end()) {
10345 // op was cancelled
10346 return;
10347 }
10348 if (last_peering_reset != pg->get_last_peering_reset()) {
10349 return;
10350 }
10351 osd_op->rval = r;
10352 PrimaryLogPG::OpContext *ctx = p->second.ctx;
10353 pg->cls_gather_ops.erase(p);
10354 pg->execute_ctx(ctx);
10355 }
10356 };
10357
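// start_cls_gather fans a class-method call out to every source object in
// src_obj_buffs via the objecter, gathers each reply into the caller's
// bufferlist, and, once all sub-ops have completed, re-executes the
// original op context from C_gather::finish().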
10358 int PrimaryLogPG::start_cls_gather(OpContext *ctx, std::map<std::string, bufferlist> *src_obj_buffs, const std::string& pool,
10359 const char *cls, const char *method, bufferlist& inbl)
10360 {
10361 OpRequestRef op = ctx->op;
10362 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
10363
10364 auto pool_id = osd->objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name), pool);
10365 object_locator_t oloc(pool_id);
10366
10367 ObjectState& obs = ctx->new_obs;
10368 object_info_t& oi = obs.oi;
10369 const hobject_t& soid = oi.soid;
10370
10371 ObjectContextRef obc = get_object_context(soid, false);
10372 C_GatherBuilder gather(cct);
10373
10374 auto [iter, inserted] = cls_gather_ops.emplace(soid, CLSGatherOp(ctx, obc, op));
10375 ceph_assert(inserted);
10376 auto &cgop = iter->second;
10377 for (std::map<std::string, bufferlist>::iterator it = src_obj_buffs->begin(); it != src_obj_buffs->end(); it++) {
10378 std::string oid = it->first;
10379 ObjectOperation obj_op;
10380 obj_op.call(cls, method, inbl);
10381 uint32_t flags = 0;
10382 ceph_tid_t tid = osd->objecter->read(
10383 object_t(oid), oloc, obj_op,
10384 m->get_snapid(), &it->second,
10385 flags, gather.new_sub());
10386 cgop.objecter_tids.push_back(tid);
10387 dout(10) << __func__ << " src=" << oid << ", tgt=" << soid << dendl;
10388 }
10389
10390 C_gather *fin = new C_gather(this, soid, get_last_peering_reset(), &(*ctx->ops)[ctx->current_osd_subop_num]);
10391 gather.set_finisher(new C_OnFinisher(fin,
10392 osd->get_objecter_finisher(get_pg_shard())));
10393 gather.activate();
10394
10395 return -EINPROGRESS;
10396 }
10397
10398 // ========================================================================
10399 // flush
10400 //
10401 // Flush a dirty object in the cache tier by writing it back to the
10402 // base tier. The sequence looks like:
10403 //
10404 // * send a copy-from operation to the base tier to copy the current
10405 // version of the object
10406 // * base tier will pull the object via (perhaps multiple) copy-get(s)
10407 // * on completion, we check if the object has been modified. if so,
10408 // just reply with -EAGAIN.
10409 // * try to take a write lock so we can clear the dirty flag. if this
10410 // fails, wait and retry
10411 // * start a repop that clears the bit.
10412 //
10413 // If we have to wait, we will retry by coming back through the
10414 // start_flush method. We check if a flush is already in progress
10415 // and, if so, try to finish it by rechecking the version and trying
10416 // to clear the dirty bit.
10417 //
10418 // In order for the cache-flush (a write op) to not block the copy-get
10419 // from reading the object, the client *must* set the SKIPRWLOCKS
10420 // flag.
10421 //
10422 // NOTE: normally writes are strictly ordered for the client, but
10423 // flushes are special in that they can be reordered with respect to
10424 // other writes. In particular, we can't have a flush request block
10425 // an update to the cache pool object!
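//
// A condensed sketch of the op the flush path issues (see start_flush()
// below for the authoritative version; this is illustrative only):
//
//   ObjectOperation o;
//   o.copy_from(soid.oid.name, soid.snap, base_oloc, oi.user_version,
//               CEPH_OSD_COPY_FROM_FLAG_FLUSH |
//               CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
//               CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
//               CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
//               fadvise_flags);
//   osd->objecter->mutate(soid.oid, base_oloc, o, snapc, mtime, flags,
//                         new C_OnFinisher(new C_Flush(...), ...));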
10426
10427 struct C_Flush : public Context {
10428 PrimaryLogPGRef pg;
10429 hobject_t oid;
10430 epoch_t last_peering_reset;
10431 ceph_tid_t tid;
10432 utime_t start;
10433 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
10434 : pg(p), oid(o), last_peering_reset(lpr),
10435 tid(0), start(ceph_clock_now())
10436 {}
10437 void finish(int r) override {
10438 if (r == -ECANCELED)
10439 return;
10440 std::scoped_lock locker{*pg};
10441 if (last_peering_reset == pg->get_last_peering_reset()) {
10442 pg->finish_flush(oid, tid, r);
10443 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
10444 }
10445 }
10446 };
10447
10448 int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
10449 {
10450 const object_info_t& oi = obc->obs.oi;
10451 const hobject_t& soid = oi.soid;
10452
10453 ceph_assert(obc->is_blocked());
10454 if (oi.size == 0) {
10455 // evicted
10456 return 0;
10457 }
10458 if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10459 dout(0) << " fingerprint algorithm is not set " << dendl;
10460 return -EINVAL;
10461 }
10462
10463 /*
10464 * The operations that create dedup chunks are tracked by a ManifestOp,
10465 * which is finished once all of those operations have completed.
10466 */
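// For orientation, the dedup pipeline implemented by the helpers below is:
// 1. do_cdc() splits the object into content-defined chunks;
// 2. get_fpoid_from_chunk() maps each chunk to a fingerprint oid in the
// dedup tier;
// 3. refcount_manifest(..., CREATE_OR_GET_REF, ...) issues one op per new
// chunk, completing into C_SetDedupChunks;
// 4. finish_set_dedup() installs the new chunk_map and drops references to
// the old chunks.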
10467 ManifestOpRef mop(std::make_shared<ManifestOp>());
10468
10469 // cdc
10470 std::map<uint64_t, bufferlist> chunks;
10471 int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
10472 if (r < 0) {
10473 return r;
10474 }
10475 if (!chunks.size()) {
10476 return 0;
10477 }
10478
10479 // The chunks issued here differ from the newly generated chunk_map,
10480 // because chunks that already exist in the previous snap are not issued.
10481 // So we need two data structures: the issued chunk list, to track issued
10482 // operations, and the new chunk_map, with which to replace the old
10483 // chunk_map once all operations have finished.
10484 object_ref_delta_t refs;
10485 ObjectContextRef obc_l, obc_g;
10486 get_adjacent_clones(obc, obc_l, obc_g);
10487 // skip if the same content exists in the prev snap at the same offset
10488 mop->new_manifest.calc_refs_to_inc_on_set(
10489 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10490 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10491 refs);
10492
10493 for (auto p : chunks) {
10494 hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
10495 if (refs.find(target) == refs.end()) {
10496 continue;
10497 }
10498 C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
10499 ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
10500 fin, std::move(chunks[p.first]));
10501 mop->chunks[target] = make_pair(p.first, p.second.length());
10502 mop->num_chunks++;
10503 mop->tids[p.first] = tid;
10504 fin->tid = tid;
10505 dout(10) << __func__ << " oid: " << soid << " tid: " << tid
10506 << " target: " << target << " offset: " << p.first
10507 << " length: " << p.second.length() << dendl;
10508 }
10509
10510 if (mop->tids.size()) {
10511 manifest_ops[soid] = mop;
10512 manifest_ops[soid]->op = op;
10513 } else {
10514 // no chunk ops were issued; nothing to wait for
10515 return 0;
10516 }
10517
10518 return -EINPROGRESS;
10519 }
10520
10521 int PrimaryLogPG::do_cdc(const object_info_t& oi,
10522 std::map<uint64_t, chunk_info_t>& chunk_map,
10523 std::map<uint64_t, bufferlist>& chunks)
10524 {
10525 string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
10526 int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
10527 uint64_t total_length = 0;
10528
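// CDC::create() takes the target chunk size as a power-of-two exponent;
// for a power-of-two chunk_size, cbits(chunk_size) - 1 recovers that
// exponent, e.g. cbits(16384) - 1 == 14 (16384 == 2^14)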
10529 std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
10530 if (!cdc) {
10531 dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
10532 return -EINVAL;
10533 }
10534
10535 bufferlist bl;
10536 /**
10537 * We do not allow an EC pool as the base tier of distributed dedup,
10538 * because EC pools do not support objects_read_sync(); supporting them
10539 * would require substantial changes to the current implementation.
10540 * As a result, we leave this as future work.
10541 */
10542 int r = pgbackend->objects_read_sync(
10543 oi.soid, 0, oi.size, 0, &bl);
10544 if (r < 0) {
10545 dout(0) << __func__ << " read fail " << oi.soid
10546 << " len: " << oi.size << " r: " << r << dendl;
10547 return r;
10548 }
10549 if (bl.length() != oi.size) {
10550 dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
10551 << oi.size << " during chunking " << dendl;
10552 return -EIO;
10553 }
10554
10555 dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
10556 << " oi.size: " << oi.size
10557 << " chunk_size: " << chunk_size << dendl;
10558
10559 vector<pair<uint64_t, uint64_t>> cdc_chunks;
10560 cdc->calc_chunks(bl, &cdc_chunks);
10561
10562 // get fingerprint
10563 for (auto p : cdc_chunks) {
10564 bufferlist chunk;
10565 chunk.substr_of(bl, p.first, p.second);
10566 hobject_t target = get_fpoid_from_chunk(oi.soid, chunk);
10567 chunks[p.first] = std::move(chunk);
10568 chunk_map[p.first] = chunk_info_t(0, p.second, target);
10569 total_length += p.second;
10570 }
10571 return total_length;
10572 }
10573
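// The fingerprint oid is the hex digest of the chunk payload, so identical
// chunks from any object map to the same backing object in the dedup tier.
// For example (illustrative), a chunk whose SHA-1 digest is 2fd4e1c6...
// would be stored as object "2fd4e1c6..." in pool get_dedup_tier().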
10574 hobject_t PrimaryLogPG::get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk)
10575 {
10576 pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
10577 if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10578 return hobject_t();
10579 }
10580 object_t fp_oid = [&fp_algo, &chunk]() -> string {
10581 switch (fp_algo) {
10582 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
10583 return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
10584 case pg_pool_t::TYPE_FINGERPRINT_SHA256:
10585 return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
10586 case pg_pool_t::TYPE_FINGERPRINT_SHA512:
10587 return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
10588 default:
10589 assert(0 == "unrecognized fingerprint type");
10590 return {};
10591 }
10592 }();
10593
10594 pg_t raw_pg;
10595 object_locator_t oloc(soid);
10596 oloc.pool = pool.info.get_dedup_tier();
10597 get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
10598 hobject_t target(fp_oid, oloc.key, snapid_t(),
10599 raw_pg.ps(), raw_pg.pool(),
10600 oloc.nspace);
10601 return target;
10602 }
10603
10604 int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10605 {
10606 dout(10) << __func__ << " " << oid << " tid " << tid
10607 << " " << cpp_strerror(r) << dendl;
10608 map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10609 if (p == manifest_ops.end()) {
10610 dout(10) << __func__ << " no manifest_op found" << dendl;
10611 return -EINVAL;
10612 }
10613 ManifestOpRef mop = p->second;
10614 mop->results[offset] = r;
10615 if (r < 0) {
10616 // if any failure occurs, record it in results[0] so it is recognized later
10617 mop->results[0] = r;
10618 }
10619 if (mop->num_chunks != mop->results.size()) {
10620 // there is still work in flight
10621 return -EINPROGRESS;
10622 }
10623 ObjectContextRef obc = get_object_context(oid, false);
10624 if (!obc) {
10625 if (mop->op)
10626 osd->reply_op_error(mop->op, -EINVAL);
10627 return -EINVAL;
10628 }
10629 ceph_assert(obc->is_blocked());
10630 obc->stop_block();
10631 kick_object_context_blocked(obc);
10632 if (mop->results[0] < 0) {
10633 // a previous op failed
10634 ceph_assert(mop->num_chunks == mop->results.size());
10635 manifest_ops.erase(oid);
10636 osd->reply_op_error(mop->op, mop->results[0]);
10637 return -EIO;
10638 }
10639
10640 if (mop->chunks.size()) {
10641 OpContextUPtr ctx = simple_opc_create(obc);
10642 ceph_assert(ctx);
10643 if (ctx->lock_manager.get_lock_type(
10644 RWState::RWWRITE,
10645 oid,
10646 obc,
10647 mop->op)) {
10648 dout(20) << __func__ << " took write lock" << dendl;
10649 } else if (mop->op) {
10650 dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
10651 close_op_ctx(ctx.release());
10652 return -EAGAIN;
10653 }
10654
10655 ctx->at_version = get_next_version();
10656 ctx->new_obs = obc->obs;
10657 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10658 --ctx->delta_stats.num_objects_dirty;
10659
10660 /*
10661 * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10662 * head: [0, 2) aaa <-- tier_flush()
10663 * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10664 *
10665 * In this case, if the new chunk_map is as follows,
10666 * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10667 * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10668 * So, the procedure is
10669 * 1. calc_refs_to_drop_on_removal()
10670 * 2. register old references to drop after tier_flush() is committed
10671 * 3. update new chunk_map
10672 */
10673
10674 ObjectCleanRegions c_regions = ctx->clean_regions;
10675 ObjectContextRef cobc = get_prev_clone_obc(obc);
10676 c_regions.mark_fully_dirty();
10677 // CDC was run over the entire range of the manifest object,
10678 // so the first thing to do here is to drop the references to the old chunks
10679 ObjectContextRef obc_l, obc_g;
10680 get_adjacent_clones(obc, obc_l, obc_g);
10681 // clear all old references
10682 object_ref_delta_t refs;
10683 ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
10684 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10685 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10686 refs);
10687 if (!refs.is_empty()) {
10688 ctx->register_on_commit(
10689 [oid, this, refs](){
10690 dec_refcount(oid, refs);
10691 });
10692 }
10693
10694 // set new references
10695 ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
10696
10697 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10698 simple_opc_submit(std::move(ctx));
10699 }
10700 if (mop->op)
10701 osd->reply_op_error(mop->op, r);
10702
10703 manifest_ops.erase(oid);
10704 return 0;
10705 }
10706
10707 int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10708 {
10709 dout(10) << __func__ << " " << oid << " tid " << tid
10710 << " " << cpp_strerror(r) << dendl;
10711 map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10712 if (p == manifest_ops.end()) {
10713 dout(10) << __func__ << " no manifest_op found" << dendl;
10714 return -EINVAL;
10715 }
10716 ManifestOpRef mop = p->second;
10717 mop->results[offset] = r;
10718 if (r < 0) {
10719     // if any failure occurs, mark results[0] so the failure is recognized later
10720 mop->results[0] = r;
10721 }
10722 if (mop->num_chunks != mop->results.size()) {
10723     // there is still ongoing work
10724 return -EINPROGRESS;
10725 }
10726
10727 if (mop->cb) {
10728 mop->cb->complete(r);
10729 }
10730
10731 manifest_ops.erase(p);
10732 mop.reset();
10733
10734 return 0;
10735 }
10736
10737 int PrimaryLogPG::start_flush(
10738 OpRequestRef op, ObjectContextRef obc,
10739 bool blocking, hobject_t *pmissing,
10740 std::optional<std::function<void()>> &&on_flush)
10741 {
10742 const object_info_t& oi = obc->obs.oi;
10743 const hobject_t& soid = oi.soid;
10744 dout(10) << __func__ << " " << soid
10745 << " v" << oi.version
10746 << " uv" << oi.user_version
10747 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
10748 << dendl;
10749
10750 bool preoctopus_compat =
10751 get_osdmap()->require_osd_release < ceph_release_t::octopus;
10752 SnapSet snapset;
10753 if (preoctopus_compat) {
10754 // for pre-octopus compatibility, filter SnapSet::snaps. not
10755 // certain we need this, but let's be conservative.
10756 snapset = obc->ssc->snapset.get_filtered(pool.info);
10757 } else {
10758 // NOTE: change this to a const ref when we remove this compat code
10759 snapset = obc->ssc->snapset;
10760 }
10761
10762 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10763 // current dedup tier only supports blocking operation
10764 if (!blocking) {
10765 return -EOPNOTSUPP;
10766 }
10767 }
10768
10769   // verify there are no (older) dirty clones
10770 {
10771 dout(20) << " snapset " << snapset << dendl;
10772 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
10773 while (p != snapset.clones.rend() && *p >= soid.snap)
10774 ++p;
10775 if (p != snapset.clones.rend()) {
10776 hobject_t next = soid;
10777 next.snap = *p;
10778 ceph_assert(next.snap < soid.snap);
10779 if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
10780 dout(10) << __func__ << " missing clone is " << next << dendl;
10781 if (pmissing)
10782 *pmissing = next;
10783 return -ENOENT;
10784 }
10785 ObjectContextRef older_obc = get_object_context(next, false);
10786 if (older_obc) {
10787 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
10788 << dendl;
10789 if (older_obc->obs.oi.is_dirty()) {
10790 dout(10) << __func__ << " next oldest clone is dirty: "
10791 << older_obc->obs.oi << dendl;
10792 return -EBUSY;
10793 }
10794 } else {
10795 dout(20) << __func__ << " next oldest clone " << next
10796 << " is not present; implicitly clean" << dendl;
10797 }
10798 } else {
10799 dout(20) << __func__ << " no older clones" << dendl;
10800 }
10801 }
10802
10803 if (blocking)
10804 obc->start_block();
10805
10806 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10807 if (p != flush_ops.end()) {
10808 FlushOpRef fop = p->second;
10809 if (fop->op == op) {
10810 // we couldn't take the write lock on a cache-try-flush before;
10811 // now we are trying again for the lock.
10812 return try_flush_mark_clean(fop);
10813 }
10814 if (fop->flushed_version == obc->obs.oi.user_version &&
10815 (fop->blocking || !blocking)) {
10816 // nonblocking can join anything
10817 // blocking can only join a blocking flush
10818 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10819 if (op)
10820 fop->dup_ops.push_back(op);
10821 return -EAGAIN; // clean up this ctx; op will retry later
10822 }
10823
10824 // cancel current flush since it will fail anyway, or because we
10825 // are blocking and the existing flush is nonblocking.
10826 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10827 if (fop->op)
10828 osd->reply_op_error(fop->op, -EBUSY);
10829 while (!fop->dup_ops.empty()) {
10830 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10831 fop->dup_ops.pop_front();
10832 }
10833 vector<ceph_tid_t> tids;
10834 cancel_flush(fop, false, &tids);
10835 osd->objecter->op_cancel(tids, -ECANCELED);
10836 }
10837
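  // manifest (chunked) objects are flushed through the dedup machinery
  // rather than the cache-tier delete/copy-from path that follows.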
10838 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10839 int r = start_dedup(op, obc);
10840 if (r != -EINPROGRESS) {
10841 if (blocking)
10842 obc->stop_block();
10843 }
10844 return r;
10845 }
10846
10847 /**
10848 * In general, we need to send a delete and a copyfrom.
10849 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10850 * where 4 is marked as clean. To flush 10, we have to:
10851 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10852 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10853 *
10854    * There is a complicating case. Suppose there had been a clone 7
10855    * for snaps [7, 6], since trimmed because those snaps no longer exist.
10856 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10857 * the delete, the snap will be promoted to 5, and the head will become
10858 * a whiteout. When the copy-from goes through, we'll end up with
10859 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10860 *
10861 * Another complication is the case where there is an interval change
10862 * after doing the delete and the flush but before marking the object
10863 * clean. We'll happily delete head and then recreate it at the same
10864 * sequence number, which works out ok.
10865 */
10866
10867 SnapContext snapc, dsnapc;
10868 if (snapset.seq != 0) {
10869 if (soid.snap == CEPH_NOSNAP) {
10870 snapc = snapset.get_ssc_as_of(snapset.seq);
10871 } else {
10872 snapid_t min_included_snap;
10873 auto p = snapset.clone_snaps.find(soid.snap);
10874 ceph_assert(p != snapset.clone_snaps.end());
10875 min_included_snap = p->second.back();
10876 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10877 }
10878
10879 snapid_t prev_snapc = 0;
10880 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10881 citer != snapset.clones.rend();
10882 ++citer) {
10883 if (*citer < soid.snap) {
10884 prev_snapc = *citer;
10885 break;
10886 }
10887 }
10888
10889 dsnapc = snapset.get_ssc_as_of(prev_snapc);
10890 }
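  // Worked example, continuing the comment above: flushing clone 10
  // (covering snaps [10, 9]) gives min_included_snap = 9, so
  // snapc = as-of(8) = 8:[8,4,3,2]; the newest clone older than 10 is 4,
  // so dsnapc = as-of(4) = 4:[4,3,2].  dsnapc.seq < snapc.seq, so the
  // delete below is issued with dsnapc and the copy-from with snapc.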
10891
10892 object_locator_t base_oloc(soid);
10893 base_oloc.pool = pool.info.tier_of;
10894
10895 if (dsnapc.seq < snapc.seq) {
10896 ObjectOperation o;
10897 o.remove();
10898 osd->objecter->mutate(
10899 soid.oid,
10900 base_oloc,
10901 o,
10902 dsnapc,
10903 ceph::real_clock::from_ceph_timespec(oi.mtime),
10904 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10905 CEPH_OSD_FLAG_ENFORCE_SNAPC),
10906 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10907 }
10908
10909 FlushOpRef fop(std::make_shared<FlushOp>());
10910 fop->obc = obc;
10911 fop->flushed_version = oi.user_version;
10912 fop->blocking = blocking;
10913 fop->on_flush = std::move(on_flush);
10914 fop->op = op;
10915
10916 ObjectOperation o;
10917 if (oi.is_whiteout()) {
10918 fop->removal = true;
10919 o.remove();
10920 } else {
10921 object_locator_t oloc(soid);
10922 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10923 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10924 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10925 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10926 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10927 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10928
10929     // hint that the base tier need not cache the data after this flush
10930 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10931 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10932 }
10933 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10934
10935 ceph_tid_t tid = osd->objecter->mutate(
10936 soid.oid, base_oloc, o, snapc,
10937 ceph::real_clock::from_ceph_timespec(oi.mtime),
10938 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10939 new C_OnFinisher(fin,
10940 osd->get_objecter_finisher(get_pg_shard())));
10941   /* we're under the pg lock and fin->finish() grabs that same lock */
10942 fin->tid = tid;
10943 fop->objecter_tid = tid;
10944
10945 flush_ops[soid] = fop;
10946
10947 recovery_state.update_stats(
10948 [&oi](auto &history, auto &stats) {
10949 stats.stats.sum.num_flush++;
10950 stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10951 return false;
10952 });
10953 return -EINPROGRESS;
10954 }
10955
10956 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10957 {
10958 dout(10) << __func__ << " " << oid << " tid " << tid
10959 << " " << cpp_strerror(r) << dendl;
10960 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10961 if (p == flush_ops.end()) {
10962 dout(10) << __func__ << " no flush_op found" << dendl;
10963 return;
10964 }
10965 FlushOpRef fop = p->second;
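  // a stale tid means this completion belongs to a flush we already
  // cancelled or replaced (cancel_flush() zeroes objecter_tid), so
  // ignore it; manifest flushes track multiple tids (fop->io_tids),
  // hence the single-tid check is skipped for them.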
10966 if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10967 dout(10) << __func__ << " tid " << tid << " != fop " << fop
10968 << " tid " << fop->objecter_tid << dendl;
10969 return;
10970 }
10971 ObjectContextRef obc = fop->obc;
10972 fop->objecter_tid = 0;
10973
10974 if (r < 0 && !(r == -ENOENT && fop->removal)) {
10975 if (fop->op)
10976 osd->reply_op_error(fop->op, -EBUSY);
10977 if (fop->blocking) {
10978 obc->stop_block();
10979 kick_object_context_blocked(obc);
10980 }
10981
10982 if (!fop->dup_ops.empty()) {
10983 dout(20) << __func__ << " requeueing dups" << dendl;
10984 requeue_ops(fop->dup_ops);
10985 }
10986 if (fop->on_flush) {
10987 (*(fop->on_flush))();
10988 fop->on_flush = std::nullopt;
10989 }
10990 flush_ops.erase(oid);
10991 return;
10992 }
10993
10994 r = try_flush_mark_clean(fop);
10995 if (r == -EBUSY && fop->op) {
10996 osd->reply_op_error(fop->op, r);
10997 }
10998 }
10999
11000 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
11001 {
11002 ObjectContextRef obc = fop->obc;
11003 const hobject_t& oid = obc->obs.oi.soid;
11004
11005 if (fop->blocking) {
11006 obc->stop_block();
11007 kick_object_context_blocked(obc);
11008 }
11009
11010 if (fop->flushed_version != obc->obs.oi.user_version ||
11011 !obc->obs.exists) {
11012 if (obc->obs.exists)
11013 dout(10) << __func__ << " flushed_version " << fop->flushed_version
11014 << " != current " << obc->obs.oi.user_version
11015 << dendl;
11016 else
11017 dout(10) << __func__ << " object no longer exists" << dendl;
11018
11019 if (!fop->dup_ops.empty()) {
11020 dout(20) << __func__ << " requeueing dups" << dendl;
11021 requeue_ops(fop->dup_ops);
11022 }
11023 if (fop->on_flush) {
11024 (*(fop->on_flush))();
11025 fop->on_flush = std::nullopt;
11026 }
11027 flush_ops.erase(oid);
11028 if (fop->blocking)
11029 osd->logger->inc(l_osd_tier_flush_fail);
11030 else
11031 osd->logger->inc(l_osd_tier_try_flush_fail);
11032 return -EBUSY;
11033 }
11034
11035 if (!fop->blocking &&
11036 m_scrubber->write_blocked_by_scrub(oid)) {
11037 if (fop->op) {
11038 dout(10) << __func__ << " blocked by scrub" << dendl;
11039 requeue_op(fop->op);
11040 requeue_ops(fop->dup_ops);
11041 return -EAGAIN; // will retry
11042 } else {
11043 osd->logger->inc(l_osd_tier_try_flush_fail);
11044 vector<ceph_tid_t> tids;
11045 cancel_flush(fop, false, &tids);
11046 osd->objecter->op_cancel(tids, -ECANCELED);
11047 return -ECANCELED;
11048 }
11049 }
11050
11051 // successfully flushed, can we evict this object?
11052 if (!obc->obs.oi.has_manifest() && !fop->op &&
11053 agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
11054 agent_maybe_evict(obc, true)) {
11055 osd->logger->inc(l_osd_tier_clean);
11056 if (fop->on_flush) {
11057 (*(fop->on_flush))();
11058 fop->on_flush = std::nullopt;
11059 }
11060 flush_ops.erase(oid);
11061 return 0;
11062 }
11063
11064 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
11065 OpContextUPtr ctx = simple_opc_create(fop->obc);
11066
11067 // successfully flushed; can we clear the dirty bit?
11068 // try to take the lock manually, since we don't
11069 // have a ctx yet.
11070 if (ctx->lock_manager.get_lock_type(
11071 RWState::RWWRITE,
11072 oid,
11073 obc,
11074 fop->op)) {
11075 dout(20) << __func__ << " took write lock" << dendl;
11076 } else if (fop->op) {
11077 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
11078 << fop->dup_ops << dendl;
11079 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
11080 for (auto op : fop->dup_ops) {
11081 bool locked = ctx->lock_manager.get_lock_type(
11082 RWState::RWWRITE,
11083 oid,
11084 obc,
11085 op);
11086 ceph_assert(!locked);
11087 }
11088 close_op_ctx(ctx.release());
11089 return -EAGAIN; // will retry
11090 } else {
11091 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
11092 close_op_ctx(ctx.release());
11093 osd->logger->inc(l_osd_tier_try_flush_fail);
11094 vector<ceph_tid_t> tids;
11095 cancel_flush(fop, false, &tids);
11096 osd->objecter->op_cancel(tids, -ECANCELED);
11097 return -ECANCELED;
11098 }
11099
11100 if (fop->on_flush) {
11101 ctx->register_on_finish(*(fop->on_flush));
11102 fop->on_flush = std::nullopt;
11103 }
11104
11105 ctx->at_version = get_next_version();
11106
11107 ctx->new_obs = obc->obs;
11108 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
11109 --ctx->delta_stats.num_objects_dirty;
11110 if (fop->obc->obs.oi.has_manifest()) {
11111 ceph_assert(obc->obs.oi.manifest.is_chunked());
11112 PGTransaction* t = ctx->op_t.get();
11113 uint64_t chunks_size = 0;
11114 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
11115 chunks_size += p.second.length;
11116 }
11117 if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
11118 t->omap_clear(oid);
11119 ctx->new_obs.oi.clear_omap_digest();
11120 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
11121 ctx->clean_regions.mark_omap_dirty();
11122 }
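    // If the chunk map covers every byte of the object (size == sum of
    // chunk lengths), the local copy can be dropped entirely: truncate
    // to zero and flag all chunks MISSING, leaving the data only in the
    // dedup tier.  Otherwise keep the local data and just mark the
    // chunks CLEAN.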
11123 if (obc->obs.oi.size == chunks_size) {
11124 t->truncate(oid, 0);
11125 interval_set<uint64_t> trim;
11126 trim.insert(0, ctx->new_obs.oi.size);
11127 ctx->modified_ranges.union_of(trim);
11128 truncate_update_size_and_usage(ctx->delta_stats,
11129 ctx->new_obs.oi,
11130 0);
11131 ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
11132 ctx->new_obs.oi.new_object();
11133 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
11134 p.second.set_flag(chunk_info_t::FLAG_MISSING);
11135 }
11136 } else {
11137 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
11138 dout(20) << __func__ << " offset: " << p.second.offset
11139 << " length: " << p.second.length << dendl;
11140 p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
11141 }
11142 }
11143 }
11144
11145 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
11146
11147 osd->logger->inc(l_osd_tier_clean);
11148
11149 if (!fop->dup_ops.empty() || fop->op) {
11150 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
11151 list<OpRequestRef> ls;
11152 if (fop->op)
11153 ls.push_back(fop->op);
11154 ls.splice(ls.end(), fop->dup_ops);
11155 requeue_ops(ls);
11156 }
11157
11158 simple_opc_submit(std::move(ctx));
11159
11160 flush_ops.erase(oid);
11161
11162 if (fop->blocking)
11163 osd->logger->inc(l_osd_tier_flush);
11164 else
11165 osd->logger->inc(l_osd_tier_try_flush);
11166
11167 return -EINPROGRESS;
11168 }
11169
11170 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
11171 vector<ceph_tid_t> *tids)
11172 {
11173 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
11174 << fop->objecter_tid << dendl;
11175 if (fop->objecter_tid) {
11176 tids->push_back(fop->objecter_tid);
11177 fop->objecter_tid = 0;
11178 }
11179 if (fop->io_tids.size()) {
11180 for (auto &p : fop->io_tids) {
11181 tids->push_back(p.second);
11182 p.second = 0;
11183 }
11184 }
11185 if (fop->blocking && fop->obc->is_blocked()) {
11186 fop->obc->stop_block();
11187 kick_object_context_blocked(fop->obc);
11188 }
11189 if (requeue) {
11190 if (fop->op)
11191 requeue_op(fop->op);
11192 requeue_ops(fop->dup_ops);
11193 }
11194 if (fop->on_flush) {
11195 (*(fop->on_flush))();
11196 fop->on_flush = std::nullopt;
11197 }
11198 flush_ops.erase(fop->obc->obs.oi.soid);
11199 }
11200
11201 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
11202 {
11203 dout(10) << __func__ << dendl;
11204 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
11205 while (p != flush_ops.end()) {
11206 cancel_flush((p++)->second, requeue, tids);
11207 }
11208 }
11209
11210 bool PrimaryLogPG::is_present_clone(hobject_t coid)
11211 {
11212 if (!pool.info.allow_incomplete_clones())
11213 return true;
11214 if (is_missing_object(coid))
11215 return true;
11216 ObjectContextRef obc = get_object_context(coid, false);
11217 return obc && obc->obs.exists;
11218 }
11219
11220 // ========================================================================
11221 // cls gather
11222 //
11223
11224 void PrimaryLogPG::cancel_cls_gather(map<hobject_t,CLSGatherOp>::iterator iter, bool requeue,
11225 vector<ceph_tid_t> *tids)
11226 {
11227 auto &cgop = iter->second;
11228 for (std::vector<ceph_tid_t>::iterator p = cgop.objecter_tids.begin(); p != cgop.objecter_tids.end(); p++) {
11229 tids->push_back(*p);
11230 dout(10) << __func__ << " " << cgop.obc->obs.oi.soid << " tid " << *p << dendl;
11231 }
11232 cgop.objecter_tids.clear();
11233 close_op_ctx(cgop.ctx);
11234 cgop.ctx = NULL;
11235 if (requeue) {
11236 if (cgop.op)
11237 requeue_op(cgop.op);
11238 }
11239 cls_gather_ops.erase(iter);
11240 }
11241
11242 void PrimaryLogPG::cancel_cls_gather_ops(bool requeue, vector<ceph_tid_t> *tids)
11243 {
11244 dout(10) << __func__ << dendl;
11245 map<hobject_t,CLSGatherOp>::iterator p = cls_gather_ops.begin();
11246 while (p != cls_gather_ops.end()) {
11247 cancel_cls_gather(p++, requeue, tids);
11248 }
11249 }
11250
11251 // ========================================================================
11252 // rep op gather
11253
11254 class C_OSD_RepopCommit : public Context {
11255 PrimaryLogPGRef pg;
11256 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
11257 public:
11258 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
11259 : pg(pg), repop(repop) {}
11260 void finish(int) override {
11261 pg->repop_all_committed(repop.get());
11262 }
11263 };
11264
11265 void PrimaryLogPG::repop_all_committed(RepGather *repop)
11266 {
11267 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
11268 << dendl;
11269 repop->all_committed = true;
11270 if (!repop->rep_aborted) {
11271 if (repop->v != eversion_t()) {
11272 recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
11273 }
11274 eval_repop(repop);
11275 }
11276 }
11277
11278 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
11279 {
11280 dout(10) << "op_applied version " << applied_version << dendl;
11281 ceph_assert(applied_version != eversion_t());
11282 ceph_assert(applied_version <= info.last_update);
11283 recovery_state.local_write_applied(applied_version);
11284
11285 if (is_primary() && m_scrubber) {
11286 // if there's a scrub operation waiting for the selected chunk to be fully updated -
11287 // allow it to continue
11288 m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied());
11289 }
11290 }
11291
11292 void PrimaryLogPG::eval_repop(RepGather *repop)
11293 {
11294 jspan span;
11295 if (repop->op) {
11296 span = tracing::osd::tracer.add_span(__func__, repop->op->osd_parent_span);
11297 }
11298 dout(10) << "eval_repop " << *repop
11299 << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
11300
11301 // ondisk?
11302 if (repop->all_committed) {
11303 dout(10) << " commit: " << *repop << dendl;
11304 for (auto p = repop->on_committed.begin();
11305 p != repop->on_committed.end();
11306 repop->on_committed.erase(p++)) {
11307 (*p)();
11308 }
11309 // send dup commits, in order
11310 auto it = waiting_for_ondisk.find(repop->v);
11311 if (it != waiting_for_ondisk.end()) {
11312 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
11313 for (auto& i : it->second) {
11314 int return_code = repop->r;
11315 if (return_code >= 0) {
11316 return_code = std::get<2>(i);
11317 }
11318 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
11319 std::get<1>(i), std::get<3>(i));
11320 }
11321 waiting_for_ondisk.erase(it);
11322 }
11323
11324 publish_stats_to_osd();
11325
11326 dout(10) << " removing " << *repop << dendl;
11327 ceph_assert(!repop_queue.empty());
11328 dout(20) << " q front is " << *repop_queue.front() << dendl;
11329 if (repop_queue.front() == repop) {
11330 RepGather *to_remove = nullptr;
11331 while (!repop_queue.empty() &&
11332 (to_remove = repop_queue.front())->all_committed) {
11333 repop_queue.pop_front();
11334 for (auto p = to_remove->on_success.begin();
11335 p != to_remove->on_success.end();
11336 to_remove->on_success.erase(p++)) {
11337 (*p)();
11338 }
11339 remove_repop(to_remove);
11340 }
11341 }
11342 }
11343 }
11344
11345 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
11346 {
11347 FUNCTRACE(cct);
11348 const hobject_t& soid = ctx->obs->oi.soid;
11349 dout(7) << "issue_repop rep_tid " << repop->rep_tid
11350 << " o " << soid
11351 << dendl;
11352
11353 jspan span;
11354 if (ctx->op) {
11355 span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
11356 }
11357
11358 repop->v = ctx->at_version;
11359
11360 ctx->op_t->add_obc(ctx->obc);
11361 if (ctx->clone_obc) {
11362 ctx->op_t->add_obc(ctx->clone_obc);
11363 }
11364 if (ctx->head_obc) {
11365 ctx->op_t->add_obc(ctx->head_obc);
11366 }
11367
11368 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
11369 if (!(ctx->log.empty())) {
11370 ceph_assert(ctx->at_version >= projected_last_update);
11371 projected_last_update = ctx->at_version;
11372 }
11373 for (auto &&entry: ctx->log) {
11374 projected_log.add(entry);
11375 }
11376
11377 recovery_state.pre_submit_op(
11378 soid,
11379 ctx->log,
11380 ctx->at_version);
11381 pgbackend->submit_transaction(
11382 soid,
11383 ctx->delta_stats,
11384 ctx->at_version,
11385 std::move(ctx->op_t),
11386 recovery_state.get_pg_trim_to(),
11387 recovery_state.get_min_last_complete_ondisk(),
11388 std::move(ctx->log),
11389 ctx->updated_hset_history,
11390 on_all_commit,
11391 repop->rep_tid,
11392 ctx->reqid,
11393 ctx->op);
11394 }
11395
11396 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
11397 OpContext *ctx,
11398 ceph_tid_t rep_tid)
11399 {
11400 if (ctx->op)
11401 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
11402 else
11403 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
11404
11405 RepGather *repop = new RepGather(
11406 ctx, rep_tid, info.last_complete);
11407
11408 repop->start = ceph_clock_now();
11409
11410 repop_queue.push_back(&repop->queue_item);
11411 repop->get();
11412
11413 osd->logger->inc(l_osd_op_wip);
11414
11415 dout(10) << __func__ << ": " << *repop << dendl;
11416 return repop;
11417 }
11418
11419 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
11420 eversion_t version,
11421 int r,
11422 ObcLockManager &&manager,
11423 OpRequestRef &&op,
11424 std::optional<std::function<void(void)> > &&on_complete)
11425 {
11426 RepGather *repop = new RepGather(
11427 std::move(manager),
11428 std::move(op),
11429 std::move(on_complete),
11430 osd->get_tid(),
11431 info.last_complete,
11432 r);
11433 repop->v = version;
11434
11435 repop->start = ceph_clock_now();
11436
11437 repop_queue.push_back(&repop->queue_item);
11438
11439 osd->logger->inc(l_osd_op_wip);
11440
11441 dout(10) << __func__ << ": " << *repop << dendl;
11442 return boost::intrusive_ptr<RepGather>(repop);
11443 }
11444
11445 void PrimaryLogPG::remove_repop(RepGather *repop)
11446 {
11447 dout(20) << __func__ << " " << *repop << dendl;
11448
11449 for (auto p = repop->on_finish.begin();
11450 p != repop->on_finish.end();
11451 repop->on_finish.erase(p++)) {
11452 (*p)();
11453 }
11454
11455 release_object_locks(
11456 repop->lock_manager);
11457 repop->put();
11458
11459 osd->logger->dec(l_osd_op_wip);
11460 }
11461
11462 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
11463 {
11464 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
11465 ceph_tid_t rep_tid = osd->get_tid();
11466 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
11467 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
11468 ctx->op_t.reset(new PGTransaction());
11469 ctx->mtime = ceph_clock_now();
11470 return ctx;
11471 }
11472
11473 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
11474 {
11475 RepGather *repop = new_repop(ctx.get(), ctx->reqid.tid);
11476 dout(20) << __func__ << " " << repop << dendl;
11477 issue_repop(repop, ctx.get());
11478 eval_repop(repop);
11479 recovery_state.update_trim_to();
11480 repop->put();
11481 }
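// Typical pattern for internal writes built on the two helpers above (a
// sketch; see try_flush_mark_clean() and handle_watch_timeout() in this
// file for real call sites):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   ctx->new_obs = obc->obs;                      // mutate object state
//   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); // build the log entry
//   simple_opc_submit(std::move(ctx));            // replicate and commit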
11482
11483
11484 void PrimaryLogPG::submit_log_entries(
11485 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
11486 ObcLockManager &&manager,
11487 std::optional<std::function<void(void)> > &&_on_complete,
11488 OpRequestRef op,
11489 int r)
11490 {
11491 dout(10) << __func__ << " " << entries << dendl;
11492 ceph_assert(is_primary());
11493
11494 eversion_t version;
11495 if (!entries.empty()) {
11496 ceph_assert(entries.rbegin()->version >= projected_last_update);
11497 version = projected_last_update = entries.rbegin()->version;
11498 }
11499
11500 boost::intrusive_ptr<RepGather> repop;
11501 std::optional<std::function<void(void)> > on_complete;
11502 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11503 repop = new_repop(
11504 version,
11505 r,
11506 std::move(manager),
11507 std::move(op),
11508 std::move(_on_complete));
11509 } else {
11510 on_complete = std::move(_on_complete);
11511 }
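  // (Peers running releases older than jewel can't handle
  // MOSDPGUpdateLogMissing; for them the lambda below sends a full
  // MOSDPGLog instead, and _on_complete is kept locally rather than
  // attached to a repop.)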
11512
11513 pgbackend->call_write_ordered(
11514 [this, entries, repop, on_complete]() {
11515 ObjectStore::Transaction t;
11516 eversion_t old_last_update = info.last_update;
11517 recovery_state.merge_new_log_entries(
11518 entries, t, recovery_state.get_pg_trim_to(),
11519 recovery_state.get_min_last_complete_ondisk());
11520
11521 set<pg_shard_t> waiting_on;
11522 for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
11523 i != get_acting_recovery_backfill().end();
11524 ++i) {
11525 pg_shard_t peer(*i);
11526 if (peer == pg_whoami) continue;
11527 ceph_assert(recovery_state.get_peer_missing().count(peer));
11528 ceph_assert(recovery_state.has_peer_info(peer));
11529 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11530 ceph_assert(repop);
11531 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
11532 entries,
11533 spg_t(info.pgid.pgid, i->shard),
11534 pg_whoami.shard,
11535 get_osdmap_epoch(),
11536 get_last_peering_reset(),
11537 repop->rep_tid,
11538 recovery_state.get_pg_trim_to(),
11539 recovery_state.get_min_last_complete_ondisk());
11540 osd->send_message_osd_cluster(
11541 peer.osd, m, get_osdmap_epoch());
11542 waiting_on.insert(peer);
11543 } else {
11544 MOSDPGLog *m = new MOSDPGLog(
11545 peer.shard, pg_whoami.shard,
11546 info.last_update.epoch,
11547 info, get_last_peering_reset());
11548 m->log.log = entries;
11549 m->log.tail = old_last_update;
11550 m->log.head = info.last_update;
11551 osd->send_message_osd_cluster(
11552 peer.osd, m, get_osdmap_epoch());
11553 }
11554 }
11555 ceph_tid_t rep_tid = repop->rep_tid;
11556 waiting_on.insert(pg_whoami);
11557 log_entry_update_waiting_on.insert(
11558 make_pair(
11559 rep_tid,
11560 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
11561 ));
11562 struct OnComplete : public Context {
11563 PrimaryLogPGRef pg;
11564 ceph_tid_t rep_tid;
11565 epoch_t epoch;
11566 OnComplete(
11567 PrimaryLogPGRef pg,
11568 ceph_tid_t rep_tid,
11569 epoch_t epoch)
11570 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
11571 void finish(int) override {
11572 std::scoped_lock l{*pg};
11573 if (!pg->pg_has_reset_since(epoch)) {
11574 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
11575 ceph_assert(it != pg->log_entry_update_waiting_on.end());
11576 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
11577 ceph_assert(it2 != it->second.waiting_on.end());
11578 it->second.waiting_on.erase(it2);
11579 if (it->second.waiting_on.empty()) {
11580 pg->repop_all_committed(it->second.repop.get());
11581 pg->log_entry_update_waiting_on.erase(it);
11582 }
11583 }
11584 }
11585 };
11586 t.register_on_commit(
11587 new OnComplete{this, rep_tid, get_osdmap_epoch()});
11588 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
11589 ceph_assert(r == 0);
11590 op_applied(info.last_update);
11591 });
11592
11593 recovery_state.update_trim_to();
11594 }
11595
11596 void PrimaryLogPG::cancel_log_updates()
11597 {
11598 // get rid of all the LogUpdateCtx so their references to repops are
11599 // dropped
11600 log_entry_update_waiting_on.clear();
11601 }
11602
11603 // -------------------------------------------------------
11604
11605 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
11606 {
11607 std::scoped_lock l{*this};
11608 pair<hobject_t, ObjectContextRef> i;
11609 while (object_contexts.get_next(i.first, &i)) {
11610 ObjectContextRef obc(i.second);
11611 get_obc_watchers(obc, *ls);
11612 }
11613 }
11614
11615 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
11616 {
11617 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11618 obc->watchers.begin();
11619 j != obc->watchers.end();
11620 ++j) {
11621 obj_watch_item_t owi;
11622
11623 owi.obj = obc->obs.oi.soid;
11624 owi.wi.addr = j->second->get_peer_addr();
11625 owi.wi.name = j->second->get_entity();
11626 owi.wi.cookie = j->second->get_cookie();
11627 owi.wi.timeout_seconds = j->second->get_timeout();
11628
11629 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
11630 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
11631
11632 pg_watchers.push_back(owi);
11633 }
11634 }
11635
11636 void PrimaryLogPG::check_blocklisted_watchers()
11637 {
11638 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
11639 pair<hobject_t, ObjectContextRef> i;
11640 while (object_contexts.get_next(i.first, &i))
11641 check_blocklisted_obc_watchers(i.second);
11642 }
11643
11644 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
11645 {
11646 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
11647 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
11648 obc->watchers.begin();
11649 k != obc->watchers.end();
11650 ) {
11651     // advance the iterator now so handle_watch_timeout() can erase the element
11652 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
11653 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
11654 entity_addr_t ea = j->second->get_peer_addr();
11655 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
11656 if (get_osdmap()->is_blocklisted(ea)) {
11657 dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
11658 ceph_assert(j->second->get_pg() == this);
11659 j->second->unregister_cb();
11660 handle_watch_timeout(j->second);
11661 }
11662 }
11663 }
11664
11665 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
11666 {
11667 ceph_assert(is_primary() && is_active());
11668 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
11669 ceph_assert((recovering.count(obc->obs.oi.soid) ||
11670 !is_missing_object(obc->obs.oi.soid)) ||
11671 (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
11672 it_objects->second->op ==
11673 pg_log_entry_t::LOST_REVERT &&
11674 it_objects->second->reverting_to ==
11675 obc->obs.oi.version));
11676
11677 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
11678 ceph_assert(obc->watchers.empty());
11679 // populate unconnected_watchers
11680 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
11681 obc->obs.oi.watchers.begin();
11682 p != obc->obs.oi.watchers.end();
11683 ++p) {
11684 utime_t expire = info.stats.last_became_active;
11685 expire += p->second.timeout_seconds;
11686 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
11687 WatchRef watch(
11688 Watch::makeWatchRef(
11689 this, osd, obc, p->second.timeout_seconds, p->first.first,
11690 p->first.second, p->second.addr));
11691 watch->disconnect();
11692 obc->watchers.insert(
11693 make_pair(
11694 make_pair(p->first.first, p->first.second),
11695 watch));
11696 }
11697   // Look for watchers from blocklisted clients and drop them
11698 check_blocklisted_obc_watchers(obc);
11699 }
11700
11701 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
11702 {
11703 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
11704 dout(10) << "handle_watch_timeout obc " << obc << dendl;
11705
11706 if (!is_active()) {
11707 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
11708 return;
11709 }
11710 if (!obc->obs.exists) {
11711 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
11712 return;
11713 }
11714 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
11715 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
11716 watch->get_delayed_cb()
11717 );
11718 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11719 << obc->obs.oi.soid
11720 << dendl;
11721 return;
11722 }
11723
11724 if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
11725 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11726 << obc->obs.oi.soid
11727 << dendl;
11728 m_scrubber->add_callback(
11729 watch->get_delayed_cb() // This callback!
11730 );
11731 return;
11732 }
11733
11734 OpContextUPtr ctx = simple_opc_create(obc);
11735 ctx->at_version = get_next_version();
11736
11737 object_info_t& oi = ctx->new_obs.oi;
11738 oi.watchers.erase(make_pair(watch->get_cookie(),
11739 watch->get_entity()));
11740
11741 list<watch_disconnect_t> watch_disconnects = {
11742 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
11743 };
11744 ctx->register_on_success(
11745 [this, obc, watch_disconnects]() {
11746 complete_disconnect_watches(obc, watch_disconnects);
11747 });
11748
11749
11750 PGTransaction *t = ctx->op_t.get();
11751 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
11752 ctx->at_version,
11753 oi.version,
11754 0,
11755 osd_reqid_t(), ctx->mtime, 0));
11756
11757 oi.prior_version = obc->obs.oi.version;
11758 oi.version = ctx->at_version;
11759 bufferlist bl;
11760 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11761 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
11762
11763 // apply new object state.
11764 ctx->obc->obs = ctx->new_obs;
11765
11766 // no ctx->delta_stats
11767 simple_opc_submit(std::move(ctx));
11768 }
11769
11770 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
11771 SnapSetContext *ssc)
11772 {
11773 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
11774 ceph_assert(obc->destructor_callback == NULL);
11775 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11776 obc->obs.oi = oi;
11777 obc->obs.exists = false;
11778 obc->ssc = ssc;
11779 if (ssc)
11780 register_snapset_context(ssc);
11781 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
11782 if (is_active())
11783 populate_obc_watchers(obc);
11784 return obc;
11785 }
11786
11787 ObjectContextRef PrimaryLogPG::get_object_context(
11788 const hobject_t& soid,
11789 bool can_create,
11790 const map<string, bufferlist, less<>> *attrs)
11791 {
11792 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
11793 ceph_assert(
11794 attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
11795 // or this is a revert... see recover_primary()
11796 (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
11797 it_objects->second->op ==
11798 pg_log_entry_t::LOST_REVERT));
11799 ObjectContextRef obc = object_contexts.lookup(soid);
11800 osd->logger->inc(l_osd_object_ctx_cache_total);
11801 if (obc) {
11802 osd->logger->inc(l_osd_object_ctx_cache_hit);
11803 dout(10) << __func__ << ": found obc in cache: " << obc
11804 << dendl;
11805 } else {
11806 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11807 // check disk
11808 bufferlist bv;
11809 if (attrs) {
11810 auto it_oi = attrs->find(OI_ATTR);
11811 ceph_assert(it_oi != attrs->end());
11812 bv = it_oi->second;
11813 } else {
11814 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11815 if (r < 0) {
11816 if (!can_create) {
11817 dout(10) << __func__ << ": no obc for soid "
11818 << soid << " and !can_create"
11819 << dendl;
11820 return ObjectContextRef(); // -ENOENT!
11821 }
11822
11823 dout(10) << __func__ << ": no obc for soid "
11824 << soid << " but can_create"
11825 << dendl;
11826 // new object.
11827 object_info_t oi(soid);
11828 SnapSetContext *ssc = get_snapset_context(
11829 soid, true, 0, false);
11830 ceph_assert(ssc);
11831 obc = create_object_context(oi, ssc);
11832 dout(10) << __func__ << ": " << obc << " " << soid
11833 << " " << obc->rwstate
11834 << " oi: " << obc->obs.oi
11835 << " ssc: " << obc->ssc
11836 << " snapset: " << obc->ssc->snapset << dendl;
11837 return obc;
11838 }
11839 }
11840
11841 object_info_t oi;
11842 try {
11843 bufferlist::const_iterator bliter = bv.begin();
11844 decode(oi, bliter);
11845 } catch (...) {
11846 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11847 return ObjectContextRef(); // -ENOENT!
11848 }
11849
11850 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11851
11852 obc = object_contexts.lookup_or_create(oi.soid);
11853 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11854 obc->obs.oi = oi;
11855 obc->obs.exists = true;
11856
11857 obc->ssc = get_snapset_context(
11858 soid, true,
11859 soid.has_snapset() ? attrs : 0);
11860
11861 if (is_primary() && is_active())
11862 populate_obc_watchers(obc);
11863
11864 if (pool.info.is_erasure()) {
11865 if (attrs) {
11866 obc->attr_cache = *attrs;
11867 } else {
11868 int r = pgbackend->objects_get_attrs(
11869 soid,
11870 &obc->attr_cache);
11871 ceph_assert(r == 0);
11872 }
11873 }
11874
11875 dout(10) << __func__ << ": creating obc from disk: " << obc
11876 << dendl;
11877 }
11878
11879 // XXX: Caller doesn't expect this
11880 if (obc->ssc == NULL) {
11881 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11882 return ObjectContextRef(); // -ENOENT!
11883 }
11884
11885 dout(10) << __func__ << ": " << obc << " " << soid
11886 << " " << obc->rwstate
11887 << " oi: " << obc->obs.oi
11888 << " exists: " << (int)obc->obs.exists
11889 << " ssc: " << obc->ssc
11890 << " snapset: " << obc->ssc->snapset << dendl;
11891 return obc;
11892 }
11893
11894 void PrimaryLogPG::context_registry_on_change()
11895 {
11896 pair<hobject_t, ObjectContextRef> i;
11897 while (object_contexts.get_next(i.first, &i)) {
11898 ObjectContextRef obc(i.second);
11899 if (obc) {
11900 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11901 obc->watchers.begin();
11902 j != obc->watchers.end();
11903 obc->watchers.erase(j++)) {
11904 j->second->discard();
11905 }
11906 }
11907 }
11908 }
11909
11910
11911 /*
11912 * If we return an error, and set *pmissing, then promoting that
11913 * object may help.
11914 *
11915 * If we return -EAGAIN, we will always set *pmissing to the missing
11916 * object to wait for.
11917 *
11918 * If we return an error but do not set *pmissing, then we know the
11919 * object does not exist.
11920 */
11921 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11922 ObjectContextRef *pobc,
11923 bool can_create,
11924 bool map_snapid_to_clone,
11925 hobject_t *pmissing)
11926 {
11927 FUNCTRACE(cct);
11928 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11929 // want the head?
11930 if (oid.snap == CEPH_NOSNAP) {
11931 ObjectContextRef obc = get_object_context(oid, can_create);
11932 if (!obc) {
11933 if (pmissing)
11934 *pmissing = oid;
11935 return -ENOENT;
11936 }
11937 dout(10) << __func__ << " " << oid
11938 << " @" << oid.snap
11939 << " oi=" << obc->obs.oi
11940 << dendl;
11941 *pobc = obc;
11942
11943 return 0;
11944 }
11945
11946 // we want a snap
11947
11948 hobject_t head = oid.get_head();
11949 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11950 if (!ssc || !(ssc->exists || can_create)) {
11951 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11952 if (pmissing)
11953 *pmissing = head; // start by getting the head
11954 if (ssc)
11955 put_snapset_context(ssc);
11956 return -ENOENT;
11957 }
11958
11959 if (map_snapid_to_clone) {
11960 dout(10) << __func__ << " " << oid << " @" << oid.snap
11961 << " snapset " << ssc->snapset
11962 << " map_snapid_to_clone=true" << dendl;
11963 if (oid.snap > ssc->snapset.seq) {
11964 // already must be readable
11965 ObjectContextRef obc = get_object_context(head, false);
11966 dout(10) << __func__ << " " << oid << " @" << oid.snap
11967 << " snapset " << ssc->snapset
11968 << " maps to head" << dendl;
11969 *pobc = obc;
11970 put_snapset_context(ssc);
11971 return (obc && obc->obs.exists) ? 0 : -ENOENT;
11972 } else {
11973 vector<snapid_t>::const_iterator citer = std::find(
11974 ssc->snapset.clones.begin(),
11975 ssc->snapset.clones.end(),
11976 oid.snap);
11977 if (citer == ssc->snapset.clones.end()) {
11978 dout(10) << __func__ << " " << oid << " @" << oid.snap
11979 << " snapset " << ssc->snapset
11980 << " maps to nothing" << dendl;
11981 put_snapset_context(ssc);
11982 return -ENOENT;
11983 }
11984
11985 dout(10) << __func__ << " " << oid << " @" << oid.snap
11986 << " snapset " << ssc->snapset
11987 << " maps to " << oid << dendl;
11988
11989 if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
11990 dout(10) << __func__ << " " << oid << " @" << oid.snap
11991 << " snapset " << ssc->snapset
11992 << " " << oid << " is missing" << dendl;
11993 if (pmissing)
11994 *pmissing = oid;
11995 put_snapset_context(ssc);
11996 return -EAGAIN;
11997 }
11998
11999 ObjectContextRef obc = get_object_context(oid, false);
12000 if (!obc || !obc->obs.exists) {
12001 dout(10) << __func__ << " " << oid << " @" << oid.snap
12002 << " snapset " << ssc->snapset
12003 << " " << oid << " is not present" << dendl;
12004 if (pmissing)
12005 *pmissing = oid;
12006 put_snapset_context(ssc);
12007 return -ENOENT;
12008 }
12009 dout(10) << __func__ << " " << oid << " @" << oid.snap
12010 << " snapset " << ssc->snapset
12011 << " " << oid << " HIT" << dendl;
12012 *pobc = obc;
12013 put_snapset_context(ssc);
12014 return 0;
12015 }
12016     ceph_abort(); // unreachable
12017 }
12018
12019 dout(10) << __func__ << " " << oid << " @" << oid.snap
12020 << " snapset " << ssc->snapset << dendl;
12021
12022 // head?
12023 if (oid.snap > ssc->snapset.seq) {
12024 ObjectContextRef obc = get_object_context(head, false);
12025 dout(10) << __func__ << " " << head
12026 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
12027 << " -- HIT " << obc->obs
12028 << dendl;
12029 if (!obc->ssc)
12030 obc->ssc = ssc;
12031 else {
12032 ceph_assert(ssc == obc->ssc);
12033 put_snapset_context(ssc);
12034 }
12035 *pobc = obc;
12036 return 0;
12037 }
12038
12039 // which clone would it be?
12040 unsigned k = 0;
12041 while (k < ssc->snapset.clones.size() &&
12042 ssc->snapset.clones[k] < oid.snap)
12043 k++;
12044 if (k == ssc->snapset.clones.size()) {
12045 dout(10) << __func__ << " no clones with last >= oid.snap "
12046 << oid.snap << " -- DNE" << dendl;
12047 put_snapset_context(ssc);
12048 return -ENOENT;
12049 }
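  // Example (hypothetical snapset): with clones [2, 5, 8] and
  // oid.snap = 4, the loop above selects k = 1, i.e. clone 5 -- the
  // oldest clone whose id is >= the requested snap; whether that clone
  // actually contains snap 4 is verified against clone_snaps below.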
12050 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
12051 info.pgid.pool(), oid.get_namespace());
12052
12053 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
12054 dout(20) << __func__ << " " << soid << " missing, try again later"
12055 << dendl;
12056 if (pmissing)
12057 *pmissing = soid;
12058 put_snapset_context(ssc);
12059 return -EAGAIN;
12060 }
12061
12062 ObjectContextRef obc = get_object_context(soid, false);
12063 if (!obc || !obc->obs.exists) {
12064 if (pmissing)
12065 *pmissing = soid;
12066 put_snapset_context(ssc);
12067 if (is_primary()) {
12068 if (is_degraded_or_backfilling_object(soid)) {
12069 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
12070 return -EAGAIN;
12071 } else if (is_degraded_on_async_recovery_target(soid)) {
12072 dout(20) << __func__ << " clone is recovering " << soid << dendl;
12073 return -EAGAIN;
12074 } else {
12075 dout(20) << __func__ << " missing clone " << soid << dendl;
12076 return -ENOENT;
12077 }
12078 } else {
12079       dout(20) << __func__ << " replica missing clone " << soid << dendl;
12080 return -ENOENT;
12081 }
12082 }
12083
12084 if (!obc->ssc) {
12085 obc->ssc = ssc;
12086 } else {
12087 ceph_assert(obc->ssc == ssc);
12088 put_snapset_context(ssc);
12089 }
12090 ssc = 0;
12091
12092 // clone
12093 dout(20) << __func__ << " " << soid
12094 << " snapset " << obc->ssc->snapset
12095 << dendl;
12096 snapid_t first, last;
12097 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
12098 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
12099 if (p->second.empty()) {
12100 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
12101 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
12102 return -ENOENT;
12103 }
12104 if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
12105 p->second.end()) {
12106 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
12107 << " does not contain " << oid.snap << " -- DNE" << dendl;
12108 return -ENOENT;
12109 }
12110 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
12111 dout(20) << __func__ << " " << soid << " snap " << oid.snap
12112 << " in removed_snaps_queue" << " -- DNE" << dendl;
12113 return -ENOENT;
12114 }
12115 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
12116 << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
12117 *pobc = obc;
12118 return 0;
12119 }
12120
12121 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
12122 {
12123 if (obc->ssc)
12124 put_snapset_context(obc->ssc);
12125 }
12126
12127 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
12128 {
12129 object_info_t& oi = obc->obs.oi;
12130
12131 dout(10) << __func__ << " " << oi.soid << dendl;
12132 ceph_assert(!oi.soid.is_snapdir());
12133
12134 object_stat_sum_t stat;
12135 stat.num_objects++;
12136 if (oi.is_dirty())
12137 stat.num_objects_dirty++;
12138 if (oi.is_whiteout())
12139 stat.num_whiteouts++;
12140 if (oi.is_omap())
12141 stat.num_objects_omap++;
12142 if (oi.is_cache_pinned())
12143 stat.num_objects_pinned++;
12144 if (oi.has_manifest())
12145 stat.num_objects_manifest++;
12146
12147 if (oi.soid.is_snap()) {
12148 stat.num_object_clones++;
12149
12150 if (!obc->ssc)
12151 obc->ssc = get_snapset_context(oi.soid, false);
12152 ceph_assert(obc->ssc);
12153 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
12154 } else {
12155 stat.num_bytes += oi.size;
12156 }
12157
12158 // add it in
12159 pgstat->stats.sum.add(stat);
12160 }
12161
12162 void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t &soid) {
12163 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
12164 if (p != waiting_for_blocked_object.end()) {
12165 list<OpRequestRef>& ls = p->second;
12166 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
12167 requeue_ops(ls);
12168 waiting_for_blocked_object.erase(p);
12169 }
12170 }
12171
12172 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
12173 {
12174 const hobject_t& soid = obc->obs.oi.soid;
12175 if (obc->is_blocked()) {
12176 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
12177 return;
12178 }
12179
12180 requeue_op_blocked_by_object(soid);
12181
12182 map<hobject_t, ObjectContextRef>::iterator i =
12183 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
12184 if (i != objects_blocked_on_snap_promotion.end()) {
12185 ceph_assert(i->second == obc);
12186 ObjectContextRef head_obc = get_object_context(i->first, false);
12187 head_obc->stop_block();
12188 // kick blocked ops (head)
12189 requeue_op_blocked_by_object(i->first);
12190 objects_blocked_on_snap_promotion.erase(i);
12191 }
12192
12193 if (obc->requeue_scrub_on_unblock) {
12194
12195 obc->requeue_scrub_on_unblock = false;
12196
12197 dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
12198
12199 // only requeue if we are still active: we may be unblocking
12200 // because we are resetting for a new peering interval
12201 if (is_active()) {
12202 osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
12203 }
12204 }
12205 }
12206
12207 SnapSetContext *PrimaryLogPG::get_snapset_context(
12208 const hobject_t& oid,
12209 bool can_create,
12210 const map<string, bufferlist, less<>> *attrs,
12211 bool oid_existed)
12212 {
12213 std::lock_guard l(snapset_contexts_lock);
12214 SnapSetContext *ssc;
12215 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
12216 oid.get_snapdir());
12217 if (p != snapset_contexts.end()) {
12218 if (can_create || p->second->exists) {
12219 ssc = p->second;
12220 } else {
12221 return NULL;
12222 }
12223 } else {
12224 bufferlist bv;
12225 if (!attrs) {
12226 int r = -ENOENT;
12227 if (!(oid.is_head() && !oid_existed)) {
12228 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
12229 }
12230 if (r < 0 && !can_create)
12231 return NULL;
12232 } else {
12233 auto it_ss = attrs->find(SS_ATTR);
12234 ceph_assert(it_ss != attrs->end());
12235 bv = it_ss->second;
12236 }
12237 ssc = new SnapSetContext(oid.get_snapdir());
12238 _register_snapset_context(ssc);
12239 if (bv.length()) {
12240 bufferlist::const_iterator bvp = bv.begin();
12241 try {
12242 ssc->snapset.decode(bvp);
12243 } catch (const ceph::buffer::error& e) {
12244 dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
12245 return NULL;
12246 }
12247 ssc->exists = true;
12248 } else {
12249 ssc->exists = false;
12250 }
12251 }
12252 ceph_assert(ssc);
12253 ssc->ref++;
12254 return ssc;
12255 }
12256
12257 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
12258 {
12259 std::lock_guard l(snapset_contexts_lock);
12260 --ssc->ref;
12261 if (ssc->ref == 0) {
12262 if (ssc->registered)
12263 snapset_contexts.erase(ssc->oid);
12264 delete ssc;
12265 }
12266 }
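// SnapSetContexts are refcounted: every successful get_snapset_context()
// must be balanced by a put_snapset_context() (or the ref handed off to
// an obc, as find_object_context() does above); the final put deletes
// the context.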
12267
12268 /*
12269 * Return values:
12270 * NONE - didn't pull anything
12271 * YES - pulled what the caller wanted
12272 * HEAD - needed to pull head first
12273 */
12274 enum { PULL_NONE, PULL_HEAD, PULL_YES };
12275
12276 int PrimaryLogPG::recover_missing(
12277 const hobject_t &soid, eversion_t v,
12278 int priority,
12279 PGBackend::RecoveryHandle *h)
12280 {
12281 if (recovery_state.get_missing_loc().is_unfound(soid)) {
12282 dout(7) << __func__ << " " << soid
12283 << " v " << v
12284 << " but it is unfound" << dendl;
12285 return PULL_NONE;
12286 }
12287
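  // The object was deleted: remove it locally, then either declare
  // global recovery (no replica still lists it as missing) or propagate
  // the delete to the replicas that do -- see the lambda below.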
12288 if (recovery_state.get_missing_loc().is_deleted(soid)) {
12289 start_recovery_op(soid);
12290 ceph_assert(!recovering.count(soid));
12291 recovering.insert(make_pair(soid, ObjectContextRef()));
12292 epoch_t cur_epoch = get_osdmap_epoch();
12293 remove_missing_object(soid, v, new LambdaContext(
12294 [=](int) {
12295 std::scoped_lock locker{*this};
12296 if (!pg_has_reset_since(cur_epoch)) {
12297 bool object_missing = false;
12298 for (const auto& shard : get_acting_recovery_backfill()) {
12299 if (shard == pg_whoami)
12300 continue;
12301 if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
12302 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
12303 object_missing = true;
12304 break;
12305 }
12306 }
12307 if (!object_missing) {
12308 object_stat_sum_t stat_diff;
12309 stat_diff.num_objects_recovered = 1;
12310 if (scrub_after_recovery)
12311 stat_diff.num_objects_repaired = 1;
12312 on_global_recover(soid, stat_diff, true);
12313 } else {
12314 auto recovery_handle = pgbackend->open_recovery_op();
12315 pgbackend->recover_delete_object(soid, v, recovery_handle);
12316 pgbackend->run_recovery_op(recovery_handle, priority);
12317 }
12318 }
12319 }));
12320 return PULL_YES;
12321 }
12322
12323   // is this a snapped object? if so, consult the snapset... we may not need the entire object!
12324 ObjectContextRef obc;
12325 ObjectContextRef head_obc;
12326 if (soid.snap && soid.snap < CEPH_NOSNAP) {
12327 // do we have the head?
12328 hobject_t head = soid.get_head();
12329 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
12330 if (recovering.count(head)) {
12331 dout(10) << " missing but already recovering head " << head << dendl;
12332 return PULL_NONE;
12333 } else {
12334 int r = recover_missing(
12335 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
12336 h);
12337 if (r != PULL_NONE)
12338 return PULL_HEAD;
12339 return PULL_NONE;
12340 }
12341 }
12342 head_obc = get_object_context(
12343 head,
12344 false,
12345 0);
12346 ceph_assert(head_obc);
12347 }
12348 start_recovery_op(soid);
12349 ceph_assert(!recovering.count(soid));
12350 recovering.insert(make_pair(soid, obc));
12351 int r = pgbackend->recover_object(
12352 soid,
12353 v,
12354 head_obc,
12355 obc,
12356 h);
12357   // This is only a pull, which shouldn't return an error
12358 ceph_assert(r >= 0);
12359 return PULL_YES;
12360 }
12361
12362 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
12363 eversion_t v, Context *on_complete)
12364 {
12365 dout(20) << __func__ << " " << soid << " " << v << dendl;
12366 ceph_assert(on_complete != nullptr);
12367 // delete locally
12368 ObjectStore::Transaction t;
12369 remove_snap_mapped_object(t, soid);
12370
12371 ObjectRecoveryInfo recovery_info;
12372 recovery_info.soid = soid;
12373 recovery_info.version = v;
12374
12375 epoch_t cur_epoch = get_osdmap_epoch();
12376 t.register_on_complete(new LambdaContext(
12377 [=](int) {
12378 std::unique_lock locker{*this};
12379 if (!pg_has_reset_since(cur_epoch)) {
12380 ObjectStore::Transaction t2;
12381 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
12382 t2.register_on_complete(on_complete);
12383 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
12384 ceph_assert(r == 0);
12385 locker.unlock();
12386 } else {
12387 locker.unlock();
12388 on_complete->complete(-EAGAIN);
12389 }
12390 }));
12391 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
12392 ceph_assert(r == 0);
12393 }
12394
12395 void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
12396 {
12397 dout(10) << __func__ << " " << oid << dendl;
12398 if (callbacks_for_degraded_object.count(oid)) {
12399 list<Context*> contexts;
12400 contexts.swap(callbacks_for_degraded_object[oid]);
12401 callbacks_for_degraded_object.erase(oid);
12402 for (list<Context*>::iterator i = contexts.begin();
12403 i != contexts.end();
12404 ++i) {
12405 (*i)->complete(0);
12406 }
12407 }
12408 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
12409 oid.get_head());
12410 if (i != objects_blocked_on_degraded_snap.end() &&
12411 i->second == oid.snap)
12412 objects_blocked_on_degraded_snap.erase(i);
12413 }
12414
12415 void PrimaryLogPG::_committed_pushed_object(
12416 epoch_t epoch, eversion_t last_complete)
12417 {
12418 std::scoped_lock locker{*this};
12419 if (!pg_has_reset_since(epoch)) {
12420 recovery_state.recovery_committed_to(last_complete);
12421 } else {
12422 dout(10) << __func__
12423 << " pg has changed, not touching last_complete_ondisk" << dendl;
12424 }
12425 }
12426
12427 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
12428 {
12429 dout(20) << __func__ << dendl;
12430 if (obc) {
12431 dout(20) << "obc = " << *obc << dendl;
12432 }
12433 ceph_assert(active_pushes >= 1);
12434 --active_pushes;
12435
12436 // requeue an active chunky scrub waiting on recovery ops
12437 if (!recovery_state.is_deleting() && active_pushes == 0 &&
12438 is_scrub_active()) {
12439
12440 osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12441 }
12442 }
12443
12444 void PrimaryLogPG::_applied_recovered_object_replica()
12445 {
12446 dout(20) << __func__ << dendl;
12447 ceph_assert(active_pushes >= 1);
12448 --active_pushes;
12449
12450 // requeue an active scrub waiting on recovery ops
12451 if (!recovery_state.is_deleting() && active_pushes == 0 &&
12452 is_scrub_active()) {
12453
12454 osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
12455 }
12456 }
12457
12458 void PrimaryLogPG::on_failed_pull(
12459 const set<pg_shard_t> &from,
12460 const hobject_t &soid,
12461 const eversion_t &v)
12462 {
12463 dout(20) << __func__ << ": " << soid << dendl;
12464 ceph_assert(recovering.count(soid));
12465 auto obc = recovering[soid];
12466 if (obc) {
12467 list<OpRequestRef> blocked_ops;
12468 obc->drop_recovery_read(&blocked_ops);
12469 requeue_ops(blocked_ops);
12470 }
12471 recovering.erase(soid);
12472 for (auto&& i : from) {
12473 if (i != pg_whoami) { // we'll get it below in primary_error
12474 recovery_state.force_object_missing(i, soid, v);
12475 }
12476 }
12477
12478 dout(0) << __func__ << " " << soid << " from shard " << from
12479 << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
12480 << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
12481 << dendl;
12482 finish_recovery_op(soid); // close out this attempt
12483 finish_degraded_object(soid);
12484
12485 if (from.count(pg_whoami)) {
12486 dout(0) << " primary missing oid " << soid << " version " << v << dendl;
12487 primary_error(soid, v);
12488 backfills_in_flight.erase(soid);
12489 }
12490 }
12491
12492 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
12493 {
12494 eversion_t v;
12495 pg_missing_item pmi;
12496 bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
12497 ceph_assert(is_missing);
12498 v = pmi.have;
12499 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
12500
12501 ceph_assert(!get_acting_recovery_backfill().empty());
12502 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
12503 i != get_acting_recovery_backfill().end();
12504 ++i) {
12505 if (*i == get_primary()) continue;
12506 pg_shard_t peer = *i;
12507 if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
12508 continue;
12509 }
12510 eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
12511 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
12512 if (h > v)
12513 v = h;
12514 }
12515
12516 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
12517 return v;
12518 }
12519
12520 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
12521 {
12522 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
12523 op->get_req());
12524 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
12525 ObjectStore::Transaction t;
12526 std::optional<eversion_t> op_trim_to, op_roll_forward_to;
12527 if (m->pg_trim_to != eversion_t())
12528 op_trim_to = m->pg_trim_to;
12529 if (m->pg_roll_forward_to != eversion_t())
12530 op_roll_forward_to = m->pg_roll_forward_to;
12531
12532 dout(20) << __func__
12533 << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
12534
12535 recovery_state.append_log_entries_update_missing(
12536 m->entries, t, op_trim_to, op_roll_forward_to);
12537 eversion_t new_lcod = info.last_complete;
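  // Sample last_complete before the transaction commits so the reply can
  // carry this replica's last_complete_ondisk back to the primary.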
12538
12539 Context *complete = new LambdaContext(
12540 [=](int) {
12541 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
12542 op->get_req());
12543 std::scoped_lock locker{*this};
12544 if (!pg_has_reset_since(msg->get_epoch())) {
12545 update_last_complete_ondisk(new_lcod);
12546 MOSDPGUpdateLogMissingReply *reply =
12547 new MOSDPGUpdateLogMissingReply(
12548 spg_t(info.pgid.pgid, primary_shard().shard),
12549 pg_whoami.shard,
12550 msg->get_epoch(),
12551 msg->min_epoch,
12552 msg->get_tid(),
12553 new_lcod);
12554 reply->set_priority(CEPH_MSG_PRIO_HIGH);
12555 msg->get_connection()->send_message(reply);
12556 }
12557 });
12558
12559 if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
12560 t.register_on_commit(complete);
12561 } else {
12562 /* Hack to work around the fact that ReplicatedBackend sends
12563 * ack+commit if commit happens first
12564 *
12565 * This behavior is no longer necessary, but we preserve it so old
12566 * primaries can keep their repops in order */
12567 if (pool.info.is_erasure()) {
12568 t.register_on_complete(complete);
12569 } else {
12570 t.register_on_commit(complete);
12571 }
12572 }
12573 int tr = osd->store->queue_transaction(
12574 ch,
12575 std::move(t),
12576 nullptr);
12577 ceph_assert(tr == 0);
12578 op_applied(info.last_update);
12579 }
12580
12581 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
12582 {
12583 const MOSDPGUpdateLogMissingReply *m =
12584 static_cast<const MOSDPGUpdateLogMissingReply*>(
12585 op->get_req());
12586 dout(20) << __func__ << " got reply from "
12587 << m->get_from() << dendl;
12588
12589 auto it = log_entry_update_waiting_on.find(m->get_tid());
12590 if (it != log_entry_update_waiting_on.end()) {
12591 if (it->second.waiting_on.count(m->get_from())) {
12592 it->second.waiting_on.erase(m->get_from());
12593 if (m->last_complete_ondisk != eversion_t()) {
12594 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
12595 }
12596 } else {
12597 osd->clog->error()
12598 << info.pgid << " got reply "
12599 << *m << " from shard we are not waiting for "
12600 << m->get_from();
12601 }
12602
12603 if (it->second.waiting_on.empty()) {
12604 repop_all_committed(it->second.repop.get());
12605 log_entry_update_waiting_on.erase(it);
12606 }
12607 } else {
12608 osd->clog->error()
12609 << info.pgid << " got reply "
12610 << *m << " on unknown tid " << m->get_tid();
12611 }
12612 }
12613
12614 /* Mark all unfound objects as lost: LOST_REVERT reverts each object to the
12615 * newest version still available on a shard; LOST_DELETE forgets it. (LOST_MARK: unimplemented.) */
12616 void PrimaryLogPG::mark_all_unfound_lost(
12617 int what,
12618 std::function<void(int,const std::string&,bufferlist&)> on_finish)
12619 {
12620 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
12621 list<hobject_t> oids;
12622
12623 dout(30) << __func__ << ": log before:\n";
12624 recovery_state.get_pg_log().get_log().print(*_dout);
12625 *_dout << dendl;
12626
12627 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
12628
12629 utime_t mtime = ceph_clock_now();
12630 map<hobject_t, pg_missing_item>::const_iterator m =
12631 recovery_state.get_missing_loc().get_needs_recovery().begin();
12632 map<hobject_t, pg_missing_item>::const_iterator mend =
12633 recovery_state.get_missing_loc().get_needs_recovery().end();
12634
12635 ObcLockManager manager;
12636 eversion_t v = get_next_version();
12637 v.epoch = get_osdmap_epoch();
12638 uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
12639 while (m != mend) {
12640 const hobject_t &oid(m->first);
12641 if (!recovery_state.get_missing_loc().is_unfound(oid)) {
12642 // We only care about unfound objects
12643 ++m;
12644 continue;
12645 }
12646
12647 ObjectContextRef obc;
12648 eversion_t prev;
12649
12650 switch (what) {
12651 case pg_log_entry_t::LOST_MARK:
12652 ceph_abort_msg("actually, not implemented yet!");
12653 break;
12654
12655 case pg_log_entry_t::LOST_REVERT:
12656 prev = pick_newest_available(oid);
12657 if (prev > eversion_t()) {
12658 // log it
12659 pg_log_entry_t e(
12660 pg_log_entry_t::LOST_REVERT, oid, v,
12661 m->second.need, 0, osd_reqid_t(), mtime, 0);
12662 e.reverting_to = prev;
12663 e.mark_unrollbackable();
12664 log_entries.push_back(e);
12665 dout(10) << e << dendl;
12666
12667 // we are now missing the new version; recovery code will sort it out.
12668 ++v.version;
12669 ++m;
12670 break;
12671 }
12672
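    // Deliberate fall-through: with no older version available to revert
    // to, the object is handled below as a LOST_DELETE.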
12673 case pg_log_entry_t::LOST_DELETE:
12674 {
12675 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
12676 0, osd_reqid_t(), mtime, 0);
12677 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
12678 if (pool.info.require_rollback()) {
12679 e.mod_desc.try_rmobject(v.version);
12680 } else {
12681 e.mark_unrollbackable();
12682 }
12683 } // otherwise, just do what we used to do
12684 dout(10) << e << dendl;
12685 log_entries.push_back(e);
12686 oids.push_back(oid);
12687
12688 // If a context is found, mark the object as deleted in case
12689 // we are racing with a new creation. This can happen if the
12690 // object was lost and the primary hit EIO.
12691 obc = object_contexts.lookup(oid);
12692 if (obc)
12693 obc->obs.exists = false;
12694
12695 ++v.version;
12696 ++m;
12697 }
12698 break;
12699
12700 default:
12701 ceph_abort();
12702 }
12703 }
12704
12705 recovery_state.update_stats(
12706 [](auto &history, auto &stats) {
12707 stats.stats_invalid = true;
12708 return false;
12709 });
12710
12711 submit_log_entries(
12712 log_entries,
12713 std::move(manager),
12714 std::optional<std::function<void(void)> >(
12715 [this, oids, num_unfound, on_finish]() {
12716 if (recovery_state.perform_deletes_during_peering()) {
12717 for (auto oid : oids) {
12718 // clear old locations - merge_new_log_entries will have
12719 // handled rebuilding missing_loc for each of these
12720 // objects if we have the RECOVERY_DELETES flag
12721 recovery_state.object_recovered(oid, object_stat_sum_t());
12722 }
12723 }
12724
12725 if (is_recovery_unfound()) {
12726 queue_peering_event(
12727 PGPeeringEventRef(
12728 std::make_shared<PGPeeringEvent>(
12729 get_osdmap_epoch(),
12730 get_osdmap_epoch(),
12731 PeeringState::DoRecovery())));
12732 } else if (is_backfill_unfound()) {
12733 queue_peering_event(
12734 PGPeeringEventRef(
12735 std::make_shared<PGPeeringEvent>(
12736 get_osdmap_epoch(),
12737 get_osdmap_epoch(),
12738 PeeringState::RequestBackfill())));
12739 } else {
12740 queue_recovery();
12741 }
12742
12743 stringstream ss;
12744 ss << "pg has " << num_unfound
12745 << " objects unfound and apparently lost; marking";
12746 string rs = ss.str();
12747 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
12748 osd->clog->info() << rs;
12749 bufferlist empty;
12750 on_finish(0, rs, empty);
12751 }),
12752 OpRequestRef());
12753 }
12754
12755 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
12756 {
12757 ceph_assert(repop_queue.empty());
12758 }
12759
12760 /*
12761 * pg status change notification
12762 */
12763
12764 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
12765 {
12766 list<OpRequestRef> rq;
12767
12768 // apply all repops
12769 while (!repop_queue.empty()) {
12770 RepGather *repop = repop_queue.front();
12771 repop_queue.pop_front();
12772 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
12773 repop->rep_aborted = true;
12774 repop->on_committed.clear();
12775 repop->on_success.clear();
12776
12777 if (requeue) {
12778 if (repop->op) {
12779 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
12780 rq.push_back(repop->op);
12781 repop->op = OpRequestRef();
12782 }
12783
12784 // also requeue any dups, interleaved into position
12785 auto p = waiting_for_ondisk.find(repop->v);
12786 if (p != waiting_for_ondisk.end()) {
12787 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
12788 for (auto& i : p->second) {
12789 rq.push_back(std::get<0>(i));
12790 }
12791 waiting_for_ondisk.erase(p);
12792 }
12793 }
12794
12795 remove_repop(repop);
12796 }
12797
12798 ceph_assert(repop_queue.empty());
12799
12800 if (requeue) {
12801 requeue_ops(rq);
12802 if (!waiting_for_ondisk.empty()) {
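      // Any waiter still present was never attached to a queued repop,
      // which indicates a bookkeeping bug; dump each one before the
      // assert below fires.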
12803 for (auto& i : waiting_for_ondisk) {
12804 for (auto& j : i.second) {
12805 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12806 << " waiting on " << i.first << dendl;
12807 }
12808 }
12809 ceph_assert(waiting_for_ondisk.empty());
12810 }
12811 }
12812
12813 waiting_for_ondisk.clear();
12814 }
12815
12816 void PrimaryLogPG::on_flushed()
12817 {
12818 requeue_ops(waiting_for_flush);
12819 if (!is_peered() || !is_primary()) {
12820 pair<hobject_t, ObjectContextRef> i;
12821 while (object_contexts.get_next(i.first, &i)) {
12822 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12823 }
12824 ceph_assert(object_contexts.empty());
12825 }
12826 }
12827
12828 void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
12829 {
12830 dout(10) << __func__ << dendl;
12831
12832 on_shutdown();
12833
12834 t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12835 }
12836
12837 void PrimaryLogPG::clear_async_reads()
12838 {
12839 dout(10) << __func__ << dendl;
12840 for(auto& i : in_progress_async_reads) {
12841 dout(10) << "clear ctx: "
12842 << "OpRequestRef " << i.first
12843 << " OpContext " << i.second
12844 << dendl;
12845 close_op_ctx(i.second);
12846 }
12847 }
12848
12849 void PrimaryLogPG::clear_cache()
12850 {
12851 object_contexts.clear();
12852 }
12853
12854 void PrimaryLogPG::on_shutdown()
12855 {
12856 dout(10) << __func__ << dendl;
12857
12858 if (recovery_queued) {
12859 recovery_queued = false;
12860 osd->clear_queued_recovery(this);
12861 }
12862
12863 m_scrubber->scrub_clear_state();
12864 m_scrubber->rm_from_osd_scrubbing();
12865
12866 vector<ceph_tid_t> tids;
12867 cancel_copy_ops(false, &tids);
12868 cancel_flush_ops(false, &tids);
12869 cancel_proxy_ops(false, &tids);
12870 cancel_manifest_ops(false, &tids);
12871 cancel_cls_gather_ops(false, &tids);
12872 osd->objecter->op_cancel(tids, -ECANCELED);
12873
12874 apply_and_flush_repops(false);
12875 cancel_log_updates();
12876 // we must remove PGRefs, so do this prior to the release_backoffs() calls
12877 clear_backoffs();
12878 // clean up snap trim references
12879 snap_trimmer_machine.process_event(Reset());
12880
12881 pgbackend->on_change();
12882
12883 context_registry_on_change();
12884 object_contexts.clear();
12885
12886 clear_async_reads();
12887
12888 osd->remote_reserver.cancel_reservation(info.pgid);
12889 osd->local_reserver.cancel_reservation(info.pgid);
12890
12891 clear_primary_state();
12892 cancel_recovery();
12893
12894 if (is_primary()) {
12895 osd->clear_ready_to_merge(this);
12896 }
12897 }
12898
12899 void PrimaryLogPG::on_activate_complete()
12900 {
12901 check_local();
12902 // waiters
12903 if (!recovery_state.needs_flush()) {
12904 requeue_ops(waiting_for_peered);
12905 } else if (!waiting_for_peered.empty()) {
12906 dout(10) << __func__ << " flushes in progress, moving "
12907 << waiting_for_peered.size()
12908 << " items to waiting_for_flush"
12909 << dendl;
12910 ceph_assert(waiting_for_flush.empty());
12911 waiting_for_flush.swap(waiting_for_peered);
12912 }
12913
12914
12915 // all clean?
12916 if (needs_recovery()) {
12917 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12918 queue_peering_event(
12919 PGPeeringEventRef(
12920 std::make_shared<PGPeeringEvent>(
12921 get_osdmap_epoch(),
12922 get_osdmap_epoch(),
12923 PeeringState::DoRecovery())));
12924 } else if (needs_backfill()) {
12925 dout(10) << "activate queueing backfill" << dendl;
12926 queue_peering_event(
12927 PGPeeringEventRef(
12928 std::make_shared<PGPeeringEvent>(
12929 get_osdmap_epoch(),
12930 get_osdmap_epoch(),
12931 PeeringState::RequestBackfill())));
12932 } else {
12933 dout(10) << "activate all replicas clean, no recovery" << dendl;
12934 queue_peering_event(
12935 PGPeeringEventRef(
12936 std::make_shared<PGPeeringEvent>(
12937 get_osdmap_epoch(),
12938 get_osdmap_epoch(),
12939 PeeringState::AllReplicasRecovered())));
12940 }
12941
12942 publish_stats_to_osd();
12943
12944 if (get_backfill_targets().size()) {
12945 last_backfill_started = recovery_state.earliest_backfill();
12946 new_backfill = true;
12947 ceph_assert(!last_backfill_started.is_max());
12948 dout(5) << __func__ << ": bft=" << get_backfill_targets()
12949 << " from " << last_backfill_started << dendl;
12950 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12951 i != get_backfill_targets().end();
12952 ++i) {
12953 dout(5) << "target shard " << *i
12954 << " from " << recovery_state.get_peer_info(*i).last_backfill
12955 << dendl;
12956 }
12957 }
12958
12959 hit_set_setup();
12960 agent_setup();
12961 }
12962
12963 void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
12964 {
12965 dout(10) << __func__ << dendl;
12966
12967 if (hit_set && hit_set->insert_count() == 0) {
12968 dout(20) << " discarding empty hit_set" << dendl;
12969 hit_set_clear();
12970 }
12971
12972 if (recovery_queued) {
12973 recovery_queued = false;
12974 osd->clear_queued_recovery(this);
12975 }
12976
12977 // requeue everything in the reverse order they should be
12978 // reexamined.
12979 requeue_ops(waiting_for_peered);
12980 requeue_ops(waiting_for_flush);
12981 requeue_ops(waiting_for_active);
12982 requeue_ops(waiting_for_readable);
12983
12984 vector<ceph_tid_t> tids;
12985 cancel_copy_ops(is_primary(), &tids);
12986 cancel_flush_ops(is_primary(), &tids);
12987 cancel_proxy_ops(is_primary(), &tids);
12988 cancel_manifest_ops(is_primary(), &tids);
12989 cancel_cls_gather_ops(is_primary(), &tids);
12990 osd->objecter->op_cancel(tids, -ECANCELED);
12991
12992 // requeue object waiters
12993 for (auto& p : waiting_for_unreadable_object) {
12994 release_backoffs(p.first);
12995 }
12996 if (is_primary()) {
12997 requeue_object_waiters(waiting_for_unreadable_object);
12998 } else {
12999 waiting_for_unreadable_object.clear();
13000 }
13001 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
13002 p != waiting_for_degraded_object.end();
13003 waiting_for_degraded_object.erase(p++)) {
13004 release_backoffs(p->first);
13005 if (is_primary())
13006 requeue_ops(p->second);
13007 else
13008 p->second.clear();
13009 finish_degraded_object(p->first);
13010 }
13011
13012 // requeues waiting_for_scrub
13013 m_scrubber->scrub_clear_state();
13014
13015 for (auto p = waiting_for_blocked_object.begin();
13016 p != waiting_for_blocked_object.end();
13017 waiting_for_blocked_object.erase(p++)) {
13018 if (is_primary())
13019 requeue_ops(p->second);
13020 else
13021 p->second.clear();
13022 }
13023 for (auto i = callbacks_for_degraded_object.begin();
13024 i != callbacks_for_degraded_object.end();
13025 ) {
13026 finish_degraded_object((i++)->first);
13027 }
13028 ceph_assert(callbacks_for_degraded_object.empty());
13029
13030 if (is_primary()) {
13031 requeue_ops(waiting_for_cache_not_full);
13032 } else {
13033 waiting_for_cache_not_full.clear();
13034 }
13035 objects_blocked_on_cache_full.clear();
13036
13037 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
13038 in_progress_async_reads.begin();
13039 i != in_progress_async_reads.end();
13040 in_progress_async_reads.erase(i++)) {
13041 close_op_ctx(i->second);
13042 if (is_primary())
13043 requeue_op(i->first);
13044 }
13045
13046 // this will requeue ops we were working on but didn't finish, and
13047 // any dups
13048 apply_and_flush_repops(is_primary());
13049 cancel_log_updates();
13050
13051 // do this *after* apply_and_flush_repops so that we catch any newly
13052 // registered watches.
13053 context_registry_on_change();
13054
13055 pgbackend->on_change_cleanup(&t);
13056 m_scrubber->cleanup_store(&t);
13057 pgbackend->on_change();
13058
13059 // clear snap_trimmer state
13060 snap_trimmer_machine.process_event(Reset());
13061
13062 debug_op_order.clear();
13063 unstable_stats.clear();
13064
13065 // we don't want to cache object_contexts through the interval change
13066 // NOTE: we actually assert that all currently live references are dead
13067 // by the time the flush for the next interval completes.
13068 object_contexts.clear();
13069
13070 // should have been cleared above by finishing all of the degraded objects
13071 ceph_assert(objects_blocked_on_degraded_snap.empty());
13072 }
13073
13074 void PrimaryLogPG::plpg_on_role_change()
13075 {
13076 dout(10) << __func__ << dendl;
13077 if (get_role() != 0 && hit_set) {
13078 dout(10) << " clearing hit set" << dendl;
13079 hit_set_clear();
13080 }
13081 }
13082
13083 void PrimaryLogPG::plpg_on_pool_change()
13084 {
13085 dout(10) << __func__ << dendl;
13086 // requeue cache full waiters just in case the cache_mode is
13087 // changing away from writeback mode. note that if we are not
13088 // active the normal requeuing machinery is sufficient (and properly
13089 // ordered).
13090 if (is_active() &&
13091 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13092 !waiting_for_cache_not_full.empty()) {
13093 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
13094 << dendl;
13095 requeue_ops(waiting_for_cache_not_full);
13096 objects_blocked_on_cache_full.clear();
13097 }
13098 hit_set_setup();
13099 agent_setup();
13100 }
13101
13102 // clear state. called on recovery completion AND cancellation.
13103 void PrimaryLogPG::_clear_recovery_state()
13104 {
13105 #ifdef DEBUG_RECOVERY_OIDS
13106 recovering_oids.clear();
13107 #endif
13108 dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
13109
13110 last_backfill_started = hobject_t();
13111 set<hobject_t>::iterator i = backfills_in_flight.begin();
13112 while (i != backfills_in_flight.end()) {
13113 backfills_in_flight.erase(i++);
13114 }
13115
13116 list<OpRequestRef> blocked_ops;
13117 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
13118 i != recovering.end();
13119 recovering.erase(i++)) {
13120 if (i->second) {
13121 i->second->drop_recovery_read(&blocked_ops);
13122 requeue_ops(blocked_ops);
13123 }
13124 }
13125 ceph_assert(backfills_in_flight.empty());
13126 pending_backfill_updates.clear();
13127 ceph_assert(recovering.empty());
13128 pgbackend->clear_recovery_state();
13129 }
13130
13131 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
13132 {
13133 dout(20) << __func__ << ": " << soid << dendl;
13134 ceph_assert(recovering.count(soid));
13135 ObjectContextRef obc = recovering[soid];
13136 if (obc) {
13137 list<OpRequestRef> blocked_ops;
13138 obc->drop_recovery_read(&blocked_ops);
13139 requeue_ops(blocked_ops);
13140 }
13141 recovering.erase(soid);
13142 finish_recovery_op(soid);
13143 release_backoffs(soid);
13144 if (waiting_for_degraded_object.count(soid)) {
13145 dout(20) << " kicking degraded waiters on " << soid << dendl;
13146 requeue_ops(waiting_for_degraded_object[soid]);
13147 waiting_for_degraded_object.erase(soid);
13148 }
13149 if (waiting_for_unreadable_object.count(soid)) {
13150 dout(20) << " kicking unreadable waiters on " << soid << dendl;
13151 requeue_ops(waiting_for_unreadable_object[soid]);
13152 waiting_for_unreadable_object.erase(soid);
13153 }
13154 if (is_missing_object(soid))
13155 recovery_state.set_last_requested(0);
13156 finish_degraded_object(soid);
13157 }
13158
13159 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
13160 {
13161 pgbackend->check_recovery_sources(osdmap);
13162 }
13163
13164 bool PrimaryLogPG::start_recovery_ops(
13165 uint64_t max,
13166 ThreadPool::TPHandle &handle,
13167 uint64_t *ops_started)
13168 {
13169 uint64_t& started = *ops_started;
13170 started = 0;
13171 bool work_in_progress = false;
13172 bool recovery_started = false;
13173 ceph_assert(is_primary());
13174 ceph_assert(is_peered());
13175 ceph_assert(!recovery_state.is_deleting());
13176
13177 ceph_assert(recovery_queued);
13178 recovery_queued = false;
13179
13180 if (!state_test(PG_STATE_RECOVERING) &&
13181 !state_test(PG_STATE_BACKFILLING)) {
13182 /* TODO: I think this case is broken and will make do_recovery()
13183 * unhappy since we're returning false */
13184 dout(10) << "recovery raced and we were queued twice, ignoring!" << dendl;
13185 return have_unfound();
13186 }
13187
13188 const auto &missing = recovery_state.get_pg_log().get_missing();
13189
13190 uint64_t num_unfound = get_num_unfound();
13191
13192 if (!recovery_state.have_missing()) {
13193 recovery_state.local_recovery_complete();
13194 }
13195
13196 if (!missing.have_missing() || // Primary does not have missing
13197 // or all of the missing objects are unfound.
13198 recovery_state.all_missing_unfound()) {
13199 // Recover the replicas.
13200 started = recover_replicas(max, handle, &recovery_started);
13201 }
13202 if (!started) {
13203 // We still have missing objects that we should grab from replicas.
13204 started += recover_primary(max, handle);
13205 }
13206 if (!started && num_unfound != get_num_unfound()) {
13207 // second chance to recovery replicas
13208 started = recover_replicas(max, handle, &recovery_started);
13209 }
13210
13211 if (started || recovery_started)
13212 work_in_progress = true;
13213
13214 bool deferred_backfill = false;
13215 if (recovering.empty() &&
13216 state_test(PG_STATE_BACKFILLING) &&
13217 !get_backfill_targets().empty() && started < max &&
13218 missing.num_missing() == 0 &&
13219 waiting_on_backfill.empty()) {
13220 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
13221 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
13222 deferred_backfill = true;
13223 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
13224 !is_degraded()) {
13225 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
13226 deferred_backfill = true;
13227 } else if (!recovery_state.is_backfill_reserved()) {
13228 /* DNMNOTE I think this branch is dead */
13229 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
13230 if (!backfill_reserving) {
13231 dout(10) << "queueing RequestBackfill" << dendl;
13232 backfill_reserving = true;
13233 queue_peering_event(
13234 PGPeeringEventRef(
13235 std::make_shared<PGPeeringEvent>(
13236 get_osdmap_epoch(),
13237 get_osdmap_epoch(),
13238 PeeringState::RequestBackfill())));
13239 }
13240 deferred_backfill = true;
13241 } else {
13242 started += recover_backfill(max - started, handle, &work_in_progress);
13243 }
13244 }
13245
13246 dout(10) << " started " << started << dendl;
13247 osd->logger->inc(l_osd_rop, started);
13248
13249 if (!recovering.empty() ||
13250 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
13251 return !work_in_progress && have_unfound();
13252
13253 ceph_assert(recovering.empty());
13254 ceph_assert(recovery_ops_active == 0);
13255
13256 dout(10) << __func__ << " needs_recovery: "
13257 << recovery_state.get_missing_loc().get_needs_recovery()
13258 << dendl;
13259 dout(10) << __func__ << " missing_loc: "
13260 << recovery_state.get_missing_loc().get_missing_locs()
13261 << dendl;
13262 int unfound = get_num_unfound();
13263 if (unfound) {
13264 dout(10) << " still have " << unfound << " unfound" << dendl;
13265 return true;
13266 }
13267
13268 if (missing.num_missing() > 0) {
13269 // this shouldn't happen!
13270 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
13271 << missing.num_missing() << ": " << missing.get_items();
13272 return false;
13273 }
13274
13275 if (needs_recovery()) {
13276 // this shouldn't happen!
13277 // We already checked num_missing() so we must have missing replicas
13278 osd->clog->error() << info.pgid
13279 << " Unexpected Error: recovery ending with missing replicas";
13280 return false;
13281 }
13282
13283 if (state_test(PG_STATE_RECOVERING)) {
13284 state_clear(PG_STATE_RECOVERING);
13285 state_clear(PG_STATE_FORCED_RECOVERY);
13286 if (needs_backfill()) {
13287 dout(10) << "recovery done, queuing backfill" << dendl;
13288 queue_peering_event(
13289 PGPeeringEventRef(
13290 std::make_shared<PGPeeringEvent>(
13291 get_osdmap_epoch(),
13292 get_osdmap_epoch(),
13293 PeeringState::RequestBackfill())));
13294 } else {
13295 dout(10) << "recovery done, no backfill" << dendl;
13296 state_clear(PG_STATE_FORCED_BACKFILL);
13297 queue_peering_event(
13298 PGPeeringEventRef(
13299 std::make_shared<PGPeeringEvent>(
13300 get_osdmap_epoch(),
13301 get_osdmap_epoch(),
13302 PeeringState::AllReplicasRecovered())));
13303 }
13304 } else { // backfilling
13305 state_clear(PG_STATE_BACKFILLING);
13306 state_clear(PG_STATE_FORCED_BACKFILL);
13307 state_clear(PG_STATE_FORCED_RECOVERY);
13308 dout(10) << "recovery done, backfill done" << dendl;
13309 queue_peering_event(
13310 PGPeeringEventRef(
13311 std::make_shared<PGPeeringEvent>(
13312 get_osdmap_epoch(),
13313 get_osdmap_epoch(),
13314 PeeringState::Backfilled())));
13315 }
13316
13317 return false;
13318 }
13319
13320 /**
13321 * start up to max recovery ops on the primary's own missing objects.
13322 * return the number of ops started.
13323 */
13324 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
13325 {
13326 ceph_assert(is_primary());
13327
13328 const auto &missing = recovery_state.get_pg_log().get_missing();
13329
13330 dout(10) << __func__ << " recovering " << recovering.size()
13331 << " in pg,"
13332 << " missing " << missing << dendl;
13333
13334 dout(25) << __func__ << " " << missing.get_items() << dendl;
13335
13336 // look at log!
13337 pg_log_entry_t *latest = 0;
13338 unsigned started = 0;
13339 int skipped = 0;
13340
13341 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13342 map<version_t, hobject_t>::const_iterator p =
13343 missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
13344 while (p != missing.get_rmissing().end()) {
13345 handle.reset_tp_timeout();
13346 hobject_t soid;
13347 version_t v = p->first;
13348
13349 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
13350 if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
13351 latest = it_objects->second;
13352 ceph_assert(latest->is_update() || latest->is_delete());
13353 soid = latest->soid;
13354 } else {
13355 latest = 0;
13356 soid = p->second;
13357 }
13358 const pg_missing_item& item = missing.get_items().find(p->second)->second;
13359 ++p;
13360
13361 hobject_t head = soid.get_head();
13362
13363 eversion_t need = item.need;
13364
13365 dout(10) << __func__ << " "
13366 << soid << " " << item.need
13367 << (missing.is_missing(soid) ? " (missing)":"")
13368 << (missing.is_missing(head) ? " (missing head)":"")
13369 << (recovering.count(soid) ? " (recovering)":"")
13370 << (recovering.count(head) ? " (recovering head)":"")
13371 << dendl;
13372
13373 if (latest) {
13374 switch (latest->op) {
13375 case pg_log_entry_t::CLONE:
13376 /*
13377 * Handling for this special case removed for now, until we
13378 * can correctly construct an accurate SnapSet from the old
13379 * one.
13380 */
13381 break;
13382
13383 case pg_log_entry_t::LOST_REVERT:
13384 {
13385 if (item.have == latest->reverting_to) {
13386 ObjectContextRef obc = get_object_context(soid, true);
13387
13388 if (obc->obs.oi.version == latest->version) {
13389 // I'm already reverting
13390 dout(10) << " already reverting " << soid << dendl;
13391 } else {
13392 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
13393 obc->obs.oi.version = latest->version;
13394
13395 ObjectStore::Transaction t;
13396 bufferlist b2;
13397 obc->obs.oi.encode(
13398 b2,
13399 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13400 ceph_assert(!pool.info.require_rollback());
13401 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
13402
13403 recovery_state.recover_got(
13404 soid,
13405 latest->version,
13406 false,
13407 t);
13408
13409 ++active_pushes;
13410
13411 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
13412 t.register_on_commit(new C_OSD_CommittedPushedObject(
13413 this,
13414 get_osdmap_epoch(),
13415 info.last_complete));
13416 osd->store->queue_transaction(ch, std::move(t));
13417 continue;
13418 }
13419 } else {
13420 /*
13421 * Pull the old version of the object. Update missing_loc here to have the location
13422 * of the version we want.
13423 *
13424 * This doesn't use the usual missing_loc paths, but that's okay:
13425 * - if we have it locally, we hit the case above, and go from there.
13426 * - if we don't, we always pass through this case during recovery and set up the location
13427 * properly.
13428 * - this way we don't need to mangle the missing code to be general about needing an old
13429 * version...
13430 */
13431 eversion_t alternate_need = latest->reverting_to;
13432 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
13433
13434 set<pg_shard_t> good_peers;
13435 for (auto p = recovery_state.get_peer_missing().begin();
13436 p != recovery_state.get_peer_missing().end();
13437 ++p) {
13438 if (p->second.is_missing(soid, need) &&
13439 p->second.get_items().at(soid).have == alternate_need) {
13440 good_peers.insert(p->first);
13441 }
13442 }
13443 recovery_state.set_revert_with_targets(
13444 soid,
13445 good_peers);
13446 dout(10) << " will pull " << alternate_need << " or " << need
13447 << " from one of "
13448 << recovery_state.get_missing_loc().get_locations(soid)
13449 << dendl;
13450 }
13451 }
13452 break;
13453 }
13454 }
13455
13456 if (!recovering.count(soid)) {
13457 if (recovering.count(head)) {
13458 ++skipped;
13459 } else {
13460 int r = recover_missing(
13461 soid, need, get_recovery_op_priority(), h);
13462 switch (r) {
13463 case PULL_YES:
13464 ++started;
13465 break;
13466 case PULL_HEAD:
13467 ++started;
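    // Deliberate fall-through: the head is being pulled first, so the
    // clone itself is also counted as skipped and retried once the head exists.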
13468 case PULL_NONE:
13469 ++skipped;
13470 break;
13471 default:
13472 ceph_abort();
13473 }
13474 if (started >= max)
13475 break;
13476 }
13477 }
13478
13479 // only advance last_requested if we haven't skipped anything
13480 if (!skipped)
13481 recovery_state.set_last_requested(v);
13482 }
13483
13484 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13485 return started;
13486 }
13487
13488 bool PrimaryLogPG::primary_error(
13489 const hobject_t& soid, eversion_t v)
13490 {
13491 recovery_state.force_object_missing(pg_whoami, soid, v);
13492 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
13493 if (uhoh)
13494 osd->clog->error() << info.pgid << " missing primary copy of "
13495 << soid << ", unfound";
13496 else
13497 osd->clog->error() << info.pgid << " missing primary copy of "
13498 << soid
13499 << ", will try copies on "
13500 << recovery_state.get_missing_loc().get_locations(soid);
13501 return uhoh;
13502 }
13503
13504 int PrimaryLogPG::prep_object_replica_deletes(
13505 const hobject_t& soid, eversion_t v,
13506 PGBackend::RecoveryHandle *h,
13507 bool *work_started)
13508 {
13509 ceph_assert(is_primary());
13510 dout(10) << __func__ << ": on " << soid << dendl;
13511
13512 ObjectContextRef obc = get_object_context(soid, false);
13513 if (obc) {
13514 if (!obc->get_recovery_read()) {
13515 dout(20) << "replica delete delayed on " << soid
13516 << "; could not get rw_manager lock" << dendl;
13517 *work_started = true;
13518 return 0;
13519 } else {
13520 dout(20) << "replica delete got recovery read lock on " << soid
13521 << dendl;
13522 }
13523 }
13524
13525 start_recovery_op(soid);
13526 ceph_assert(!recovering.count(soid));
13527 if (!obc)
13528 recovering.insert(make_pair(soid, ObjectContextRef()));
13529 else
13530 recovering.insert(make_pair(soid, obc));
13531
13532 pgbackend->recover_delete_object(soid, v, h);
13533 return 1;
13534 }
13535
13536 int PrimaryLogPG::prep_object_replica_pushes(
13537 const hobject_t& soid, eversion_t v,
13538 PGBackend::RecoveryHandle *h,
13539 bool *work_started)
13540 {
13541 ceph_assert(is_primary());
13542 dout(10) << __func__ << ": on " << soid << dendl;
13543
13544 if (soid.snap && soid.snap < CEPH_NOSNAP) {
13545 // do we have the head and/or snapdir?
13546 hobject_t head = soid.get_head();
13547 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
13548 if (recovering.count(head)) {
13549 dout(10) << " missing but already recovering head " << head << dendl;
13550 return 0;
13551 } else {
13552 int r = recover_missing(
13553 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
13554 get_recovery_op_priority(), h);
13555 if (r != PULL_NONE)
13556 return 1;
13557 return 0;
13558 }
13559 }
13560 }
13561
13562 // NOTE: we know we will get a valid oloc off of disk here.
13563 ObjectContextRef obc = get_object_context(soid, false);
13564 if (!obc) {
13565 primary_error(soid, v);
13566 return 0;
13567 }
13568
13569 if (!obc->get_recovery_read()) {
13570 dout(20) << "recovery delayed on " << soid
13571 << "; could not get rw_manager lock" << dendl;
13572 *work_started = true;
13573 return 0;
13574 } else {
13575 dout(20) << "recovery got recovery read lock on " << soid
13576 << dendl;
13577 }
13578
13579 start_recovery_op(soid);
13580 ceph_assert(!recovering.count(soid));
13581 recovering.insert(make_pair(soid, obc));
13582
13583 int r = pgbackend->recover_object(
13584 soid,
13585 v,
13586 ObjectContextRef(),
13587 obc, // has snapset context
13588 h);
13589 if (r < 0) {
13590 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
13591 on_failed_pull({ pg_whoami }, soid, v);
13592 return 0;
13593 }
13594 return 1;
13595 }
13596
13597 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
13598 bool *work_started)
13599 {
13600 dout(10) << __func__ << "(" << max << ")" << dendl;
13601 uint64_t started = 0;
13602
13603 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13604
13605 // this is FAR from an optimal recovery order. pretty lame, really.
13606 ceph_assert(!get_acting_recovery_backfill().empty());
13607 // choose replicas to recover, replica has the shortest missing list first
13608 // so we can bring it back to normal ASAP
13609 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
13610 async_by_num_missing;
13611 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
13612 for (auto &p: get_acting_recovery_backfill()) {
13613 if (p == get_primary()) {
13614 continue;
13615 }
13616 auto pm = recovery_state.get_peer_missing().find(p);
13617 ceph_assert(pm != recovery_state.get_peer_missing().end());
13618 auto nm = pm->second.num_missing();
13619 if (nm != 0) {
13620 if (is_async_recovery_target(p)) {
13621 async_by_num_missing.push_back(make_pair(nm, p));
13622 } else {
13623 replicas_by_num_missing.push_back(make_pair(nm, p));
13624 }
13625 }
13626 }
13627 // sort by number of missing objects, in ascending order.
13628 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
13629 const std::pair<unsigned int, pg_shard_t> &rhs) {
13630 return lhs.first < rhs.first;
13631 };
13632 // acting goes first
13633 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
13634 // then async_recovery_targets
13635 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
13636 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
13637 async_by_num_missing.begin(), async_by_num_missing.end());
13638 for (auto &replica: replicas_by_num_missing) {
13639 pg_shard_t &peer = replica.second;
13640 ceph_assert(peer != get_primary());
13641 auto pm = recovery_state.get_peer_missing().find(peer);
13642 ceph_assert(pm != recovery_state.get_peer_missing().end());
13643 size_t m_sz = pm->second.num_missing();
13644
13645 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
13646 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
13647
13648 // oldest first!
13649 const pg_missing_t &m(pm->second);
13650 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
13651 p != m.get_rmissing().end() && started < max;
13652 ++p) {
13653 handle.reset_tp_timeout();
13654 const hobject_t soid(p->second);
13655
13656 if (recovery_state.get_missing_loc().is_unfound(soid)) {
13657 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
13658 continue;
13659 }
13660
13661 const pg_info_t &pi = recovery_state.get_peer_info(peer);
13662 if (soid > pi.last_backfill) {
13663 if (!recovering.count(soid)) {
13664 derr << __func__ << ": object " << soid << " last_backfill "
13665 << pi.last_backfill << dendl;
13666 derr << __func__ << ": object added to missing set for backfill, but "
13667 << "is not in recovering, error!" << dendl;
13668 ceph_abort();
13669 }
13670 continue;
13671 }
13672
13673 if (recovering.count(soid)) {
13674 dout(10) << __func__ << ": already recovering " << soid << dendl;
13675 continue;
13676 }
13677
13678 if (recovery_state.get_missing_loc().is_deleted(soid)) {
13679 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
13680 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
13681 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
13682 continue;
13683 }
13684
13685 if (soid.is_snap() &&
13686 recovery_state.get_pg_log().get_missing().is_missing(
13687 soid.get_head())) {
13688 dout(10) << __func__ << ": " << soid.get_head()
13689 << " still missing on primary" << dendl;
13690 continue;
13691 }
13692
13693 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
13694 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
13695 continue;
13696 }
13697
13698 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
13699 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
13700 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
13701 }
13702 }
13703
13704 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13705 return started;
13706 }
13707
13708 hobject_t PrimaryLogPG::earliest_peer_backfill() const
13709 {
13710 hobject_t e = hobject_t::get_max();
13711 for (const pg_shard_t& peer : get_backfill_targets()) {
13712 const auto iter = peer_backfill_info.find(peer);
13713 ceph_assert(iter != peer_backfill_info.end());
13714 e = std::min(e, iter->second.begin);
13715 }
13716 return e;
13717 }
13718
13719 bool PrimaryLogPG::all_peer_done() const
13720 {
13721 // Primary hasn't got any more objects
13722 ceph_assert(backfill_info.empty());
13723
13724 for (const pg_shard_t& bt : get_backfill_targets()) {
13725 const auto piter = peer_backfill_info.find(bt);
13726 ceph_assert(piter != peer_backfill_info.end());
13727 const BackfillInterval& pbi = piter->second;
13728 // See if peer has more to process
13729 if (!pbi.extends_to_end() || !pbi.empty())
13730 return false;
13731 }
13732 return true;
13733 }
13734
13735 /**
13736 * recover_backfill
13737 *
13738 * Invariants:
13739 *
13740 * backfilled: fully pushed to replica or present in replica's missing set (both
13741 * our copy and theirs).
13742 *
13743 * All objects on a backfill_target in
13744 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13745 * objects have been actually deleted and all logically-valid objects are replicated.
13746 * There may be PG objects in this interval yet to be backfilled.
13747 *
13748 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13749 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13750 *
13751 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13752 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13753 * interval remain on the backfill target.
13754 *
13755 * For a backfill target, all objects <= peer_info[target].last_backfill
13756 * have been backfilled to target
13757 *
13758 * There *MAY* be missing/outdated objects between last_backfill_started and
13759 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13760 * io created objects since the last scan. For this reason, we call
13761 * update_range() again before continuing backfill.
13762 */
13763 uint64_t PrimaryLogPG::recover_backfill(
13764 uint64_t max,
13765 ThreadPool::TPHandle &handle, bool *work_started)
13766 {
13767 dout(10) << __func__ << " (" << max << ")"
13768 << " bft=" << get_backfill_targets()
13769 << " last_backfill_started " << last_backfill_started
13770 << (new_backfill ? " new_backfill":"")
13771 << dendl;
13772 ceph_assert(!get_backfill_targets().empty());
13773
13774 // Initialize from prior backfill state
13775 if (new_backfill) {
13776 // on_activate() was called prior to getting here
13777 ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
13778 new_backfill = false;
13779
13780 // initialize BackfillIntervals
13781 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13782 i != get_backfill_targets().end();
13783 ++i) {
13784 peer_backfill_info[*i].reset(
13785 recovery_state.get_peer_info(*i).last_backfill);
13786 }
13787 backfill_info.reset(last_backfill_started);
13788
13789 backfills_in_flight.clear();
13790 pending_backfill_updates.clear();
13791 }
13792
13793 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13794 i != get_backfill_targets().end();
13795 ++i) {
13796 dout(10) << "peer osd." << *i
13797 << " info " << recovery_state.get_peer_info(*i)
13798 << " interval " << peer_backfill_info[*i].begin
13799 << "-" << peer_backfill_info[*i].end
13800 << " " << peer_backfill_info[*i].objects.size() << " objects"
13801 << dendl;
13802 }
13803
13804 // update our local interval to cope with recent changes
13805 backfill_info.begin = last_backfill_started;
13806 update_range(&backfill_info, handle);
13807
13808 unsigned ops = 0;
13809 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13810 set<hobject_t> add_to_stat;
13811
13812 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13813 i != get_backfill_targets().end();
13814 ++i) {
13815 peer_backfill_info[*i].trim_to(
13816 std::max(
13817 recovery_state.get_peer_info(*i).last_backfill,
13818 last_backfill_started));
13819 }
13820 backfill_info.trim_to(last_backfill_started);
13821
13822 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13823 while (ops < max) {
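    // If our local interval is exhausted (empty but not yet extending to
    // the end of the PG) while our position is not ahead of any peer's,
    // rescan the next window of the local collection.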
13824 if (backfill_info.begin <= earliest_peer_backfill() &&
13825 !backfill_info.extends_to_end() && backfill_info.empty()) {
13826 hobject_t next = backfill_info.end;
13827 backfill_info.reset(next);
13828 backfill_info.end = hobject_t::get_max();
13829 update_range(&backfill_info, handle);
13830 backfill_info.trim();
13831 }
13832
13833 dout(20) << " my backfill interval " << backfill_info << dendl;
13834
13835 bool sent_scan = false;
13836 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13837 i != get_backfill_targets().end();
13838 ++i) {
13839 pg_shard_t bt = *i;
13840 BackfillInterval& pbi = peer_backfill_info[bt];
13841
13842 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13843 if (pbi.begin <= backfill_info.begin &&
13844 !pbi.extends_to_end() && pbi.empty()) {
13845 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13846 epoch_t e = get_osdmap_epoch();
13847 MOSDPGScan *m = new MOSDPGScan(
13848 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
13849 spg_t(info.pgid.pgid, bt.shard),
13850 pbi.end, hobject_t());
13851 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13852 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13853 waiting_on_backfill.insert(bt);
13854 sent_scan = true;
13855 }
13856 }
13857
13858 // Count simultaneous scans as a single op and let those complete
13859 if (sent_scan) {
13860 ops++;
13861 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13862 break;
13863 }
13864
13865 if (backfill_info.empty() && all_peer_done()) {
13866 dout(10) << " reached end for both local and all peers" << dendl;
13867 break;
13868 }
13869
13870 // Get object within set of peers to operate on and
13871 // the set of targets for which that object applies.
13872 hobject_t check = earliest_peer_backfill();
13873
13874 if (check < backfill_info.begin) {
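      // The earliest peer object sorts before anything we still have
      // locally, so it has been deleted from the PG: queue a removal on
      // every target whose interval begins with it.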
13875
13876 set<pg_shard_t> check_targets;
13877 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13878 i != get_backfill_targets().end();
13879 ++i) {
13880 pg_shard_t bt = *i;
13881 BackfillInterval& pbi = peer_backfill_info[bt];
13882 if (pbi.begin == check)
13883 check_targets.insert(bt);
13884 }
13885 ceph_assert(!check_targets.empty());
13886
13887 dout(20) << " BACKFILL removing " << check
13888 << " from peers " << check_targets << dendl;
13889 for (set<pg_shard_t>::iterator i = check_targets.begin();
13890 i != check_targets.end();
13891 ++i) {
13892 pg_shard_t bt = *i;
13893 BackfillInterval& pbi = peer_backfill_info[bt];
13894 ceph_assert(pbi.begin == check);
13895
13896 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13897 pbi.pop_front();
13898 }
13899
13900 last_backfill_started = check;
13901
13902 // Don't increment ops here because deletions
13903 // are cheap and, unlike real recovery_ops, are not replied to,
13904 // and we can't increment ops without requeuing ourselves
13905 // for recovery.
13906 } else {
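      // Otherwise the local object at backfill_info.begin is next: sort
      // each target into keep (same version), replace (stale version),
      // push (missing entirely) or skip (its backfill line has not
      // reached this object yet).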
13907 eversion_t& obj_v = backfill_info.objects.begin()->second;
13908
13909 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13910 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13911 i != get_backfill_targets().end();
13912 ++i) {
13913 pg_shard_t bt = *i;
13914 BackfillInterval& pbi = peer_backfill_info[bt];
13915 // Find all check peers that have the wrong version
13916 if (check == backfill_info.begin && check == pbi.begin) {
13917 if (pbi.objects.begin()->second != obj_v) {
13918 need_ver_targs.push_back(bt);
13919 } else {
13920 keep_ver_targs.push_back(bt);
13921 }
13922 } else {
13923 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13924
13925 // Only include peers whose backfill line we have caught up to;
13926 // otherwise, they only appear to be missing this object
13927 // because their pbi.begin > backfill_info.begin.
13928 if (backfill_info.begin > pinfo.last_backfill)
13929 missing_targs.push_back(bt);
13930 else
13931 skip_targs.push_back(bt);
13932 }
13933 }
13934
13935 if (!keep_ver_targs.empty()) {
13936 // These peers have version obj_v
13937 dout(20) << " BACKFILL keeping " << check
13938 << " with ver " << obj_v
13939 << " on peers " << keep_ver_targs << dendl;
13940 //assert(!waiting_for_degraded_object.count(check));
13941 }
13942 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13943 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13944 ceph_assert(obc);
13945 if (obc->get_recovery_read()) {
13946 if (!need_ver_targs.empty()) {
13947 dout(20) << " BACKFILL replacing " << check
13948 << " with ver " << obj_v
13949 << " to peers " << need_ver_targs << dendl;
13950 }
13951 if (!missing_targs.empty()) {
13952 dout(20) << " BACKFILL pushing " << backfill_info.begin
13953 << " with ver " << obj_v
13954 << " to peers " << missing_targs << dendl;
13955 }
13956 vector<pg_shard_t> all_push = need_ver_targs;
13957 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13958
13959 handle.reset_tp_timeout();
13960 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13961 if (r < 0) {
13962 *work_started = true;
13963 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13964 break;
13965 }
13966 ops++;
13967 } else {
13968 *work_started = true;
13969 dout(20) << "backfill blocking on " << backfill_info.begin
13970 << "; could not get rw_manager lock" << dendl;
13971 break;
13972 }
13973 }
13974 dout(20) << "need_ver_targs=" << need_ver_targs
13975 << " keep_ver_targs=" << keep_ver_targs << dendl;
13976 dout(20) << "backfill_targets=" << get_backfill_targets()
13977 << " missing_targs=" << missing_targs
13978 << " skip_targs=" << skip_targs << dendl;
13979
13980 last_backfill_started = backfill_info.begin;
13981 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13982 backfill_info.pop_front();
13983 vector<pg_shard_t> check_targets = need_ver_targs;
13984 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13985 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13986 i != check_targets.end();
13987 ++i) {
13988 pg_shard_t bt = *i;
13989 BackfillInterval& pbi = peer_backfill_info[bt];
13990 pbi.pop_front();
13991 }
13992 }
13993 }
13994
13995 for (set<hobject_t>::iterator i = add_to_stat.begin();
13996 i != add_to_stat.end();
13997 ++i) {
13998 ObjectContextRef obc = get_object_context(*i, false);
13999 ceph_assert(obc);
14000 pg_stat_t stat;
14001 add_object_context_to_pg_stat(obc, &stat);
14002 pending_backfill_updates[*i] = stat;
14003 }
14004 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
14005 for (unsigned i = 0; i < to_remove.size(); ++i) {
14006 handle.reset_tp_timeout();
14007 const hobject_t& oid = to_remove[i].get<0>();
14008 eversion_t v = to_remove[i].get<1>();
14009 pg_shard_t peer = to_remove[i].get<2>();
14010 MOSDPGBackfillRemove *m;
14011 auto it = reqs.find(peer);
14012 if (it != reqs.end()) {
14013 m = it->second;
14014 } else {
14015 m = reqs[peer] = new MOSDPGBackfillRemove(
14016 spg_t(info.pgid.pgid, peer.shard),
14017 get_osdmap_epoch());
14018 }
14019 m->ls.push_back(make_pair(oid, v));
14020
14021 if (oid <= last_backfill_started)
14022 pending_backfill_updates[oid]; // add empty stat!
14023 }
14024 for (auto p : reqs) {
14025 osd->send_message_osd_cluster(p.first.osd, p.second,
14026 get_osdmap_epoch());
14027 }
14028
14029 pgbackend->run_recovery_op(h, get_recovery_op_priority());
14030
14031 hobject_t backfill_pos =
14032 std::min(backfill_info.begin, earliest_peer_backfill());
14033 dout(5) << "backfill_pos is " << backfill_pos << dendl;
14034 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
14035 i != backfills_in_flight.end();
14036 ++i) {
14037 dout(20) << *i << " is still in flight" << dendl;
14038 }
14039
14040 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
14041 backfill_pos : *(backfills_in_flight.begin());
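  // last_backfill may only advance up to the oldest backfill still in
  // flight; completed stats at or beyond that point stay queued in
  // pending_backfill_updates until a later round.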
14042 hobject_t new_last_backfill = recovery_state.earliest_backfill();
14043 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
14044 for (map<hobject_t, pg_stat_t>::iterator i =
14045 pending_backfill_updates.begin();
14046 i != pending_backfill_updates.end() &&
14047 i->first < next_backfill_to_complete;
14048 pending_backfill_updates.erase(i++)) {
14049 dout(20) << " pending_backfill_update " << i->first << dendl;
14050 ceph_assert(i->first > new_last_backfill);
14051 // carried from a previous round; if we are here, then we had to
14052 // be requeued (by e.g. on_global_recover()) and those operations
14053 // are done.
14054 recovery_state.update_complete_backfill_object_stats(
14055 i->first,
14056 i->second);
14057 new_last_backfill = i->first;
14058 }
14059 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
14060
14061 ceph_assert(!pending_backfill_updates.empty() ||
14062 new_last_backfill == last_backfill_started);
14063 if (pending_backfill_updates.empty() &&
14064 backfill_pos.is_max()) {
14065 ceph_assert(backfills_in_flight.empty());
14066 new_last_backfill = backfill_pos;
14067 last_backfill_started = backfill_pos;
14068 }
14069 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
14070
14071 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
14072 // all the backfill targets. Otherwise, we will move last_backfill up on
14073 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
14074 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
14075 i != get_backfill_targets().end();
14076 ++i) {
14077 pg_shard_t bt = *i;
14078 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
14079
14080 if (new_last_backfill > pinfo.last_backfill) {
14081 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
14082 epoch_t e = get_osdmap_epoch();
14083 MOSDPGBackfill *m = NULL;
14084 if (pinfo.last_backfill.is_max()) {
14085 m = new MOSDPGBackfill(
14086 MOSDPGBackfill::OP_BACKFILL_FINISH,
14087 e,
14088 get_last_peering_reset(),
14089 spg_t(info.pgid.pgid, bt.shard));
14090 // Use default priority here, must match sub_op priority
14091 start_recovery_op(hobject_t::get_max());
14092 } else {
14093 m = new MOSDPGBackfill(
14094 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
14095 e,
14096 get_last_peering_reset(),
14097 spg_t(info.pgid.pgid, bt.shard));
14098 // Use default priority here, must match sub_op priority
14099 }
14100 m->last_backfill = pinfo.last_backfill;
14101 m->stats = pinfo.stats;
14102 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
14103 dout(10) << " peer " << bt
14104 << " num_objects now " << pinfo.stats.stats.sum.num_objects
14105 << " / " << info.stats.stats.sum.num_objects << dendl;
14106 }
14107 }
14108
14109 if (ops)
14110 *work_started = true;
14111 return ops;
14112 }
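// Worked example of the flush above (object names invented): with
// pending_backfill_updates = {A, B, C} (A < B < C), backfills_in_flight =
// {B} and backfill_pos past C, next_backfill_to_complete is B, so only A's
// stats are committed and new_last_backfill becomes A; any target whose
// last_backfill is below A is advanced and sent OP_BACKFILL_PROGRESS, and
// OP_BACKFILL_FINISH goes out only once new_last_backfill reaches MAX.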
14113
14114 int PrimaryLogPG::prep_backfill_object_push(
14115 hobject_t oid, eversion_t v,
14116 ObjectContextRef obc,
14117 vector<pg_shard_t> peers,
14118 PGBackend::RecoveryHandle *h)
14119 {
14120 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
14121 ceph_assert(!peers.empty());
14122
14123 backfills_in_flight.insert(oid);
14124 recovery_state.prepare_backfill_for_missing(oid, v, peers);
14125
14126 ceph_assert(!recovering.count(oid));
14127
14128 start_recovery_op(oid);
14129 recovering.insert(make_pair(oid, obc));
14130
14131 int r = pgbackend->recover_object(
14132 oid,
14133 v,
14134 ObjectContextRef(),
14135 obc,
14136 h);
14137 if (r < 0) {
14138 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
14139 on_failed_pull({ pg_whoami }, oid, v);
14140 }
14141 return r;
14142 }
14143
14144 void PrimaryLogPG::update_range(
14145 BackfillInterval *bi,
14146 ThreadPool::TPHandle &handle)
14147 {
14148 int local_min = cct->_conf->osd_backfill_scan_min;
14149 int local_max = cct->_conf->osd_backfill_scan_max;
14150
14151 if (bi->version < info.log_tail) {
14152 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
14153 << dendl;
14154 bi->version = info.last_update;
14155 scan_range(local_min, local_max, bi, handle);
14156 }
14157
14158 if (bi->version >= projected_last_update) {
14159 dout(10) << __func__<< ": bi is current " << dendl;
14160 ceph_assert(bi->version == projected_last_update);
14161 } else if (bi->version >= info.log_tail) {
14162 if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
14163 /* Because we don't move log_tail on split, the log might be
14164 * empty even if log_tail != last_update. However, the only
14165 * way to get here with an empty log is if log_tail is actually
14166 * eversion_t(), because otherwise the entry which changed
14167 * last_update since the last scan would have to be present.
14168 */
14169 ceph_assert(bi->version == eversion_t());
14170 return;
14171 }
14172
14173 dout(10) << __func__<< ": bi is old, (" << bi->version
14174 << ") can be updated with log to projected_last_update "
14175 << projected_last_update << dendl;
14176
14177 auto func = [&](const pg_log_entry_t &e) {
14178 dout(10) << __func__ << ": updating from version " << e.version
14179 << dendl;
14180 const hobject_t &soid = e.soid;
14181 if (soid >= bi->begin &&
14182 soid < bi->end) {
14183 if (e.is_update()) {
14184 dout(10) << __func__ << ": " << e.soid << " updated to version "
14185 << e.version << dendl;
14186 bi->objects.erase(e.soid);
14187 bi->objects.insert(
14188 make_pair(
14189 e.soid,
14190 e.version));
14191 } else if (e.is_delete()) {
14192 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
14193 bi->objects.erase(e.soid);
14194 }
14195 }
14196 };
14197 dout(10) << "scanning pg log first" << dendl;
14198 recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
14199 dout(10) << "scanning projected log" << dendl;
14200 projected_log.scan_log_after(bi->version, func);
14201 bi->version = projected_last_update;
14202 } else {
14203 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
14204 }
14205 }
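// Replay sketch for update_range() (versions invented): a BackfillInterval
// with objects = {o1: 5'10, o2: 5'9} and bi->version = 5'10, replayed
// against log entries (5'11 modify o1) and (5'12 delete o2), ends up as
// objects = {o1: 5'11} with bi->version = projected_last_update, i.e. the
// interval is brought current from the log without rescanning the store.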
14206
14207 void PrimaryLogPG::scan_range(
14208 int min, int max, BackfillInterval *bi,
14209 ThreadPool::TPHandle &handle)
14210 {
14211 ceph_assert(is_locked());
14212 dout(10) << "scan_range from " << bi->begin << dendl;
14213 bi->clear_objects();
14214
14215 vector<hobject_t> ls;
14216 ls.reserve(max);
14217 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
14218 ceph_assert(r >= 0);
14219 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
14220 dout(20) << ls << dendl;
14221
14222 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
14223 handle.reset_tp_timeout();
14224 ObjectContextRef obc;
14225 if (is_primary())
14226 obc = object_contexts.lookup(*p);
14227 if (obc) {
14228 if (!obc->obs.exists) {
14229 /* If the object does not exist here, it must have been removed
14230 * between the collection_list_partial and here. This can happen
14231 * for the first item in the range, which is usually last_backfill.
14232 */
14233 continue;
14234 }
14235 bi->objects[*p] = obc->obs.oi.version;
14236 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
14237 } else {
14238 bufferlist bl;
14239 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
14240 /* If the object does not exist here, it must have been removed
14241 * between the collection_list_partial and here. This can happen
14242 * for the first item in the range, which is usually last_backfill.
14243 */
14244 if (r == -ENOENT)
14245 continue;
14246
14247 ceph_assert(r >= 0);
14248 object_info_t oi(bl);
14249 bi->objects[*p] = oi.version;
14250 dout(20) << " " << *p << " " << oi.version << dendl;
14251 }
14252 }
14253 }
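// Usage sketch for the listing call above; the batch bounds are invented
// and the backend may return fewer entries near the end of the PG:
//
//   vector<hobject_t> ls;
//   hobject_t next;
//   int r = pgbackend->objects_list_partial(cursor, 64, 512, &ls, &next);
//   // on success ls is sorted in hobject (hash) order and next is the
//   // first object *not* returned, i.e. the resume cursor / bi->end.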
14254
14255
14256 /** check_local
14257 *
14258 * verifies that stray objects have been deleted
14259 */
14260 void PrimaryLogPG::check_local()
14261 {
14262 dout(10) << __func__ << dendl;
14263
14264 ceph_assert(
14265 info.last_update >=
14266 recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
14267
14268 if (!cct->_conf->osd_debug_verify_stray_on_activate)
14269 return;
14270
14271 // just scan the log.
14272 set<hobject_t> did;
14273 for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
14274 p != recovery_state.get_pg_log().get_log().log.rend();
14275 ++p) {
14276 if (did.count(p->soid))
14277 continue;
14278 did.insert(p->soid);
14279
14280 if (p->is_delete() && !is_missing_object(p->soid)) {
14281 dout(10) << " checking " << p->soid
14282 << " at " << p->version << dendl;
14283 struct stat st;
14284 int r = osd->store->stat(
14285 ch,
14286 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
14287 &st);
14288 if (r != -ENOENT) {
14289 derr << __func__ << " " << p->soid << " exists, but should have been "
14290 << "deleted" << dendl;
14291 ceph_abort_msg("erroneously present object");
14292 }
14293 } else {
14294 // ignore old(+missing) objects
14295 }
14296 }
14297 }
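// Example of the reverse scan above (log entries invented): given
// [5'7 delete A, 5'5 modify A, 5'4 delete B] newest-first, the walk stats
// A once (requiring -ENOENT since A should be gone), skips the older
// modify of A via the did set, and then checks B the same way.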
14298
14299
14300
14301 // ===========================
14302 // hit sets
14303
14304 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
14305 {
14306 ostringstream ss;
14307 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
14308 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
14309 info.pgid.ps(), info.pgid.pool(),
14310 cct->_conf->osd_hit_set_namespace);
14311 dout(20) << __func__ << " " << hoid << dendl;
14312 return hoid;
14313 }
14314
14315 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
14316 utime_t end,
14317 bool using_gmt)
14318 {
14319 ostringstream ss;
14320 ss << "hit_set_" << info.pgid.pgid << "_archive_";
14321 if (using_gmt) {
14322 start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
14323 end.gmtime(ss, true /* legacy pre-octopus form */);
14324 } else {
14325 start.localtime(ss, true /* legacy pre-octopus form */) << "_";
14326 end.localtime(ss, true /* legacy pre-octopus form */);
14327 }
14328 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
14329 info.pgid.ps(), info.pgid.pool(),
14330 cct->_conf->osd_hit_set_namespace);
14331 dout(20) << __func__ << " " << hoid << dendl;
14332 return hoid;
14333 }
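// Naming sketch (format approximate): for pg 1.2 and a GMT interval
// [s, e) the archive object is "hit_set_1.2_archive_<s>_<e>", with both
// stamps rendered by gmtime() in the legacy pre-octopus form; the object
// hashes to pgid.ps() and lives in osd_hit_set_namespace, so distinct
// intervals yield distinct, time-ordered names.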
14334
14335 void PrimaryLogPG::hit_set_clear()
14336 {
14337 dout(20) << __func__ << dendl;
14338 hit_set.reset();
14339 hit_set_start_stamp = utime_t();
14340 }
14341
14342 void PrimaryLogPG::hit_set_setup()
14343 {
14344 if (!is_active() ||
14345 !is_primary()) {
14346 hit_set_clear();
14347 return;
14348 }
14349
14350 if (is_active() && is_primary() &&
14351 (!pool.info.hit_set_count ||
14352 !pool.info.hit_set_period ||
14353 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
14354 hit_set_clear();
14355
14356 // only primary is allowed to remove all the hit set objects
14357 hit_set_remove_all();
14358 return;
14359 }
14360
14361 // FIXME: discard any previous data for now
14362 hit_set_create();
14363
14364 // include any writes we know about from the pg log. this doesn't
14365 // capture reads, but it is better than nothing!
14366 hit_set_apply_log();
14367 }
14368
14369 void PrimaryLogPG::hit_set_remove_all()
14370 {
14371 // If any archives are degraded we skip this
14372 for (auto p = info.hit_set.history.begin();
14373 p != info.hit_set.history.end();
14374 ++p) {
14375 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14376
14377 // Once we hit a degraded object just skip
14378 if (is_degraded_or_backfilling_object(aoid))
14379 return;
14380 if (m_scrubber->write_blocked_by_scrub(aoid))
14381 return;
14382 }
14383
14384 if (!info.hit_set.history.empty()) {
14385 auto p = info.hit_set.history.rbegin();
14386 ceph_assert(p != info.hit_set.history.rend());
14387 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14388 ceph_assert(!is_degraded_or_backfilling_object(oid));
14389 ObjectContextRef obc = get_object_context(oid, false);
14390 ceph_assert(obc);
14391
14392 OpContextUPtr ctx = simple_opc_create(obc);
14393 ctx->at_version = get_next_version();
14394 ctx->updated_hset_history = info.hit_set;
14395 utime_t now = ceph_clock_now();
14396 ctx->mtime = now;
14397 hit_set_trim(ctx, 0);
14398 simple_opc_submit(std::move(ctx));
14399 }
14400
14401 recovery_state.update_hset(pg_hit_set_history_t());
14402 if (agent_state) {
14403 agent_state->discard_hit_sets();
14404 }
14405 }
14406
14407 void PrimaryLogPG::hit_set_create()
14408 {
14409 utime_t now = ceph_clock_now();
14410 // make a copy of the params to modify
14411 HitSet::Params params(pool.info.hit_set_params);
14412
14413 dout(20) << __func__ << " " << params << dendl;
14414 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
14415 BloomHitSet::Params *p =
14416 static_cast<BloomHitSet::Params*>(params.impl.get());
14417
14418 // convert false positive rate so it holds up across the full period
14419 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
14420 if (p->get_fpp() <= 0.0)
14421 p->set_fpp(.01); // fpp cannot be zero!
14422
14423 // if we don't have a specified size, estimate the target size based
14424 // on the previous bin!
14425 if (p->target_size == 0 && hit_set) {
14426 utime_t dur = now - hit_set_start_stamp;
14427 unsigned unique = hit_set->approx_unique_insert_count();
14428 dout(20) << __func__ << " previous set had approx " << unique
14429 << " unique items over " << dur << " seconds" << dendl;
14430 p->target_size = (double)unique * (double)pool.info.hit_set_period
14431 / (double)dur;
14432 }
14433 if (p->target_size <
14434 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
14435 p->target_size = cct->_conf->osd_hit_set_min_size;
14436
14437 if (p->target_size
14438 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
14439 p->target_size = cct->_conf->osd_hit_set_max_size;
14440
14441 p->seed = now.sec();
14442
14443 dout(10) << __func__ << " target_size " << p->target_size
14444 << " fpp " << p->get_fpp() << dendl;
14445 }
14446 hit_set.reset(new HitSet(params));
14447 hit_set_start_stamp = now;
14448 }
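// Sizing arithmetic (numbers invented): with hit_set_count = 4 and a
// configured fpp of 0.04, each set is built at 0.04 / 4 = 0.01 so the
// false-positive rate holds across the whole period. If the previous set
// saw ~20000 unique inserts over 300s and hit_set_period is 600s,
// target_size = 20000 * 600 / 300 = 40000, then clamped to
// [osd_hit_set_min_size, osd_hit_set_max_size].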
14449
14450 /**
14451 * apply log entries to set
14452 *
14453 * this would only happen after peering, to at least capture writes
14454 * during an interval that was potentially lost.
14455 */
14456 bool PrimaryLogPG::hit_set_apply_log()
14457 {
14458 if (!hit_set)
14459 return false;
14460
14461 eversion_t to = info.last_update;
14462 eversion_t from = info.hit_set.current_last_update;
14463 if (to <= from) {
14464 dout(20) << __func__ << " no update" << dendl;
14465 return false;
14466 }
14467
14468 dout(20) << __func__ << " " << from << " .. " << to << dendl;
14469 list<pg_log_entry_t>::const_reverse_iterator p =
14470 recovery_state.get_pg_log().get_log().log.rbegin();
14471 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
14472 ++p;
14473 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
14474 hit_set->insert(p->soid);
14475 ++p;
14476 }
14477
14478 return true;
14479 }
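// Range sketch (versions invented): with from = 5'15 (the hit set's
// current_last_update) and to = 5'20 (last_update), the first loop skips
// nothing, since no entry is newer than last_update, and the second
// inserts the soids of entries 5'16..5'20: exactly the writes the
// in-memory set missed.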
14480
14481 void PrimaryLogPG::hit_set_persist()
14482 {
14483 dout(10) << __func__ << dendl;
14484 bufferlist bl;
14485 unsigned max = pool.info.hit_set_count;
14486
14487 utime_t now = ceph_clock_now();
14488 hobject_t oid;
14489
14490 // If any archives are degraded we skip this persist request
14491 // (we account for the additional entry being added below)
14492 for (auto p = info.hit_set.history.begin();
14493 p != info.hit_set.history.end();
14494 ++p) {
14495 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14496
14497 // Once we hit a degraded object just skip further trim
14498 if (is_degraded_or_backfilling_object(aoid))
14499 return;
14500 if (m_scrubber->write_blocked_by_scrub(aoid))
14501 return;
14502 }
14503
14504 // If backfill is in progress and we could possibly overlap with the
14505 // hit_set_* objects, back off. Since these all have
14506 // hobject_t::hash set to pgid.ps(), and those sort first, we can
14507 // look just at that. This is necessary because our transactions
14508 // may include a modify of the new hit_set *and* a delete of the
14509 // old one, and this may span the backfill boundary.
14510 for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
14511 p != get_backfill_targets().end();
14512 ++p) {
14513 const pg_info_t& pi = recovery_state.get_peer_info(*p);
14514 if (pi.last_backfill == hobject_t() ||
14515 pi.last_backfill.get_hash() == info.pgid.ps()) {
14516 dout(10) << __func__ << " backfill target osd." << *p
14517 << " last_backfill has not progressed past pgid ps"
14518 << dendl;
14519 return;
14520 }
14521 }
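// Illustration (hash value invented): if info.pgid.ps() == 0x16, every
// hit_set_* object hashes to 0x16 and sorts before all user objects, so a
// target whose last_backfill is empty or still at hash 0x16 may or may not
// hold the old archive; a single transaction deleting the old set and
// writing the new one could then span the backfill boundary, hence the
// back-off.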
14522
14523
14524 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
14525 new_hset.begin = hit_set_start_stamp;
14526 new_hset.end = now;
14527 oid = get_hit_set_archive_object(
14528 new_hset.begin,
14529 new_hset.end,
14530 new_hset.using_gmt);
14531
14532 // If the current object is degraded we skip this persist request
14533 if (m_scrubber->write_blocked_by_scrub(oid))
14534 return;
14535
14536 hit_set->seal();
14537 encode(*hit_set, bl);
14538 dout(20) << __func__ << " archive " << oid << dendl;
14539
14540 if (agent_state) {
14541 agent_state->add_hit_set(new_hset.begin, hit_set);
14542 uint32_t size = agent_state->hit_set_map.size();
14543 if (size >= pool.info.hit_set_count) {
14544 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
14545 }
14546 hit_set_in_memory_trim(size);
14547 }
14548
14549 ObjectContextRef obc = get_object_context(oid, true);
14550 OpContextUPtr ctx = simple_opc_create(obc);
14551
14552 ctx->at_version = get_next_version();
14553 ctx->updated_hset_history = info.hit_set;
14554 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
14555
14556 updated_hit_set_hist.current_last_update = info.last_update;
14557 new_hset.version = ctx->at_version;
14558
14559 updated_hit_set_hist.history.push_back(new_hset);
14560 hit_set_create();
14561
14562 // fabricate an object_info_t and SnapSet
14563 obc->obs.oi.version = ctx->at_version;
14564 obc->obs.oi.mtime = now;
14565 obc->obs.oi.size = bl.length();
14566 obc->obs.exists = true;
14567 obc->obs.oi.set_data_digest(bl.crc32c(-1));
14568
14569 ctx->new_obs = obc->obs;
14570
14571 ctx->new_snapset = obc->ssc->snapset;
14572
14573 ctx->delta_stats.num_objects++;
14574 ctx->delta_stats.num_objects_hit_set_archive++;
14575
14576 ctx->delta_stats.num_bytes += bl.length();
14577 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
14578
14579 bufferlist bss;
14580 encode(ctx->new_snapset, bss);
14581 bufferlist boi(sizeof(ctx->new_obs.oi));
14582 encode(ctx->new_obs.oi, boi,
14583 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
14584
14585 ctx->op_t->create(oid);
14586 if (bl.length()) {
14587 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
14588 write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
14589 0, bl.length());
14590 ctx->clean_regions.mark_data_region_dirty(0, bl.length());
14591 }
14592 map<string, bufferlist, std::less<>> attrs = {
14593 {OI_ATTR, std::move(boi)},
14594 {SS_ATTR, std::move(bss)}
14595 };
14596 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
14597 ctx->log.push_back(
14598 pg_log_entry_t(
14599 pg_log_entry_t::MODIFY,
14600 oid,
14601 ctx->at_version,
14602 eversion_t(),
14603 0,
14604 osd_reqid_t(),
14605 ctx->mtime,
14606 0)
14607 );
14608 ctx->log.back().clean_regions = ctx->clean_regions;
14609
14610 hit_set_trim(ctx, max);
14611
14612 simple_opc_submit(std::move(ctx));
14613 }
14614
14615 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
14616 {
14617 ceph_assert(ctx->updated_hset_history);
14618 pg_hit_set_history_t &updated_hit_set_hist =
14619 *(ctx->updated_hset_history);
14620 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
14621 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
14622 ceph_assert(p != updated_hit_set_hist.history.end());
14623 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14624
14625 ceph_assert(!is_degraded_or_backfilling_object(oid));
14626
14627 dout(20) << __func__ << " removing " << oid << dendl;
14628 ++ctx->at_version.version;
14629 ctx->log.push_back(
14630 pg_log_entry_t(pg_log_entry_t::DELETE,
14631 oid,
14632 ctx->at_version,
14633 p->version,
14634 0,
14635 osd_reqid_t(),
14636 ctx->mtime,
14637 0));
14638
14639 ctx->op_t->remove(oid);
14640 updated_hit_set_hist.history.pop_front();
14641
14642 ObjectContextRef obc = get_object_context(oid, false);
14643 ceph_assert(obc);
14644 --ctx->delta_stats.num_objects;
14645 --ctx->delta_stats.num_objects_hit_set_archive;
14646 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
14647 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
14648 }
14649 }
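// Lifecycle arithmetic: with pool.info.hit_set_count = 4 and four archives
// already in history, hit_set_persist() pushes a fifth entry and calls
// hit_set_trim(ctx, 4); the loop above then runs once, logging a DELETE
// for the oldest archive and reversing its object/byte stats in the same
// repop.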
14650
14651 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
14652 {
14653 while (agent_state->hit_set_map.size() > max_in_memory) {
14654 agent_state->remove_oldest_hit_set();
14655 }
14656 }
14657
14658
14659 // =======================================
14660 // cache agent
14661
14662 void PrimaryLogPG::agent_setup()
14663 {
14664 ceph_assert(is_locked());
14665 if (!is_active() ||
14666 !is_primary() ||
14667 state_test(PG_STATE_PREMERGE) ||
14668 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
14669 pool.info.tier_of < 0 ||
14670 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
14671 agent_clear();
14672 return;
14673 }
14674 if (!agent_state) {
14675 agent_state.reset(new TierAgentState);
14676
14677 // choose random starting position
14678 agent_state->position = hobject_t();
14679 agent_state->position.pool = info.pgid.pool();
14680 agent_state->position.set_hash(pool.info.get_random_pg_position(
14681 info.pgid.pgid,
14682 rand()));
14683 agent_state->start = agent_state->position;
14684
14685 dout(10) << __func__ << " allocated new state, position "
14686 << agent_state->position << dendl;
14687 } else {
14688 dout(10) << __func__ << " keeping existing state" << dendl;
14689 }
14690
14691 if (info.stats.stats_invalid) {
14692 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
14693 }
14694
14695 agent_choose_mode();
14696 }
14697
14698 void PrimaryLogPG::agent_clear()
14699 {
14700 agent_stop();
14701 agent_state.reset(NULL);
14702 }
14703
14704 // Return false if no objects were operated on since the start of the object hash space
14705 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
14706 {
14707 std::scoped_lock locker{*this};
14708 if (!agent_state) {
14709 dout(10) << __func__ << " no agent state, stopping" << dendl;
14710 return true;
14711 }
14712
14713 ceph_assert(!recovery_state.is_deleting());
14714
14715 if (agent_state->is_idle()) {
14716 dout(10) << __func__ << " idle, stopping" << dendl;
14717 return true;
14718 }
14719
14720 osd->logger->inc(l_osd_agent_wake);
14721
14722 dout(10) << __func__
14723 << " max " << start_max
14724 << ", flush " << agent_state->get_flush_mode_name()
14725 << ", evict " << agent_state->get_evict_mode_name()
14726 << ", pos " << agent_state->position
14727 << dendl;
14728 ceph_assert(is_primary());
14729 ceph_assert(is_active());
14730
14731 agent_load_hit_sets();
14732
14733 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14734 ceph_assert(base_pool);
14735
14736 int ls_min = 1;
14737 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
14738
14739 // list some objects. this conveniently lists clones (oldest to
14740 // newest) before heads... the same order we want to flush in.
14741 //
14742 // NOTE: do not flush the Sequencer. we will assume that the
14743 // listing we get back is imprecise.
14744 vector<hobject_t> ls;
14745 hobject_t next;
14746 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
14747 &ls, &next);
14748 ceph_assert(r >= 0);
14749 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
14750 int started = 0;
14751 for (vector<hobject_t>::iterator p = ls.begin();
14752 p != ls.end();
14753 ++p) {
14754 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
14755 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
14756 osd->logger->inc(l_osd_agent_skip);
14757 continue;
14758 }
14759 if (is_degraded_or_backfilling_object(*p)) {
14760 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
14761 osd->logger->inc(l_osd_agent_skip);
14762 continue;
14763 }
14764 if (is_missing_object(p->get_head())) {
14765 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
14766 osd->logger->inc(l_osd_agent_skip);
14767 continue;
14768 }
14769 ObjectContextRef obc = get_object_context(*p, false, NULL);
14770 if (!obc) {
14771 // we didn't flush; we may miss something here.
14772 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
14773 osd->logger->inc(l_osd_agent_skip);
14774 continue;
14775 }
14776 if (!obc->obs.exists) {
14777 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
14778 osd->logger->inc(l_osd_agent_skip);
14779 continue;
14780 }
14781 if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
14782 obc->obs.oi.soid.get_head())) {
14783 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14784 osd->logger->inc(l_osd_agent_skip);
14785 continue;
14786 }
14787 if (obc->is_blocked()) {
14788 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14789 osd->logger->inc(l_osd_agent_skip);
14790 continue;
14791 }
14792 if (obc->is_request_pending()) {
14793 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
14794 osd->logger->inc(l_osd_agent_skip);
14795 continue;
14796 }
14797
14798 // be careful flushing omap to an EC pool.
14799 if (!base_pool->supports_omap() &&
14800 obc->obs.oi.is_omap()) {
14801 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14802 osd->logger->inc(l_osd_agent_skip);
14803 continue;
14804 }
14805
14806 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14807 agent_maybe_evict(obc, false))
14808 ++started;
14809 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14810 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14811 ++started;
14812 --agent_flush_quota;
14813 }
14814 if (started >= start_max) {
14815 // If finishing early, set "next" to the next object
14816 if (++p != ls.end())
14817 next = *p;
14818 break;
14819 }
14820 }
14821
14822 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14823 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14824 agent_state->hist_age = 0;
14825 agent_state->temp_hist.decay();
14826 }
14827
14828 // Total objects operated on so far
14829 int total_started = agent_state->started + started;
14830 bool need_delay = false;
14831
14832 dout(20) << __func__ << " start pos " << agent_state->position
14833 << " next start pos " << next
14834 << " started " << total_started << dendl;
14835
14836 // See if we've made a full pass over the object hash space
14837 // This might check at most ls_max objects a second time to notice that
14838 // we've checked every object at least once.
14839 if (agent_state->position < agent_state->start &&
14840 next >= agent_state->start) {
14841 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14842 if (total_started == 0)
14843 need_delay = true;
14844 else
14845 total_started = 0;
14846 agent_state->start = next;
14847 }
14848 agent_state->started = total_started;
14849
14850 // See if we are starting from the beginning
14851 if (next.is_max())
14852 agent_state->position = hobject_t();
14853 else
14854 agent_state->position = next;
14855
14856 // Discard old in memory HitSets
14857 hit_set_in_memory_trim(pool.info.hit_set_count);
14858
14859 if (need_delay) {
14860 ceph_assert(agent_state->delaying == false);
14861 agent_delay();
14862 return false;
14863 }
14864 agent_choose_mode();
14865 return true;
14866 }
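// Wrap-around sketch (hash positions invented): with start at 0x8000,
// successive scans advance position 0x8000 -> ... -> MAX -> 0 -> 0x7fff;
// once position < start while next >= start the whole hash space has been
// covered. If nothing was started over that full pass, the agent delays
// instead of spinning over the same objects.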
14867
14868 void PrimaryLogPG::agent_load_hit_sets()
14869 {
14870 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14871 return;
14872 }
14873
14874 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14875 dout(10) << __func__ << dendl;
14876 for (auto p = info.hit_set.history.begin();
14877 p != info.hit_set.history.end(); ++p) {
14878 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14879 dout(10) << __func__ << " loading " << p->begin << "-"
14880 << p->end << dendl;
14881 if (!pool.info.is_replicated()) {
14882 // FIXME: EC not supported here yet
14883 derr << __func__ << " on non-replicated pool" << dendl;
14884 break;
14885 }
14886
14887 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14888 if (is_unreadable_object(oid)) {
14889 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14890 break;
14891 }
14892
14893 ObjectContextRef obc = get_object_context(oid, false);
14894 if (!obc) {
14895 derr << __func__ << ": could not load hitset " << oid << dendl;
14896 break;
14897 }
14898
14899 bufferlist bl;
14900 {
14901 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14902 ceph_assert(r >= 0);
14903 }
14904 HitSetRef hs(new HitSet);
14905 bufferlist::const_iterator pbl = bl.begin();
14906 decode(*hs, pbl);
14907 agent_state->add_hit_set(p->begin.sec(), hs);
14908 }
14909 }
14910 }
14911 }
14912
14913 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14914 {
14915 if (!obc->obs.oi.is_dirty()) {
14916 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14917 osd->logger->inc(l_osd_agent_skip);
14918 return false;
14919 }
14920 if (obc->obs.oi.is_cache_pinned()) {
14921 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14922 osd->logger->inc(l_osd_agent_skip);
14923 return false;
14924 }
14925
14926 utime_t now = ceph_clock_now();
14927 utime_t ob_local_mtime;
14928 if (obc->obs.oi.local_mtime != utime_t()) {
14929 ob_local_mtime = obc->obs.oi.local_mtime;
14930 } else {
14931 ob_local_mtime = obc->obs.oi.mtime;
14932 }
14933 bool evict_mode_full =
14934 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14935 if (!evict_mode_full &&
14936 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14937 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14938 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14939 osd->logger->inc(l_osd_agent_skip);
14940 return false;
14941 }
14942
14943 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14944 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14945 osd->logger->inc(l_osd_agent_skip);
14946 return false;
14947 }
14948
14949 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14950
14951 // FIXME: flush anything dirty, regardless of what distribution of
14952 // ages we expect.
14953
14954 hobject_t oid = obc->obs.oi.soid;
14955 osd->agent_start_op(oid);
14956 // no need to capture a pg ref, can't outlive fop or ctx
14957 std::function<void()> on_flush = [this, oid]() {
14958 osd->agent_finish_op(oid);
14959 };
14960
14961 int result = start_flush(
14962 OpRequestRef(), obc, false, NULL,
14963 on_flush);
14964 if (result != -EINPROGRESS) {
14965 on_flush();
14966 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14967 << " with " << result << dendl;
14968 osd->logger->inc(l_osd_agent_skip);
14969 return false;
14970 }
14971
14972 osd->logger->inc(l_osd_agent_flush);
14973 return true;
14974 }
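// Age-check arithmetic (times invented): with cache_min_flush_age = 300s,
// local_mtime = 900 and now = 1000, 900 + 300 > 1000 marks a head object
// "too young" and it is skipped; snap objects (immutable) and anything in
// EVICT_MODE_FULL bypass the age test.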
14975
14976 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14977 {
14978 const hobject_t& soid = obc->obs.oi.soid;
14979 if (!after_flush && obc->obs.oi.is_dirty()) {
14980 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14981 return false;
14982 }
14983 // This is already checked by agent_work() which passes after_flush = false
14984 if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
14985 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14986 return false;
14987 }
14988 if (!obc->obs.oi.watchers.empty()) {
14989 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14990 return false;
14991 }
14992 if (obc->is_blocked()) {
14993 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14994 return false;
14995 }
14996 if (obc->obs.oi.is_cache_pinned()) {
14997 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14998 return false;
14999 }
15000
15001 if (soid.snap == CEPH_NOSNAP) {
15002 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
15003 if (result < 0) {
15004 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
15005 return false;
15006 }
15007 }
15008
15009 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
15010 // is this object older than cache_min_evict_age?
15011 utime_t now = ceph_clock_now();
15012 utime_t ob_local_mtime;
15013 if (obc->obs.oi.local_mtime != utime_t()) {
15014 ob_local_mtime = obc->obs.oi.local_mtime;
15015 } else {
15016 ob_local_mtime = obc->obs.oi.mtime;
15017 }
15018 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
15019 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
15020 osd->logger->inc(l_osd_agent_skip);
15021 return false;
15022 }
15023 // is this object old and/or cold enough?
15024 int temp = 0;
15025 uint64_t temp_upper = 0, temp_lower = 0;
15026 if (hit_set)
15027 agent_estimate_temp(soid, &temp);
15028 agent_state->temp_hist.add(temp);
15029 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
15030
15031 dout(20) << __func__
15032 << " temp " << temp
15033 << " pos " << temp_lower << "-" << temp_upper
15034 << ", evict_effort " << agent_state->evict_effort
15035 << dendl;
15036 dout(30) << "agent_state:\n";
15037 Formatter *f = Formatter::create("");
15038 f->open_object_section("agent_state");
15039 agent_state->dump(f);
15040 f->close_section();
15041 f->flush(*_dout);
15042 delete f;
15043 *_dout << dendl;
15044
15045 if (1000000 - temp_upper >= agent_state->evict_effort)
15046 return false;
15047 }
15048
15049 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
15050 OpContextUPtr ctx = simple_opc_create(obc);
15051
15052 auto null_op_req = OpRequestRef();
15053 if (!ctx->lock_manager.get_lock_type(
15054 RWState::RWWRITE,
15055 obc->obs.oi.soid,
15056 obc,
15057 null_op_req)) {
15058 close_op_ctx(ctx.release());
15059 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
15060 return false;
15061 }
15062
15063 osd->agent_start_evict_op();
15064 ctx->register_on_finish(
15065 [this]() {
15066 osd->agent_finish_evict_op();
15067 });
15068
15069 ctx->at_version = get_next_version();
15070 ceph_assert(ctx->new_obs.exists);
15071 int r = _delete_oid(ctx.get(), true, false);
15072 if (obc->obs.oi.is_omap())
15073 ctx->delta_stats.num_objects_omap--;
15074 ctx->delta_stats.num_evict++;
15075 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
15076 if (obc->obs.oi.is_dirty())
15077 --ctx->delta_stats.num_objects_dirty;
15078 ceph_assert(r == 0);
15079 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
15080 simple_opc_submit(std::move(ctx));
15081 osd->logger->inc(l_osd_tier_evict);
15082 osd->logger->inc(l_osd_agent_evict);
15083 return true;
15084 }
15085
15086 void PrimaryLogPG::agent_stop()
15087 {
15088 dout(20) << __func__ << dendl;
15089 if (agent_state && !agent_state->is_idle()) {
15090 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
15091 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
15092 osd->agent_disable_pg(this, agent_state->evict_effort);
15093 }
15094 }
15095
15096 void PrimaryLogPG::agent_delay()
15097 {
15098 dout(20) << __func__ << dendl;
15099 if (agent_state && !agent_state->is_idle()) {
15100 ceph_assert(agent_state->delaying == false);
15101 agent_state->delaying = true;
15102 osd->agent_disable_pg(this, agent_state->evict_effort);
15103 }
15104 }
15105
15106 void PrimaryLogPG::agent_choose_mode_restart()
15107 {
15108 dout(20) << __func__ << dendl;
15109 std::scoped_lock locker{*this};
15110 if (agent_state && agent_state->delaying) {
15111 agent_state->delaying = false;
15112 agent_choose_mode(true);
15113 }
15114 }
15115
15116 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
15117 {
15118 bool requeued = false;
15119 // Let delay play out
15120 if (agent_state->delaying) {
15121 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
15122 return requeued;
15123 }
15124
15125 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
15126 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
15127 unsigned evict_effort = 0;
15128
15129 if (info.stats.stats_invalid) {
15130 // idle; stats can't be trusted until we scrub.
15131 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
15132 goto skip_calc;
15133 }
15134
15135 {
15136 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
15137 ceph_assert(divisor > 0);
15138
15139 // adjust (effective) user objects down based on the number
15140 // of HitSet objects, which should not count toward our total since
15141 // they cannot be flushed.
15142 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
15143
15144 // also exclude omap objects if ec backing pool
15145 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
15146 ceph_assert(base_pool);
15147 if (!base_pool->supports_omap())
15148 unflushable += info.stats.stats.sum.num_objects_omap;
15149
15150 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
15151 if (num_user_objects > unflushable)
15152 num_user_objects -= unflushable;
15153 else
15154 num_user_objects = 0;
15155
15156 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
15157 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
15158 num_user_bytes -= unflushable_bytes;
15159 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
15160 num_user_bytes += num_overhead_bytes;
15161
15162 // also reduce the num_dirty by num_objects_omap
15163 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
15164 if (!base_pool->supports_omap()) {
15165 if (num_dirty > info.stats.stats.sum.num_objects_omap)
15166 num_dirty -= info.stats.stats.sum.num_objects_omap;
15167 else
15168 num_dirty = 0;
15169 }
15170
15171 dout(10) << __func__
15172 << " flush_mode: "
15173 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
15174 << " evict_mode: "
15175 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
15176 << " num_objects: " << info.stats.stats.sum.num_objects
15177 << " num_bytes: " << info.stats.stats.sum.num_bytes
15178 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
15179 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
15180 << " num_dirty: " << num_dirty
15181 << " num_user_objects: " << num_user_objects
15182 << " num_user_bytes: " << num_user_bytes
15183 << " num_overhead_bytes: " << num_overhead_bytes
15184 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
15185 << " pool.info.target_max_objects: " << pool.info.target_max_objects
15186 << dendl;
15187
15188 // get dirty, full ratios
15189 uint64_t dirty_micro = 0;
15190 uint64_t full_micro = 0;
15191 if (pool.info.target_max_bytes && num_user_objects > 0) {
15192 uint64_t avg_size = num_user_bytes / num_user_objects;
15193 dirty_micro =
15194 num_dirty * avg_size * 1000000 /
15195 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
15196 full_micro =
15197 num_user_objects * avg_size * 1000000 /
15198 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
15199 }
15200 if (pool.info.target_max_objects > 0) {
15201 uint64_t dirty_objects_micro =
15202 num_dirty * 1000000 /
15203 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
15204 if (dirty_objects_micro > dirty_micro)
15205 dirty_micro = dirty_objects_micro;
15206 uint64_t full_objects_micro =
15207 num_user_objects * 1000000 /
15208 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
15209 if (full_objects_micro > full_micro)
15210 full_micro = full_objects_micro;
15211 }
15212 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
15213 << " full " << ((float)full_micro / 1000000.0)
15214 << dendl;
15215
15216 // flush mode
15217 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
15218 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
15219 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
15220 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
15221 flush_target += flush_slop;
15222 flush_high_target += flush_slop;
15223 } else {
15224 flush_target -= std::min(flush_target, flush_slop);
15225 flush_high_target -= std::min(flush_high_target, flush_slop);
15226 }
15227
15228 if (dirty_micro > flush_high_target) {
15229 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
15230 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
15231 flush_mode = TierAgentState::FLUSH_MODE_LOW;
15232 }
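// Hysteresis sketch (ratios invented): with a 0.4 dirty target (400000
// micro) and osd_agent_slop = 0.05, flush_slop = 20000; an idle agent
// enters low-speed flush only above 420000, and an active one falls back
// to idle only below 380000, so noise around the target cannot make the
// mode flap.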
15233
15234 // evict mode
15235 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
15236 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
15237 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
15238 evict_target += evict_slop;
15239 else
15240 evict_target -= std::min(evict_target, evict_slop);
15241
15242 if (full_micro > 1000000) {
15243 // evict anything clean
15244 evict_mode = TierAgentState::EVICT_MODE_FULL;
15245 evict_effort = 1000000;
15246 } else if (full_micro > evict_target) {
15247 // set effort in [0..1] range based on where we are between the evict target and completely full
15248 evict_mode = TierAgentState::EVICT_MODE_SOME;
15249 uint64_t over = full_micro - evict_target;
15250 uint64_t span = 1000000 - evict_target;
15251 evict_effort = std::max(over * 1000000 / span,
15252 uint64_t(1000000.0 *
15253 cct->_conf->osd_agent_min_evict_effort));
15254
15255 // quantize effort to avoid too much reordering in the agent_queue.
15256 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
15257 ceph_assert(inc > 0);
15258 uint64_t was = evict_effort;
15259 evict_effort -= evict_effort % inc;
15260 if (evict_effort < inc)
15261 evict_effort = inc;
15262 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
15263 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
15264 }
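// Effort arithmetic (ratios invented): with evict_target = 800000 and
// full_micro = 900000, over = 100000 and span = 200000, giving
// evict_effort = 100000 * 1000000 / 200000 = 500000 (50%); the result is
// floored at osd_agent_min_evict_effort and rounded down to a multiple of
// osd_agent_quantize_effort * 1000000 so that small ratio changes do not
// keep reshuffling the agent_queue.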
15265 }
15266
15267 skip_calc:
15268 bool old_idle = agent_state->is_idle();
15269 if (flush_mode != agent_state->flush_mode) {
15270 dout(5) << __func__ << " flush_mode "
15271 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
15272 << " -> "
15273 << TierAgentState::get_flush_mode_name(flush_mode)
15274 << dendl;
15275 recovery_state.update_stats(
15276 [=](auto &history, auto &stats) {
15277 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
15278 osd->agent_inc_high_count();
15279 stats.stats.sum.num_flush_mode_high = 1;
15280 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
15281 stats.stats.sum.num_flush_mode_low = 1;
15282 }
15283 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
15284 osd->agent_dec_high_count();
15285 stats.stats.sum.num_flush_mode_high = 0;
15286 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
15287 stats.stats.sum.num_flush_mode_low = 0;
15288 }
15289 return false;
15290 });
15291 agent_state->flush_mode = flush_mode;
15292 }
15293 if (evict_mode != agent_state->evict_mode) {
15294 dout(5) << __func__ << " evict_mode "
15295 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
15296 << " -> "
15297 << TierAgentState::get_evict_mode_name(evict_mode)
15298 << dendl;
15299 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
15300 is_active()) {
15301 if (op)
15302 requeue_op(op);
15303 requeue_ops(waiting_for_flush);
15304 requeue_ops(waiting_for_active);
15305 requeue_ops(waiting_for_readable);
15306 requeue_ops(waiting_for_scrub);
15307 requeue_ops(waiting_for_cache_not_full);
15308 objects_blocked_on_cache_full.clear();
15309 requeued = true;
15310 }
15311 recovery_state.update_stats(
15312 [=](auto &history, auto &stats) {
15313 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
15314 stats.stats.sum.num_evict_mode_some = 1;
15315 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
15316 stats.stats.sum.num_evict_mode_full = 1;
15317 }
15318 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
15319 stats.stats.sum.num_evict_mode_some = 0;
15320 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
15321 stats.stats.sum.num_evict_mode_full = 0;
15322 }
15323 return false;
15324 });
15325 agent_state->evict_mode = evict_mode;
15326 }
15327 uint64_t old_effort = agent_state->evict_effort;
15328 if (evict_effort != agent_state->evict_effort) {
15329 dout(5) << __func__ << " evict_effort "
15330 << ((float)agent_state->evict_effort / 1000000.0)
15331 << " -> "
15332 << ((float)evict_effort / 1000000.0)
15333 << dendl;
15334 agent_state->evict_effort = evict_effort;
15335 }
15336
15337 // NOTE: we are using evict_effort as a proxy for *all* agent effort
15338 // (including flush). This is probably fine (they should be
15339 // correlated) but it is not precisely correct.
15340 if (agent_state->is_idle()) {
15341 if (!restart && !old_idle) {
15342 osd->agent_disable_pg(this, old_effort);
15343 }
15344 } else {
15345 if (restart || old_idle) {
15346 osd->agent_enable_pg(this, agent_state->evict_effort);
15347 } else if (old_effort != agent_state->evict_effort) {
15348 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
15349 }
15350 }
15351 return requeued;
15352 }
15353
15354 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
15355 {
15356 ceph_assert(hit_set);
15357 ceph_assert(temp);
15358 *temp = 0;
15359 if (hit_set->contains(oid))
15360 *temp = 1000000;
15361 unsigned i = 0;
15362 int last_n = pool.info.hit_set_search_last_n;
15363 for (map<time_t,HitSetRef>::reverse_iterator p =
15364 agent_state->hit_set_map.rbegin(); last_n > 0 &&
15365 p != agent_state->hit_set_map.rend(); ++p, ++i) {
15366 if (p->second->contains(oid)) {
15367 *temp += pool.info.get_grade(i);
15368 --last_n;
15369 }
15370 }
15371 }
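// Grading sketch (grades invented; real values come from
// pool.info.get_grade(), driven by hit_set_grade_decay_rate): an object in
// the current set starts at temp = 1000000; if it also appears in the two
// newest archived sets with grades 1000000 and 500000, temp ends at
// 2500000. Only archives that actually contain the object consume
// hit_set_search_last_n.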
15372
15373 // Dup op detection
15374
15375 bool PrimaryLogPG::already_complete(eversion_t v)
15376 {
15377 dout(20) << __func__ << ": " << v << dendl;
15378 for (xlist<RepGather*>::iterator i = repop_queue.begin();
15379 !i.end();
15380 ++i) {
15381 dout(20) << __func__ << ": " << **i << dendl;
15382 // skip copy from temp object ops
15383 if ((*i)->v == eversion_t()) {
15384 dout(20) << __func__ << ": " << **i
15385 << " version is empty" << dendl;
15386 continue;
15387 }
15388 if ((*i)->v > v) {
15389 dout(20) << __func__ << ": " << **i
15390 << " (*i)->v past v" << dendl;
15391 break;
15392 }
15393 if (!(*i)->all_committed) {
15394 dout(20) << __func__ << ": " << **i
15395 << " not committed, returning false"
15396 << dendl;
15397 return false;
15398 }
15399 }
15400 dout(20) << __func__ << ": returning true" << dendl;
15401 return true;
15402 }
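// Walk-through (versions invented): with repop_queue holding v = 5'5, 5'7,
// 5'9 and a query for v = 5'7, the loop checks 5'5 and 5'7 and stops at
// 5'9 (past v); the op is reported complete iff every repop at or below
// 5'7 has all_committed, with temp-object repops (empty v) skipped
// outright.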
15403
15404
15405 // ==========================================================================================
15406 // SCRUB
15407
15408 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
15409 {
15410 dout(15) << __func__ << " is scrub active? " << is_scrub_active() << dendl;
15411 op->mark_started();
15412
15413 if (!is_scrub_active()) {
15414 dout(10) << __func__ << " scrub isn't active" << dendl;
15415 return;
15416 }
15417 m_scrubber->map_from_replica(op);
15418 }
15419
15420 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
15421 const hobject_t& end)
15422 {
15423 pair<hobject_t, ObjectContextRef> next;
15424 next.second = object_contexts.lookup(begin);
15425 next.first = begin;
15426 bool more = true;
15427 while (more && next.first < end) {
15428 if (next.second && next.second->is_blocked()) {
15429 next.second->requeue_scrub_on_unblock = true;
15430 dout(10) << __func__ << ": scrub delayed, "
15431 << next.first << " is blocked"
15432 << dendl;
15433 return false;
15434 }
15435 more = object_contexts.get_next(next.first, &next);
15436 }
15437 return true;
15438 }
15439
15440
15441 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15442 {
15443 OpRequestRef op = ctx->op;
15444 // Only supports replicated pools
15445 ceph_assert(!pool.info.is_erasure());
15446 ceph_assert(is_primary());
15447
15448 dout(10) << __func__ << " " << soid
15449 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
15450
15451 if (!is_clean()) {
15452 block_for_clean(soid, op);
15453 return -EAGAIN;
15454 }
15455
15456 ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
15457 auto& oi = ctx->new_obs.oi;
15458 eversion_t v = oi.version;
15459
15460 if (primary_error(soid, v)) {
15461 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15462 // XXX: If we knew that there is no down osd which could include this
15463 // object, it would be nice if we could return EIO here.
15464 // If a "never fail" flag was available, that could be used
15465 // for rbd to NOT return EIO until object marked lost.
15466
15467 // Drop through to save this op in case an osd comes up with the object.
15468 }
15469
15470 // Restart the op after object becomes readable again
15471 waiting_for_unreadable_object[soid].push_back(op);
15472 op->mark_delayed("waiting for missing object");
15473
15474 ceph_assert(is_clean());
15475 state_set(PG_STATE_REPAIR);
15476 state_clear(PG_STATE_CLEAN);
15477 queue_peering_event(
15478 PGPeeringEventRef(
15479 std::make_shared<PGPeeringEvent>(
15480 get_osdmap_epoch(),
15481 get_osdmap_epoch(),
15482 PeeringState::DoRecovery())));
15483
15484 return -EAGAIN;
15485 }
15486
15487 /*---SnapTrimmer Logging---*/
15488 #undef dout_prefix
15489 #define dout_prefix pg->gen_prefix(*_dout)
15490
15491 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15492 {
15493 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15494 }
15495
15496 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15497 {
15498 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15499 }
15500
15501 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15502 return
15503 pg->is_clean() &&
15504 !pg->is_scrub_queued_or_active() &&
15505 !pg->snap_trimq.empty();
15506 }
15507
15508 /*---SnapTrimmer states---*/
15509 #undef dout_prefix
15510 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15511 << "SnapTrimmer state<" << get_state_name() << ">: ")
15512
15513 /* NotTrimming */
15514 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15515 : my_base(ctx),
15516 NamedState(nullptr, "NotTrimming")
15517 {
15518 context< SnapTrimmer >().log_enter(state_name);
15519 }
15520
15521 void PrimaryLogPG::NotTrimming::exit()
15522 {
15523 context< SnapTrimmer >().log_exit(state_name, enter_time);
15524 }
15525
15526 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15527 {
15528 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15529 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15530
15531 if (!(pg->is_primary() && pg->is_active())) {
15532 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15533 return discard_event();
15534 }
15535 if (!pg->is_clean() ||
15536 pg->snap_trimq.empty()) {
15537 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15538 return discard_event();
15539 }
15540 if (pg->is_scrub_queued_or_active()) {
15541 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15542 return transit< WaitScrub >();
15543 } else {
15544 return transit< Trimming >();
15545 }
15546 }
15547
15548 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15549 {
15550 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15551 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15552
15553 pending = nullptr;
15554 if (!context< SnapTrimmer >().can_trim()) {
15555 post_event(KickTrim());
15556 return transit< NotTrimming >();
15557 }
15558
15559 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15560 ldout(pg->cct, 10) << "WaitReservation: trimming "
15561 << pg->snap_trimq.range_start()
15562 << dendl;
15563 return transit< AwaitAsyncWork >();
15564 }
15565
15566 /* AwaitAsyncWork */
15567 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15568 : my_base(ctx),
15569 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15570 {
15571 auto *pg = context< SnapTrimmer >().pg;
15572 context< SnapTrimmer >().log_enter(state_name);
15573 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15574 pg->state_set(PG_STATE_SNAPTRIM);
15575 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15576 pg->publish_stats_to_osd();
15577 }
15578
15579 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15580 {
15581 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15582 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15583 auto &in_flight = context<Trimming>().in_flight;
15584 ceph_assert(in_flight.empty());
15585
15586 ceph_assert(pg->is_primary() && pg->is_active());
15587 if (!context< SnapTrimmer >().can_trim()) {
15588 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15589 post_event(KickTrim());
15590 return transit< NotTrimming >();
15591 }
15592
15593 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15594
15595 vector<hobject_t> to_trim;
15596 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15597 to_trim.reserve(max);
15598 int r = pg->snap_mapper.get_next_objects_to_trim(
15599 snap_to_trim,
15600 max,
15601 &to_trim);
15602 if (r != 0 && r != -ENOENT) {
15603 lderr(pg->cct) << "get_next_objects_to_trim returned "
15604 << cpp_strerror(r) << dendl;
15605 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15606 } else if (r == -ENOENT) {
15607 // Done!
15608 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15609
15610 pg->snap_trimq.erase(snap_to_trim);
15611
15612 if (pg->snap_trimq_repeat.count(snap_to_trim)) {
15613 ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
15614 pg->snap_trimq_repeat.erase(snap_to_trim);
15615 } else {
15616 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15617 << " to purged_snaps"
15618 << dendl;
15619 ObjectStore::Transaction t;
15620 pg->recovery_state.adjust_purged_snaps(
15621 [snap_to_trim](auto &purged_snaps) {
15622 purged_snaps.insert(snap_to_trim);
15623 });
15624 pg->write_if_dirty(t);
15625
15626 ldout(pg->cct, 10) << "purged_snaps now "
15627 << pg->info.purged_snaps << ", snap_trimq now "
15628 << pg->snap_trimq << dendl;
15629
15630 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15631 ceph_assert(tr == 0);
15632
15633 pg->recovery_state.share_pg_info();
15634 }
15635 post_event(KickTrim());
15636 return transit< NotTrimming >();
15637 }
15638 ceph_assert(!to_trim.empty());
15639
15640 for (auto &&object: to_trim) {
15641 // Get next
15642 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15643 OpContextUPtr ctx;
15644 int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
15645 if (error) {
15646 if (error == -ENOLCK) {
15647 ldout(pg->cct, 10) << "could not get write lock on obj "
15648 << object << dendl;
15649 } else {
15650 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15651 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15652 }
15653 if (!in_flight.empty()) {
15654 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15655 return transit< WaitRepops >();
15656 }
15657 if (error == -ENOLCK) {
15658 ldout(pg->cct, 10) << "waiting for it to clear"
15659 << dendl;
15660 return transit< WaitRWLock >();
15661 } else {
15662 return transit< NotTrimming >();
15663 }
15664 }
15665
15666 in_flight.insert(object);
15667 ctx->register_on_success(
15668 [pg, object, &in_flight]() {
15669 ceph_assert(in_flight.find(object) != in_flight.end());
15670 in_flight.erase(object);
15671 if (in_flight.empty()) {
15672 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15673 pg->snap_trimmer_machine.process_event(Reset());
15674 } else {
15675 pg->snap_trimmer_machine.process_event(RepopsComplete());
15676 }
15677 }
15678 });
15679
15680 pg->simple_opc_submit(std::move(ctx));
15681 }
15682
15683 return transit< WaitRepops >();
15684 }
15685
15686 void PrimaryLogPG::setattr_maybe_cache(
15687 ObjectContextRef obc,
15688 PGTransaction *t,
15689 const string &key,
15690 bufferlist &val)
15691 {
15692 t->setattr(obc->obs.oi.soid, key, val);
15693 }
15694
15695 void PrimaryLogPG::setattrs_maybe_cache(
15696 ObjectContextRef obc,
15697 PGTransaction *t,
15698 map<string, bufferlist, less<>> &attrs)
15699 {
15700 t->setattrs(obc->obs.oi.soid, attrs);
15701 }
15702
15703 void PrimaryLogPG::rmattr_maybe_cache(
15704 ObjectContextRef obc,
15705 PGTransaction *t,
15706 const string &key)
15707 {
15708 t->rmattr(obc->obs.oi.soid, key);
15709 }
15710
15711 int PrimaryLogPG::getattr_maybe_cache(
15712 ObjectContextRef obc,
15713 const string &key,
15714 bufferlist *val)
15715 {
15716 if (pool.info.is_erasure()) {
15717 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15718 if (i != obc->attr_cache.end()) {
15719 if (val)
15720 *val = i->second;
15721 return 0;
15722 } else {
15723 return -ENODATA;
15724 }
15725 }
15726 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15727 }
15728
15729 int PrimaryLogPG::getattrs_maybe_cache(
15730 ObjectContextRef obc,
15731 map<string, bufferlist, less<>> *out)
15732 {
15733 int r = 0;
15734 ceph_assert(out);
15735 if (pool.info.is_erasure()) {
15736 *out = obc->attr_cache;
15737 } else {
15738 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15739 }
15740 map<string, bufferlist, less<>> tmp;
15741 for (auto& [key, val]: *out) {
15742 if (key.size() > 1 && key[0] == '_') {
15743 tmp[key.substr(1)] = std::move(val);
15744 }
15745 }
15746 tmp.swap(*out);
15747 return r;
15748 }
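// Filtering sketch: user xattrs are stored with a '_' prefix, so a raw map
// {"_greeting": ..., "snapset": ...} is returned to the caller as
// {"greeting": ...}; internal attrs without the prefix are dropped, and
// the size() > 1 test also excludes the bare "_" (object info) key.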
15749
15750 bool PrimaryLogPG::check_failsafe_full() {
15751 return osd->check_failsafe_full(get_dpp());
15752 }
15753
15754 bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
15755 {
15756 return m_scrubber->write_blocked_by_scrub(oid);
15757 }
15758
15759 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15760 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15761
15762 #ifdef PG_DEBUG_REFS
15763 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15764 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15765 #endif
15766
15767 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15768 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }